1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commit routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record deltas in one of the following three categories (a
26  * layout sketch follows the list).
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
41  *				  during recovery. Note that the iblocks field
42  *				  is not replayed and is instead derived during
43  *				  replay.
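 *
 * All of these records are encoded as tag-length-value entries. A minimal
 * sketch of the on-disk header, consistent with the cpu_to_le16() /
 * le16_to_cpu() conversions used throughout this file (see struct
 * ext4_fc_tl):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// EXT4_FC_TAG_*
 *		__le16 fc_len;	// length of the value bytes that follow
 *	};
 *
 * The value bytes of an entry start right after this header (see
 * ext4_fc_tag_val() below).
 *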
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a second in-memory
49  * queue of inodes that need to be committed during a fast commit. During the
50  * commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * Every inode update must call ext4_fc_start_update() before starting the
62  * update and ext4_fc_stop_update() after completing it. If such an ongoing
63  * update is present, the fast commit waits for it to complete. A usage
64  * sketch follows below.
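 *
 * A typical update path looks like this (an illustrative sketch; the
 * journal handle type and credit count are the caller's choice):
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 *	... modify the inode ...
 *	ext4_journal_stop(handle);
 *	ext4_fc_stop_update(inode);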
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g. extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
73  *   fall back to a full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible(): These make all
76  *   the fast commits that happen between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible(), plus one fast commit after the call to
78  *   ext4_fc_stop_ineligible(), fall back to full commits. Forcing one more
79  *   fast commit after the stop call to fall back guarantees that the fast
80  *   commit ineligible operation bracketed by ext4_fc_start_ineligible() and
81  *   ext4_fc_stop_ineligible() is followed by at least 1 full commit; see
82  *   the sketch below.
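 *
 * For example, an operation with no replay support would bracket itself
 * roughly as follows (an illustrative sketch; the reason code is one of
 * the EXT4_FC_REASON_* values and is assumed here for illustration):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	... perform the unsupported update ...
 *	ext4_fc_stop_ineligible(sb);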
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses the EXT4_FC_TAG_TAIL tag, which marks a fast commit as complete. The
88  * tail tag contains the CRC of the contents and the TID of the transaction
89  * after which this fast commit should be applied. Recovery code replays fast
90  * commit logs only if there's at least 1 valid tail present. Since every
91  * fast commit operation writes 1 tail, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called from much higher
111  *    level functions. This can be made more fine grained by combining them
112  *    with ext4_journal_start().
113  *
114  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118 
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121 
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124 	BUFFER_TRACE(bh, "");
125 	if (uptodate) {
126 		ext4_debug("%s: Block %lld up-to-date",
127 			   __func__, bh->b_blocknr);
128 		set_buffer_uptodate(bh);
129 	} else {
130 		ext4_debug("%s: Block %lld not up-to-date",
131 			   __func__, bh->b_blocknr);
132 		clear_buffer_uptodate(bh);
133 	}
134 
135 	unlock_buffer(bh);
136 }
137 
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140 	struct ext4_inode_info *ei = EXT4_I(inode);
141 
142 	ei->i_fc_lblk_start = 0;
143 	ei->i_fc_lblk_len = 0;
144 }
145 
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148 	struct ext4_inode_info *ei = EXT4_I(inode);
149 
150 	ext4_fc_reset_inode(inode);
151 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 	INIT_LIST_HEAD(&ei->i_fc_list);
153 	init_waitqueue_head(&ei->i_fc_wait);
154 	atomic_set(&ei->i_fc_updates, 0);
155 	ei->i_fc_committed_subtid = 0;
156 }
157 
158 /*
159  * Inform Ext4's fast commit machinery about the start of an inode update
160  *
161  * This function is called by high level VFS callbacks before
162  * performing any inode update. This function blocks if there's an ongoing
163  * fast commit on the inode in question.
164  */
165 void ext4_fc_start_update(struct inode *inode)
166 {
167 	struct ext4_inode_info *ei = EXT4_I(inode);
168 
169 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
170 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
171 		return;
172 
173 restart:
174 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
175 	if (list_empty(&ei->i_fc_list))
176 		goto out;
177 
178 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
179 		wait_queue_head_t *wq;
180 #if (BITS_PER_LONG < 64)
181 		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
182 				EXT4_STATE_FC_COMMITTING);
183 		wq = bit_waitqueue(&ei->i_state_flags,
184 				   EXT4_STATE_FC_COMMITTING);
185 #else
186 		DEFINE_WAIT_BIT(wait, &ei->i_flags,
187 				EXT4_STATE_FC_COMMITTING);
188 		wq = bit_waitqueue(&ei->i_flags,
189 				   EXT4_STATE_FC_COMMITTING);
190 #endif
191 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
192 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
193 		schedule();
194 		finish_wait(wq, &wait.wq_entry);
195 		goto restart;
196 	}
197 out:
198 	atomic_inc(&ei->i_fc_updates);
199 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
200 }
201 
202 /*
203  * Stop inode update and wake up waiting fast commits if any.
204  */
205 void ext4_fc_stop_update(struct inode *inode)
206 {
207 	struct ext4_inode_info *ei = EXT4_I(inode);
208 
209 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
210 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
211 		return;
212 
213 	if (atomic_dec_and_test(&ei->i_fc_updates))
214 		wake_up_all(&ei->i_fc_wait);
215 }
216 
217 /*
218  * Remove inode from fast commit list. If the inode is being committed
219  * we wait until inode commit is done.
220  */
221 void ext4_fc_del(struct inode *inode)
222 {
223 	struct ext4_inode_info *ei = EXT4_I(inode);
224 
225 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
226 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
227 		return;
228 
229 restart:
230 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
231 	if (list_empty(&ei->i_fc_list)) {
232 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
233 		return;
234 	}
235 
236 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
237 		wait_queue_head_t *wq;
238 #if (BITS_PER_LONG < 64)
239 		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
240 				EXT4_STATE_FC_COMMITTING);
241 		wq = bit_waitqueue(&ei->i_state_flags,
242 				   EXT4_STATE_FC_COMMITTING);
243 #else
244 		DEFINE_WAIT_BIT(wait, &ei->i_flags,
245 				EXT4_STATE_FC_COMMITTING);
246 		wq = bit_waitqueue(&ei->i_flags,
247 				   EXT4_STATE_FC_COMMITTING);
248 #endif
249 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
250 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 		schedule();
252 		finish_wait(wq, &wait.wq_entry);
253 		goto restart;
254 	}
255 	if (!list_empty(&ei->i_fc_list))
256 		list_del_init(&ei->i_fc_list);
257 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
258 }
259 
260 /*
261  * Mark file system as fast commit ineligible. This means that next commit
262  * operation would result in a full jbd2 commit.
263  */
264 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
265 {
266 	struct ext4_sb_info *sbi = EXT4_SB(sb);
267 
268 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
269 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
270 		return;
271 
272 	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
273 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
274 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
275 }
276 
277 /*
278  * Start a fast commit ineligible update. Any commits that happen while
279  * such an operation is in progress fall back to full commits.
280  */
281 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
282 {
283 	struct ext4_sb_info *sbi = EXT4_SB(sb);
284 
285 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
286 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
287 		return;
288 
289 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
290 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
291 	atomic_inc(&sbi->s_fc_ineligible_updates);
292 }
293 
294 /*
295  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
296  * to ensure that after stopping the ineligible update, at least one full
297  * commit takes place.
298  */
299 void ext4_fc_stop_ineligible(struct super_block *sb)
300 {
301 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
302 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
303 		return;
304 
305 	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
306 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
307 }
308 
309 static inline int ext4_fc_is_ineligible(struct super_block *sb)
310 {
311 	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
312 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
313 }
314 
315 /*
316  * Generic fast commit tracking function. If this is the first time we are
317  * called after a full commit, we initialize fast commit fields and then call
318  * __fc_track_fn() with update = 0. If we have already been called after a full
319  * commit, we pass update = 1. Based on that, the track function can determine
320  * if it needs to track a field for the first time or if it needs to just
321  * update the previously tracked value.
322  *
323  * If enqueue is set, this function enqueues the inode in fast commit list.
324  */
325 static int ext4_fc_track_template(
326 	handle_t *handle, struct inode *inode,
327 	int (*__fc_track_fn)(struct inode *, void *, bool),
328 	void *args, int enqueue)
329 {
330 	bool update = false;
331 	struct ext4_inode_info *ei = EXT4_I(inode);
332 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333 	tid_t tid = 0;
334 	int ret;
335 
336 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
337 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
338 		return -EOPNOTSUPP;
339 
340 	if (ext4_fc_is_ineligible(inode->i_sb))
341 		return -EINVAL;
342 
343 	tid = handle->h_transaction->t_tid;
344 	mutex_lock(&ei->i_fc_lock);
345 	if (tid == ei->i_sync_tid) {
346 		update = true;
347 	} else {
348 		ext4_fc_reset_inode(inode);
349 		ei->i_sync_tid = tid;
350 	}
351 	ret = __fc_track_fn(inode, args, update);
352 	mutex_unlock(&ei->i_fc_lock);
353 
354 	if (!enqueue)
355 		return ret;
356 
357 	spin_lock(&sbi->s_fc_lock);
358 	if (list_empty(&EXT4_I(inode)->i_fc_list))
359 		list_add_tail(&EXT4_I(inode)->i_fc_list,
360 				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
361 				&sbi->s_fc_q[FC_Q_STAGING] :
362 				&sbi->s_fc_q[FC_Q_MAIN]);
363 	spin_unlock(&sbi->s_fc_lock);
364 
365 	return ret;
366 }
367 
368 struct __track_dentry_update_args {
369 	struct dentry *dentry;
370 	int op;
371 };
372 
373 /* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
374 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
375 {
376 	struct ext4_fc_dentry_update *node;
377 	struct ext4_inode_info *ei = EXT4_I(inode);
378 	struct __track_dentry_update_args *dentry_update =
379 		(struct __track_dentry_update_args *)arg;
380 	struct dentry *dentry = dentry_update->dentry;
381 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
382 
383 	mutex_unlock(&ei->i_fc_lock);
384 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
385 	if (!node) {
386 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
387 		mutex_lock(&ei->i_fc_lock);
388 		return -ENOMEM;
389 	}
390 
391 	node->fcd_op = dentry_update->op;
392 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
393 	node->fcd_ino = inode->i_ino;
394 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
395 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
396 		if (!node->fcd_name.name) {
397 			kmem_cache_free(ext4_fc_dentry_cachep, node);
398 			ext4_fc_mark_ineligible(inode->i_sb,
399 				EXT4_FC_REASON_NOMEM);
400 			mutex_lock(&ei->i_fc_lock);
401 			return -ENOMEM;
402 		}
403 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
404 			dentry->d_name.len);
405 	} else {
406 		memcpy(node->fcd_iname, dentry->d_name.name,
407 			dentry->d_name.len);
408 		node->fcd_name.name = node->fcd_iname;
409 	}
410 	node->fcd_name.len = dentry->d_name.len;
411 
412 	spin_lock(&sbi->s_fc_lock);
413 	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
414 		list_add_tail(&node->fcd_list,
415 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
416 	else
417 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
418 	spin_unlock(&sbi->s_fc_lock);
419 	mutex_lock(&ei->i_fc_lock);
420 
421 	return 0;
422 }
423 
424 void __ext4_fc_track_unlink(handle_t *handle,
425 		struct inode *inode, struct dentry *dentry)
426 {
427 	struct __track_dentry_update_args args;
428 	int ret;
429 
430 	args.dentry = dentry;
431 	args.op = EXT4_FC_TAG_UNLINK;
432 
433 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
434 					(void *)&args, 0);
435 	trace_ext4_fc_track_unlink(inode, dentry, ret);
436 }
437 
438 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
439 {
440 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
441 }
442 
443 void __ext4_fc_track_link(handle_t *handle,
444 	struct inode *inode, struct dentry *dentry)
445 {
446 	struct __track_dentry_update_args args;
447 	int ret;
448 
449 	args.dentry = dentry;
450 	args.op = EXT4_FC_TAG_LINK;
451 
452 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
453 					(void *)&args, 0);
454 	trace_ext4_fc_track_link(inode, dentry, ret);
455 }
456 
457 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
458 {
459 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
460 }
461 
462 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
463 {
464 	struct __track_dentry_update_args args;
465 	struct inode *inode = d_inode(dentry);
466 	int ret;
467 
468 	args.dentry = dentry;
469 	args.op = EXT4_FC_TAG_CREAT;
470 
471 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
472 					(void *)&args, 0);
473 	trace_ext4_fc_track_create(inode, dentry, ret);
474 }
475 
476 /* __track_fn for inode tracking */
477 static int __track_inode(struct inode *inode, void *arg, bool update)
478 {
479 	if (update)
480 		return -EEXIST;
481 
482 	EXT4_I(inode)->i_fc_lblk_len = 0;
483 
484 	return 0;
485 }
486 
487 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
488 {
489 	int ret;
490 
491 	if (S_ISDIR(inode->i_mode))
492 		return;
493 
494 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
495 	trace_ext4_fc_track_inode(inode, ret);
496 }
497 
498 struct __track_range_args {
499 	ext4_lblk_t start, end;
500 };
501 
502 /* __track_fn for tracking data updates */
503 static int __track_range(struct inode *inode, void *arg, bool update)
504 {
505 	struct ext4_inode_info *ei = EXT4_I(inode);
506 	ext4_lblk_t oldstart;
507 	struct __track_range_args *__arg =
508 		(struct __track_range_args *)arg;
509 
510 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
511 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
512 		return -ECANCELED;
513 	}
514 
515 	oldstart = ei->i_fc_lblk_start;
516 
517 	if (update && ei->i_fc_lblk_len > 0) {
518 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
519 		ei->i_fc_lblk_len =
520 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
521 				ei->i_fc_lblk_start + 1;
522 	} else {
523 		ei->i_fc_lblk_start = __arg->start;
524 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
525 	}
526 
527 	return 0;
528 }
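
/*
 * A worked example of the merge above: if blocks 10-20 are already tracked
 * (i_fc_lblk_start = 10, i_fc_lblk_len = 11) and a new update touches
 * blocks 5-12, the tracked range grows to cover blocks 5-20:
 *
 *	i_fc_lblk_start = min(10, 5) = 5
 *	i_fc_lblk_len   = max(10 + 11 - 1, 12) - 5 + 1 = 16
 */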
529 
530 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
531 			 ext4_lblk_t end)
532 {
533 	struct __track_range_args args;
534 	int ret;
535 
536 	if (S_ISDIR(inode->i_mode))
537 		return;
538 
539 	args.start = start;
540 	args.end = end;
541 
542 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
543 
544 	trace_ext4_fc_track_range(inode, start, end, ret);
545 }
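
/*
 * Note that the tracked range is inclusive of both endpoints; a caller
 * that just mapped the blocks [m_lblk, m_lblk + m_len) would track them
 * as (an illustrative sketch):
 *
 *	ext4_fc_track_range(handle, inode, map->m_lblk,
 *			    map->m_lblk + map->m_len - 1);
 */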
546 
547 static void ext4_fc_submit_bh(struct super_block *sb)
548 {
549 	int write_flags = REQ_SYNC;
550 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
551 
552 	if (test_opt(sb, BARRIER))
553 		write_flags |= REQ_FUA | REQ_PREFLUSH;
554 	lock_buffer(bh);
555 	clear_buffer_dirty(bh);
556 	set_buffer_uptodate(bh);
557 	bh->b_end_io = ext4_end_buffer_io_sync;
558 	submit_bh(REQ_OP_WRITE, write_flags, bh);
559 	EXT4_SB(sb)->s_fc_bh = NULL;
560 }
561 
562 /* Ext4 commit path routines */
563 
564 /* memzero and update CRC */
565 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
566 				u32 *crc)
567 {
568 	void *ret;
569 
570 	ret = memset(dst, 0, len);
571 	if (crc)
572 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
573 	return ret;
574 }
575 
576 /*
577  * Allocate len bytes on a fast commit buffer.
578  *
579  * At commit time, this function is used to manage fast commit
580  * block space. We don't split a fast commit log entry across
581  * blocks. So this function makes sure that if there's not enough space
582  * on the current block, the remaining space in the current block is
583  * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
584  * new block is requested from jbd2 and the CRC is updated to reflect
585  * the padding we added.
586  */
587 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
588 {
589 	struct ext4_fc_tl *tl;
590 	struct ext4_sb_info *sbi = EXT4_SB(sb);
591 	struct buffer_head *bh;
592 	int bsize = sbi->s_journal->j_blocksize;
593 	int ret, off = sbi->s_fc_bytes % bsize;
594 	int pad_len;
595 
596 	/*
597 	 * After allocating len, we should have space at least for a 0 byte
598 	 * padding.
599 	 */
600 	if (len + sizeof(struct ext4_fc_tl) > bsize)
601 		return NULL;
602 
603 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
604 		/*
605 		 * Only allocate from current buffer if we have enough space for
606 		 * this request AND we have space to add a zero byte padding.
607 		 */
608 		if (!sbi->s_fc_bh) {
609 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
610 			if (ret)
611 				return NULL;
612 			sbi->s_fc_bh = bh;
613 		}
614 		sbi->s_fc_bytes += len;
615 		return sbi->s_fc_bh->b_data + off;
616 	}
617 	/* Need to add PAD tag */
618 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
619 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
620 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
621 	tl->fc_len = cpu_to_le16(pad_len);
622 	if (crc)
623 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
624 	if (pad_len > 0)
625 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
626 	ext4_fc_submit_bh(sb);
627 
628 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
629 	if (ret)
630 		return NULL;
631 	sbi->s_fc_bh = bh;
632 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
633 	return sbi->s_fc_bh->b_data;
634 }
635 
636 /* memcpy to fc reserved space and update CRC */
637 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
638 				int len, u32 *crc)
639 {
640 	if (crc)
641 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
642 	return memcpy(dst, src, len);
643 }
644 
645 /*
646  * Complete a fast commit by writing tail tag.
647  *
648  * Writing the tail tag marks the end of a fast commit. In order to
649  * guarantee atomicity, the next commit must not use any space remaining
650  * in the current block after the tail tag. That's why the tail tag's
651  * length covers all of the remaining space in the block.
652  */
653 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
654 {
655 	struct ext4_sb_info *sbi = EXT4_SB(sb);
656 	struct ext4_fc_tl tl;
657 	struct ext4_fc_tail tail;
658 	int off, bsize = sbi->s_journal->j_blocksize;
659 	u8 *dst;
660 
661 	/*
662 	 * ext4_fc_reserve_space takes care of allocating an extra block if
663 	 * there's not enough space on this block to accommodate this tail.
664 	 */
665 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
666 	if (!dst)
667 		return -ENOSPC;
668 
669 	off = sbi->s_fc_bytes % bsize;
670 
671 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
672 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
673 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
674 
675 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
676 	dst += sizeof(tl);
677 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
678 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
679 	dst += sizeof(tail.fc_tid);
680 	tail.fc_crc = cpu_to_le32(crc);
681 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
682 
683 	ext4_fc_submit_bh(sb);
684 
685 	return 0;
686 }
687 
688 /*
689  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
690  * Returns false if there's not enough space.
691  */
692 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
693 			   u32 *crc)
694 {
695 	struct ext4_fc_tl tl;
696 	u8 *dst;
697 
698 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
699 	if (!dst)
700 		return false;
701 
702 	tl.fc_tag = cpu_to_le16(tag);
703 	tl.fc_len = cpu_to_le16(len);
704 
705 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
706 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
707 
708 	return true;
709 }
710 
711 /* Same as above, but adds dentry tlv. */
712 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
713 					int parent_ino, int ino, int dlen,
714 					const unsigned char *dname,
715 					u32 *crc)
716 {
717 	struct ext4_fc_dentry_info fcd;
718 	struct ext4_fc_tl tl;
719 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
720 					crc);
721 
722 	if (!dst)
723 		return false;
724 
725 	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
726 	fcd.fc_ino = cpu_to_le32(ino);
727 	tl.fc_tag = cpu_to_le16(tag);
728 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
729 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
730 	dst += sizeof(tl);
731 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
732 	dst += sizeof(fcd);
733 	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
734 	dst += dlen;
735 
736 	return true;
737 }
738 
739 /*
740  * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
741  * Returns 0 on success, error on failure.
742  */
743 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
744 {
745 	struct ext4_inode_info *ei = EXT4_I(inode);
746 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
747 	int ret;
748 	struct ext4_iloc iloc;
749 	struct ext4_fc_inode fc_inode;
750 	struct ext4_fc_tl tl;
751 	u8 *dst;
752 
753 	ret = ext4_get_inode_loc(inode, &iloc);
754 	if (ret)
755 		return ret;
756 
757 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
758 		inode_len += ei->i_extra_isize;
759 
760 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
761 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
762 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
763 
764 	dst = ext4_fc_reserve_space(inode->i_sb,
765 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
766 	if (!dst)
767 		return -ECANCELED;
768 
769 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
770 		return -ECANCELED;
771 	dst += sizeof(tl);
772 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
773 		return -ECANCELED;
774 	dst += sizeof(fc_inode);
775 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
776 					inode_len, crc))
777 		return -ECANCELED;
778 
779 	return 0;
780 }
781 
782 /*
783  * Writes updated data ranges for the inode in question. Updates CRC.
784  * Returns 0 on success, error otherwise.
785  */
786 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
787 {
788 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
789 	struct ext4_inode_info *ei = EXT4_I(inode);
790 	struct ext4_map_blocks map;
791 	struct ext4_fc_add_range fc_ext;
792 	struct ext4_fc_del_range lrange;
793 	struct ext4_extent *ex;
794 	int ret;
795 
796 	mutex_lock(&ei->i_fc_lock);
797 	if (ei->i_fc_lblk_len == 0) {
798 		mutex_unlock(&ei->i_fc_lock);
799 		return 0;
800 	}
801 	old_blk_size = ei->i_fc_lblk_start;
802 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
803 	ei->i_fc_lblk_len = 0;
804 	mutex_unlock(&ei->i_fc_lock);
805 
806 	cur_lblk_off = old_blk_size;
807 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
808 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
809 
810 	while (cur_lblk_off <= new_blk_size) {
811 		map.m_lblk = cur_lblk_off;
812 		map.m_len = new_blk_size - cur_lblk_off + 1;
813 		ret = ext4_map_blocks(NULL, inode, &map, 0);
814 		if (ret < 0)
815 			return -ECANCELED;
816 
817 		if (map.m_len == 0) {
818 			cur_lblk_off++;
819 			continue;
820 		}
821 
822 		if (ret == 0) {
823 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
824 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
825 			lrange.fc_len = cpu_to_le32(map.m_len);
826 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
827 					    sizeof(lrange), (u8 *)&lrange, crc))
828 				return -ENOSPC;
829 		} else {
830 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
831 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
832 			ex->ee_block = cpu_to_le32(map.m_lblk);
833 			ex->ee_len = cpu_to_le16(map.m_len);
834 			ext4_ext_store_pblock(ex, map.m_pblk);
835 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
836 				ext4_ext_mark_unwritten(ex);
837 			else
838 				ext4_ext_mark_initialized(ex);
839 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
840 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
841 				return -ENOSPC;
842 		}
843 
844 		cur_lblk_off += map.m_len;
845 	}
846 
847 	return 0;
848 }
849 
850 
851 /* Submit data for all the fast commit inodes */
852 static int ext4_fc_submit_inode_data_all(journal_t *journal)
853 {
854 	struct super_block *sb = (struct super_block *)(journal->j_private);
855 	struct ext4_sb_info *sbi = EXT4_SB(sb);
856 	struct ext4_inode_info *ei;
857 	struct list_head *pos;
858 	int ret = 0;
859 
860 	spin_lock(&sbi->s_fc_lock);
861 	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
862 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
863 		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
864 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
865 		while (atomic_read(&ei->i_fc_updates)) {
866 			DEFINE_WAIT(wait);
867 
868 			prepare_to_wait(&ei->i_fc_wait, &wait,
869 						TASK_UNINTERRUPTIBLE);
870 			if (atomic_read(&ei->i_fc_updates)) {
871 				spin_unlock(&sbi->s_fc_lock);
872 				schedule();
873 				spin_lock(&sbi->s_fc_lock);
874 			}
875 			finish_wait(&ei->i_fc_wait, &wait);
876 		}
877 		spin_unlock(&sbi->s_fc_lock);
878 		ret = jbd2_submit_inode_data(ei->jinode);
879 		if (ret)
880 			return ret;
881 		spin_lock(&sbi->s_fc_lock);
882 	}
883 	spin_unlock(&sbi->s_fc_lock);
884 
885 	return ret;
886 }
887 
888 /* Wait for completion of data for all the fast commit inodes */
889 static int ext4_fc_wait_inode_data_all(journal_t *journal)
890 {
891 	struct super_block *sb = (struct super_block *)(journal->j_private);
892 	struct ext4_sb_info *sbi = EXT4_SB(sb);
893 	struct ext4_inode_info *pos, *n;
894 	int ret = 0;
895 
896 	spin_lock(&sbi->s_fc_lock);
897 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
898 		if (!ext4_test_inode_state(&pos->vfs_inode,
899 					   EXT4_STATE_FC_COMMITTING))
900 			continue;
901 		spin_unlock(&sbi->s_fc_lock);
902 
903 		ret = jbd2_wait_inode_data(journal, pos->jinode);
904 		if (ret)
905 			return ret;
906 		spin_lock(&sbi->s_fc_lock);
907 	}
908 	spin_unlock(&sbi->s_fc_lock);
909 
910 	return 0;
911 }
912 
913 /* Commit all the directory entry updates */
914 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
915 {
916 	struct super_block *sb = (struct super_block *)(journal->j_private);
917 	struct ext4_sb_info *sbi = EXT4_SB(sb);
918 	struct ext4_fc_dentry_update *fc_dentry;
919 	struct inode *inode;
920 	struct list_head *pos, *n, *fcd_pos, *fcd_n;
921 	struct ext4_inode_info *ei;
922 	int ret;
923 
924 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
925 		return 0;
926 	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
927 		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
928 					fcd_list);
929 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
930 			spin_unlock(&sbi->s_fc_lock);
931 			if (!ext4_fc_add_dentry_tlv(
932 				sb, fc_dentry->fcd_op,
933 				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
934 				fc_dentry->fcd_name.len,
935 				fc_dentry->fcd_name.name, crc)) {
936 				ret = -ENOSPC;
937 				goto lock_and_exit;
938 			}
939 			spin_lock(&sbi->s_fc_lock);
940 			continue;
941 		}
942 
943 		inode = NULL;
944 		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
945 			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
946 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
947 				inode = &ei->vfs_inode;
948 				break;
949 			}
950 		}
951 		/*
952 		 * If we don't find the inode in our list, then it was deleted,
953 		 * in which case we don't need to record its create tag.
954 		 */
955 		if (!inode)
956 			continue;
957 		spin_unlock(&sbi->s_fc_lock);
958 
959 		/*
960 		 * We first write the inode and then the create dirent. This
961 		 * allows the recovery code to create an unnamed inode first
962 		 * and then link it to a directory entry. This allows us
963 		 * to use namei.c routines almost as is and simplifies
964 		 * the recovery code.
965 		 */
966 		ret = ext4_fc_write_inode(inode, crc);
967 		if (ret)
968 			goto lock_and_exit;
969 
970 		ret = ext4_fc_write_inode_data(inode, crc);
971 		if (ret)
972 			goto lock_and_exit;
973 
974 		if (!ext4_fc_add_dentry_tlv(
975 			sb, fc_dentry->fcd_op,
976 			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
977 			fc_dentry->fcd_name.len,
978 			fc_dentry->fcd_name.name, crc)) {
979 			ret = -ENOSPC;
980 			goto lock_and_exit;
981 		}
982 
983 		spin_lock(&sbi->s_fc_lock);
984 	}
985 	return 0;
986 lock_and_exit:
987 	spin_lock(&sbi->s_fc_lock);
988 	return ret;
989 }
990 
991 static int ext4_fc_perform_commit(journal_t *journal)
992 {
993 	struct super_block *sb = (struct super_block *)(journal->j_private);
994 	struct ext4_sb_info *sbi = EXT4_SB(sb);
995 	struct ext4_inode_info *iter;
996 	struct ext4_fc_head head;
997 	struct list_head *pos;
998 	struct inode *inode;
999 	struct blk_plug plug;
1000 	int ret = 0;
1001 	u32 crc = 0;
1002 
1003 	ret = ext4_fc_submit_inode_data_all(journal);
1004 	if (ret)
1005 		return ret;
1006 
1007 	ret = ext4_fc_wait_inode_data_all(journal);
1008 	if (ret)
1009 		return ret;
1010 
1011 	blk_start_plug(&plug);
1012 	if (sbi->s_fc_bytes == 0) {
1013 		/*
1014 		 * Add a head tag only if this is the first fast commit
1015 		 * in this TID.
1016 		 */
1017 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1018 		head.fc_tid = cpu_to_le32(
1019 			sbi->s_journal->j_running_transaction->t_tid);
1020 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1021 			(u8 *)&head, &crc))
1022 			goto out;
1023 	}
1024 
1025 	spin_lock(&sbi->s_fc_lock);
1026 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1027 	if (ret) {
1028 		spin_unlock(&sbi->s_fc_lock);
1029 		goto out;
1030 	}
1031 
1032 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1033 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1034 		inode = &iter->vfs_inode;
1035 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1036 			continue;
1037 
1038 		spin_unlock(&sbi->s_fc_lock);
1039 		ret = ext4_fc_write_inode_data(inode, &crc);
1040 		if (ret)
1041 			goto out;
1042 		ret = ext4_fc_write_inode(inode, &crc);
1043 		if (ret)
1044 			goto out;
1045 		spin_lock(&sbi->s_fc_lock);
1046 		EXT4_I(inode)->i_fc_committed_subtid =
1047 			atomic_read(&sbi->s_fc_subtid);
1048 	}
1049 	spin_unlock(&sbi->s_fc_lock);
1050 
1051 	ret = ext4_fc_write_tail(sb, crc);
1052 
1053 out:
1054 	blk_finish_plug(&plug);
1055 	return ret;
1056 }
1057 
1058 /*
1059  * The main commit entry point. Performs a fast commit for transaction
1060  * commit_tid if needed. If it's not possible to perform a fast commit
1061  * due to various reasons, we fall back to full commit. Returns 0
1062  * on success, error otherwise.
1063  */
1064 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1065 {
1066 	struct super_block *sb = (struct super_block *)(journal->j_private);
1067 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1068 	int nblks = 0, ret, bsize = journal->j_blocksize;
1069 	int subtid = atomic_read(&sbi->s_fc_subtid);
1070 	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1071 	ktime_t start_time, commit_time;
1072 
1073 	trace_ext4_fc_commit_start(sb);
1074 
1075 	start_time = ktime_get();
1076 
1077 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1078 		(ext4_fc_is_ineligible(sb))) {
1079 		reason = EXT4_FC_REASON_INELIGIBLE;
1080 		goto out;
1081 	}
1082 
1083 restart_fc:
1084 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1085 	if (ret == -EALREADY) {
1086 		/* There was an ongoing commit, check if we need to restart */
1087 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1088 			commit_tid > journal->j_commit_sequence)
1089 			goto restart_fc;
1090 		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1091 		goto out;
1092 	} else if (ret) {
1093 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1094 		reason = EXT4_FC_REASON_FC_START_FAILED;
1095 		goto out;
1096 	}
1097 
1098 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1099 	ret = ext4_fc_perform_commit(journal);
1100 	if (ret < 0) {
1101 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1102 		reason = EXT4_FC_REASON_FC_FAILED;
1103 		goto out;
1104 	}
1105 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1106 	ret = jbd2_fc_wait_bufs(journal, nblks);
1107 	if (ret < 0) {
1108 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1109 		reason = EXT4_FC_REASON_FC_FAILED;
1110 		goto out;
1111 	}
1112 	atomic_inc(&sbi->s_fc_subtid);
1113 	jbd2_fc_end_commit(journal);
1114 out:
1115 	/* Has any ineligible update happened since we started? */
1116 	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1117 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1118 		reason = EXT4_FC_REASON_INELIGIBLE;
1119 	}
1120 
1121 	spin_lock(&sbi->s_fc_lock);
1122 	if (reason != EXT4_FC_REASON_OK &&
1123 		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1124 		sbi->s_fc_stats.fc_ineligible_commits++;
1125 	} else {
1126 		sbi->s_fc_stats.fc_num_commits++;
1127 		sbi->s_fc_stats.fc_numblks += nblks;
1128 	}
1129 	spin_unlock(&sbi->s_fc_lock);
1130 	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1131 	trace_ext4_fc_commit_stop(sb, nblks, reason);
1132 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1133 	/*
1134 	 * Weight the stored average higher than the latest commit time (an
1135 	 * EWMA with weight 1/4) so we don't react too strongly to swings
1136 	 */
1137 	if (likely(sbi->s_fc_avg_commit_time))
1138 		sbi->s_fc_avg_commit_time = (commit_time +
1139 				sbi->s_fc_avg_commit_time * 3) / 4;
1140 	else
1141 		sbi->s_fc_avg_commit_time = commit_time;
1142 	jbd_debug(1,
1143 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1144 		nblks, reason, subtid);
1145 	if (reason == EXT4_FC_REASON_FC_FAILED)
1146 		return jbd2_fc_end_commit_fallback(journal, commit_tid);
1147 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1148 		reason == EXT4_FC_REASON_INELIGIBLE)
1149 		return jbd2_complete_transaction(journal, commit_tid);
1150 	return 0;
1151 }
1152 
1153 /*
1154  * Fast commit cleanup routine. This is called after every fast commit and
1155  * full commit. full is true if we are called after a full commit.
1156  */
1157 static void ext4_fc_cleanup(journal_t *journal, int full)
1158 {
1159 	struct super_block *sb = journal->j_private;
1160 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1161 	struct ext4_inode_info *iter;
1162 	struct ext4_fc_dentry_update *fc_dentry;
1163 	struct list_head *pos, *n;
1164 
1165 	if (full && sbi->s_fc_bh)
1166 		sbi->s_fc_bh = NULL;
1167 
1168 	jbd2_fc_release_bufs(journal);
1169 
1170 	spin_lock(&sbi->s_fc_lock);
1171 	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1172 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1173 		list_del_init(&iter->i_fc_list);
1174 		ext4_clear_inode_state(&iter->vfs_inode,
1175 				       EXT4_STATE_FC_COMMITTING);
1176 		ext4_fc_reset_inode(&iter->vfs_inode);
1177 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1178 		smp_mb();
1179 #if (BITS_PER_LONG < 64)
1180 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1181 #else
1182 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1183 #endif
1184 	}
1185 
1186 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1187 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1188 					     struct ext4_fc_dentry_update,
1189 					     fcd_list);
1190 		list_del_init(&fc_dentry->fcd_list);
1191 		spin_unlock(&sbi->s_fc_lock);
1192 
1193 		if (fc_dentry->fcd_name.name &&
1194 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1195 			kfree(fc_dentry->fcd_name.name);
1196 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1197 		spin_lock(&sbi->s_fc_lock);
1198 	}
1199 
1200 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1201 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1202 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1203 				&sbi->s_fc_q[FC_Q_MAIN]);
1204 
1205 	sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1206 	sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1207 
1208 	if (full)
1209 		sbi->s_fc_bytes = 0;
1210 	spin_unlock(&sbi->s_fc_lock);
1211 	trace_ext4_fc_stats(sb);
1212 }
1213 
1214 /* Ext4 Replay Path Routines */
1215 
1216 /* Get length of a particular tlv */
1217 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1218 {
1219 	return le16_to_cpu(tl->fc_len);
1220 }
1221 
1222 /* Get a pointer to "value" of a tlv */
1223 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1224 {
1225 	return (u8 *)tl + sizeof(*tl);
1226 }
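
/*
 * Together, these helpers let the replay code walk the fast commit area
 * entry by entry. A minimal sketch, assuming start and end delimit the
 * valid bytes of one fast commit block:
 *
 *	u8 *cur = start;
 *	struct ext4_fc_tl *tl;
 *
 *	while (cur < end) {
 *		tl = (struct ext4_fc_tl *)cur;
 *		// dispatch on le16_to_cpu(tl->fc_tag); the value bytes
 *		// start at ext4_fc_tag_val(tl)
 *		cur += sizeof(*tl) + ext4_fc_tag_len(tl);
 *	}
 */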
1227 
1228 /* Helper struct for dentry replay routines */
1229 struct dentry_info_args {
1230 	int parent_ino, dname_len, ino, inode_len;
1231 	char *dname;
1232 };
1233 
1234 static inline void tl_to_darg(struct dentry_info_args *darg,
1235 				struct  ext4_fc_tl *tl)
1236 {
1237 	struct ext4_fc_dentry_info *fcd;
1238 
1239 	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1240 
1241 	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1242 	darg->ino = le32_to_cpu(fcd->fc_ino);
1243 	darg->dname = fcd->fc_dname;
1244 	darg->dname_len = ext4_fc_tag_len(tl) -
1245 			sizeof(struct ext4_fc_dentry_info);
1246 }
1247 
1248 /* Unlink replay function */
1249 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1250 {
1251 	struct inode *inode, *old_parent;
1252 	struct qstr entry;
1253 	struct dentry_info_args darg;
1254 	int ret = 0;
1255 
1256 	tl_to_darg(&darg, tl);
1257 
1258 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1259 			darg.parent_ino, darg.dname_len);
1260 
1261 	entry.name = darg.dname;
1262 	entry.len = darg.dname_len;
1263 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1264 
1265 	if (IS_ERR_OR_NULL(inode)) {
1266 		jbd_debug(1, "Inode %d not found", darg.ino);
1267 		return 0;
1268 	}
1269 
1270 	old_parent = ext4_iget(sb, darg.parent_ino,
1271 				EXT4_IGET_NORMAL);
1272 	if (IS_ERR_OR_NULL(old_parent)) {
1273 		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1274 		iput(inode);
1275 		return 0;
1276 	}
1277 
1278 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1279 	/* -ENOENT ok because it might not exist anymore. */
1280 	if (ret == -ENOENT)
1281 		ret = 0;
1282 	iput(old_parent);
1283 	iput(inode);
1284 	return ret;
1285 }
1286 
1287 static int ext4_fc_replay_link_internal(struct super_block *sb,
1288 				struct dentry_info_args *darg,
1289 				struct inode *inode)
1290 {
1291 	struct inode *dir = NULL;
1292 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1293 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1294 	int ret = 0;
1295 
1296 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1297 	if (IS_ERR(dir)) {
1298 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1299 		dir = NULL;
1300 		goto out;
1301 	}
1302 
1303 	dentry_dir = d_obtain_alias(dir);
1304 	if (IS_ERR(dentry_dir)) {
1305 		jbd_debug(1, "Failed to obtain dentry");
1306 		dentry_dir = NULL;
1307 		goto out;
1308 	}
1309 
1310 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1311 	if (!dentry_inode) {
1312 		jbd_debug(1, "Inode dentry not created.");
1313 		ret = -ENOMEM;
1314 		goto out;
1315 	}
1316 
1317 	ret = __ext4_link(dir, inode, dentry_inode);
1318 	/*
1319 	 * It's possible that link already existed since data blocks
1320 	 * for the dir in question got persisted before we crashed OR
1321 	 * we replayed this tag and crashed before the entire replay
1322 	 * could complete.
1323 	 */
1324 	if (ret && ret != -EEXIST) {
1325 		jbd_debug(1, "Failed to link\n");
1326 		goto out;
1327 	}
1328 
1329 	ret = 0;
1330 out:
1331 	if (dentry_dir) {
1332 		d_drop(dentry_dir);
1333 		dput(dentry_dir);
1334 	} else if (dir) {
1335 		iput(dir);
1336 	}
1337 	if (dentry_inode) {
1338 		d_drop(dentry_inode);
1339 		dput(dentry_inode);
1340 	}
1341 
1342 	return ret;
1343 }
1344 
1345 /* Link replay function */
1346 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1347 {
1348 	struct inode *inode;
1349 	struct dentry_info_args darg;
1350 	int ret = 0;
1351 
1352 	tl_to_darg(&darg, tl);
1353 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1354 			darg.parent_ino, darg.dname_len);
1355 
1356 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1357 	if (IS_ERR_OR_NULL(inode)) {
1358 		jbd_debug(1, "Inode not found.");
1359 		return 0;
1360 	}
1361 
1362 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1363 	iput(inode);
1364 	return ret;
1365 }
1366 
1367 /*
1368  * Record all the modified inodes during replay. We use this later to set up
1369  * block bitmaps correctly.
1370  */
1371 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1372 {
1373 	struct ext4_fc_replay_state *state;
1374 	int i;
1375 
1376 	state = &EXT4_SB(sb)->s_fc_replay_state;
1377 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1378 		if (state->fc_modified_inodes[i] == ino)
1379 			return 0;
1380 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1381 		state->fc_modified_inodes_size +=
1382 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1383 		state->fc_modified_inodes = krealloc(
1384 					state->fc_modified_inodes, sizeof(int) *
1385 					state->fc_modified_inodes_size,
1386 					GFP_KERNEL);
1387 		if (!state->fc_modified_inodes)
1388 			return -ENOMEM;
1389 	}
1390 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1391 	return 0;
1392 }
1393 
1394 /*
1395  * Inode replay function
1396  */
1397 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1398 {
1399 	struct ext4_fc_inode *fc_inode;
1400 	struct ext4_inode *raw_inode;
1401 	struct ext4_inode *raw_fc_inode;
1402 	struct inode *inode = NULL;
1403 	struct ext4_iloc iloc;
1404 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1405 	struct ext4_extent_header *eh;
1406 
1407 	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1408 
1409 	ino = le32_to_cpu(fc_inode->fc_ino);
1410 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1411 
1412 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1413 	if (!IS_ERR_OR_NULL(inode)) {
1414 		ext4_ext_clear_bb(inode);
1415 		iput(inode);
1416 	}
1417 
1418 	ext4_fc_record_modified_inode(sb, ino);
1419 
1420 	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1421 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1422 	if (ret)
1423 		goto out;
1424 
1425 	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1426 	raw_inode = ext4_raw_inode(&iloc);
1427 
1428 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1429 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1430 		inode_len - offsetof(struct ext4_inode, i_generation));
1431 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1432 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1433 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1434 			memset(eh, 0, sizeof(*eh));
1435 			eh->eh_magic = EXT4_EXT_MAGIC;
1436 			eh->eh_max = cpu_to_le16(
1437 				(sizeof(raw_inode->i_block) -
1438 				 sizeof(struct ext4_extent_header))
1439 				 / sizeof(struct ext4_extent));
1440 		}
1441 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1442 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1443 			sizeof(raw_inode->i_block));
1444 	}
1445 
1446 	/* Immediately update the inode on disk. */
1447 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1448 	if (ret)
1449 		goto out;
1450 	ret = sync_dirty_buffer(iloc.bh);
1451 	if (ret)
1452 		goto out;
1453 	ret = ext4_mark_inode_used(sb, ino);
1454 	if (ret)
1455 		goto out;
1456 
1457 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1458 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1459 	if (IS_ERR_OR_NULL(inode)) {
1460 		jbd_debug(1, "Inode not found.");
1461 		return -EFSCORRUPTED;
1462 	}
1463 
1464 	/*
1465 	 * Our allocator could have made different decisions than before
1466 	 * crashing. This should be fixed but until then, we re-calculate
1467 	 * the number of blocks the inode occupies.
1468 	 */
1469 	ext4_ext_replay_set_iblocks(inode);
1470 
1471 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1472 	ext4_reset_inode_seed(inode);
1473 
1474 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1475 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1476 	sync_dirty_buffer(iloc.bh);
1477 	brelse(iloc.bh);
1478 out:
1479 	iput(inode);
1480 	if (!ret)
1481 		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1482 
1483 	return 0;
1484 }
1485 
1486 /*
1487  * Dentry create replay function.
1488  *
1489  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
1490  * inode for which we are trying to create a dentry here should already
1491  * have been replayed before we start here.
1492  */
1493 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1494 {
1495 	int ret = 0;
1496 	struct inode *inode = NULL;
1497 	struct inode *dir = NULL;
1498 	struct dentry_info_args darg;
1499 
1500 	tl_to_darg(&darg, tl);
1501 
1502 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1503 			darg.parent_ino, darg.dname_len);
1504 
1505 	/* This takes care of updating the group descriptor and other metadata */
1506 	ret = ext4_mark_inode_used(sb, darg.ino);
1507 	if (ret)
1508 		goto out;
1509 
1510 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1511 	if (IS_ERR_OR_NULL(inode)) {
1512 		jbd_debug(1, "inode %d not found.", darg.ino);
1513 		inode = NULL;
1514 		ret = -EINVAL;
1515 		goto out;
1516 	}
1517 
1518 	if (S_ISDIR(inode->i_mode)) {
1519 		/*
1520 		 * If we are creating a directory, we need to make sure that the
1521 		 * dot and dot dot dirents are set up properly.
1522 		 */
1523 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1524 		if (IS_ERR_OR_NULL(dir)) {
1525 			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1526 			goto out;
1527 		}
1528 		ret = ext4_init_new_dir(NULL, dir, inode);
1529 		iput(dir);
1530 		if (ret) {
1531 			ret = 0;
1532 			goto out;
1533 		}
1534 	}
1535 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1536 	if (ret)
1537 		goto out;
1538 	set_nlink(inode, 1);
1539 	ext4_mark_inode_dirty(NULL, inode);
1540 out:
1541 	if (inode)
1542 		iput(inode);
1543 	return ret;
1544 }
1545 
1546 /*
1547  * Record physical disk regions that the fast commit area reports as in use.
1548  * Our simple replay phase allocator excludes these regions from allocation.
1549  */
1550 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1551 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1552 {
1553 	struct ext4_fc_replay_state *state;
1554 	struct ext4_fc_alloc_region *region;
1555 
1556 	state = &EXT4_SB(sb)->s_fc_replay_state;
1557 	if (state->fc_regions_used == state->fc_regions_size) {
1558 		state->fc_regions_size +=
1559 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1560 		state->fc_regions = krealloc(
1561 					state->fc_regions,
1562 					state->fc_regions_size *
1563 					sizeof(struct ext4_fc_alloc_region),
1564 					GFP_KERNEL);
1565 		if (!state->fc_regions)
1566 			return -ENOMEM;
1567 	}
1568 	region = &state->fc_regions[state->fc_regions_used++];
1569 	region->ino = ino;
1570 	region->lblk = lblk;
1571 	region->pblk = pblk;
1572 	region->len = len;
1573 
1574 	return 0;
1575 }
1576 
1577 /* Replay add range tag */
1578 static int ext4_fc_replay_add_range(struct super_block *sb,
1579 				struct ext4_fc_tl *tl)
1580 {
1581 	struct ext4_fc_add_range *fc_add_ex;
1582 	struct ext4_extent newex, *ex;
1583 	struct inode *inode;
1584 	ext4_lblk_t start, cur;
1585 	int remaining, len;
1586 	ext4_fsblk_t start_pblk;
1587 	struct ext4_map_blocks map;
1588 	struct ext4_ext_path *path = NULL;
1589 	int ret;
1590 
1591 	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1592 	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1593 
1594 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1595 		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1596 		ext4_ext_get_actual_len(ex));
1597 
1598 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1599 				EXT4_IGET_NORMAL);
1600 	if (IS_ERR_OR_NULL(inode)) {
1601 		jbd_debug(1, "Inode not found.");
1602 		return 0;
1603 	}
1604 
1605 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1606 
1607 	start = le32_to_cpu(ex->ee_block);
1608 	start_pblk = ext4_ext_pblock(ex);
1609 	len = ext4_ext_get_actual_len(ex);
1610 
1611 	cur = start;
1612 	remaining = len;
1613 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1614 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1615 		  inode->i_ino);
1616 
1617 	while (remaining > 0) {
1618 		map.m_lblk = cur;
1619 		map.m_len = remaining;
1620 		map.m_pblk = 0;
1621 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1622 
1623 		if (ret < 0) {
1624 			iput(inode);
1625 			return 0;
1626 		}
1627 
1628 		if (ret == 0) {
1629 			/* Range is not mapped */
1630 			path = ext4_find_extent(inode, cur, NULL, 0);
1631 			if (IS_ERR(path)) {
1632 				iput(inode);
1633 				return 0;
1634 			}
1635 			memset(&newex, 0, sizeof(newex));
1636 			newex.ee_block = cpu_to_le32(cur);
1637 			ext4_ext_store_pblock(
1638 				&newex, start_pblk + cur - start);
1639 			newex.ee_len = cpu_to_le16(map.m_len);
1640 			if (ext4_ext_is_unwritten(ex))
1641 				ext4_ext_mark_unwritten(&newex);
1642 			down_write(&EXT4_I(inode)->i_data_sem);
1643 			ret = ext4_ext_insert_extent(
1644 				NULL, inode, &path, &newex, 0);
1645 			up_write((&EXT4_I(inode)->i_data_sem));
1646 			ext4_ext_drop_refs(path);
1647 			kfree(path);
1648 			if (ret) {
1649 				iput(inode);
1650 				return 0;
1651 			}
1652 			goto next;
1653 		}
1654 
1655 		if (start_pblk + cur - start != map.m_pblk) {
1656 			/*
1657 			 * Logical to physical mapping changed. This can happen
1658 			 * if this range was removed and then reallocated to
1659 			 * map to new physical blocks during a fast commit.
1660 			 */
1661 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1662 					ext4_ext_is_unwritten(ex),
1663 					start_pblk + cur - start);
1664 			if (ret) {
1665 				iput(inode);
1666 				return 0;
1667 			}
1668 			/*
1669 			 * Mark the old blocks as free since they aren't used
1670 			 * anymore. We maintain an array of all the modified
1671 			 * inodes. In case these blocks are still used at either
1672 			 * a different logical range in the same inode or in
1673 			 * some different inode, we will mark them as allocated
1674 			 * at the end of the FC replay using our array of
1675 			 * modified inodes.
1676 			 */
1677 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1678 			goto next;
1679 		}
1680 
1681 		/* Range is mapped and needs a state change */
1682 		jbd_debug(1, "Converting from %d to %d %lld",
1683 				map.m_flags & EXT4_MAP_UNWRITTEN,
1684 			ext4_ext_is_unwritten(ex), map.m_pblk);
1685 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1686 					ext4_ext_is_unwritten(ex), map.m_pblk);
1687 		if (ret) {
1688 			iput(inode);
1689 			return 0;
1690 		}
1691 		/*
1692 		 * We may have split the extent tree while toggling the state.
1693 		 * Try to shrink the extent tree now.
1694 		 */
1695 		ext4_ext_replay_shrink_inode(inode, start + len);
1696 next:
1697 		cur += map.m_len;
1698 		remaining -= map.m_len;
1699 	}
1700 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1701 					sb->s_blocksize_bits);
1702 	iput(inode);
1703 	return 0;
1704 }
1705 
1706 /* Replay DEL_RANGE tag */
1707 static int
1708 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1709 {
1710 	struct inode *inode;
1711 	struct ext4_fc_del_range *lrange;
1712 	struct ext4_map_blocks map;
1713 	ext4_lblk_t cur, remaining;
1714 	int ret;
1715 
1716 	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1717 	cur = le32_to_cpu(lrange->fc_lblk);
1718 	remaining = le32_to_cpu(lrange->fc_len);
1719 
1720 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1721 		le32_to_cpu(lrange->fc_ino), cur, remaining);
1722 
1723 	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1724 	if (IS_ERR_OR_NULL(inode)) {
1725 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1726 		return 0;
1727 	}
1728 
1729 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1730 
1731 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1732 			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1733 			le32_to_cpu(lrange->fc_len));
1734 	while (remaining > 0) {
1735 		map.m_lblk = cur;
1736 		map.m_len = remaining;
1737 
1738 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1739 		if (ret < 0) {
1740 			iput(inode);
1741 			return 0;
1742 		}
1743 		if (ret > 0) {
1744 			remaining -= ret;
1745 			cur += ret;
1746 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1747 		} else {
1748 			remaining -= map.m_len;
1749 			cur += map.m_len;
1750 		}
1751 	}
1752 
1753 	ret = ext4_punch_hole(inode,
1754 		(loff_t)le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1755 		(loff_t)le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1756 	if (ret)
1757 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
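	/* Trim the extent tree so that it does not extend beyond i_size. */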
1758 	ext4_ext_replay_shrink_inode(inode,
1759 		i_size_read(inode) >> sb->s_blocksize_bits);
1760 	ext4_mark_inode_dirty(NULL, inode);
1761 	iput(inode);
1762 
1763 	return 0;
1764 }
1765 
1766 static inline const char *tag2str(u16 tag)
1767 {
1768 	switch (tag) {
1769 	case EXT4_FC_TAG_LINK:
1770 		return "TAG_ADD_ENTRY";
1771 	case EXT4_FC_TAG_UNLINK:
1772 		return "TAG_DEL_ENTRY";
1773 	case EXT4_FC_TAG_ADD_RANGE:
1774 		return "TAG_ADD_RANGE";
1775 	case EXT4_FC_TAG_CREAT:
1776 		return "TAG_CREAT_DENTRY";
1777 	case EXT4_FC_TAG_DEL_RANGE:
1778 		return "TAG_DEL_RANGE";
1779 	case EXT4_FC_TAG_INODE:
1780 		return "TAG_INODE";
1781 	case EXT4_FC_TAG_PAD:
1782 		return "TAG_PAD";
1783 	case EXT4_FC_TAG_TAIL:
1784 		return "TAG_TAIL";
1785 	case EXT4_FC_TAG_HEAD:
1786 		return "TAG_HEAD";
1787 	default:
1788 		return "TAG_ERROR";
1789 	}
1790 }
1791 
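/*
 * At the end of the replay phase, walk every inode that was touched by a fast
 * commit tag and mark both its extent tree (index) blocks and its mapped data
 * blocks as in use in the in-memory block bitmaps. This re-allocates any
 * blocks that the individual replay handlers temporarily marked as free.
 */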
1792 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1793 {
1794 	struct ext4_fc_replay_state *state;
1795 	struct inode *inode;
1796 	struct ext4_ext_path *path = NULL;
1797 	struct ext4_map_blocks map;
1798 	int i, ret, j;
1799 	ext4_lblk_t cur, end;
1800 
1801 	state = &EXT4_SB(sb)->s_fc_replay_state;
1802 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1803 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1804 			EXT4_IGET_NORMAL);
1805 		if (IS_ERR_OR_NULL(inode)) {
1806 			jbd_debug(1, "Inode %d not found.",
1807 				state->fc_modified_inodes[i]);
1808 			continue;
1809 		}
1810 		cur = 0;
1811 		end = EXT_MAX_BLOCKS;
1812 		while (cur < end) {
1813 			map.m_lblk = cur;
1814 			map.m_len = end - cur;
1815 
1816 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1817 			if (ret < 0)
1818 				break;
1819 
1820 			if (ret > 0) {
1821 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1822 				if (!IS_ERR_OR_NULL(path)) {
1823 					for (j = 0; j < path->p_depth; j++)
1824 						ext4_mb_mark_bb(inode->i_sb,
1825 							path[j].p_block, 1, 1);
1826 					ext4_ext_drop_refs(path);
1827 					kfree(path);
1828 				}
1829 				cur += ret;
1830 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1831 							map.m_len, 1);
1832 			} else {
1833 				cur = cur + (map.m_len ? map.m_len : 1);
1834 			}
1835 		}
1836 		iput(inode);
1837 	}
1838 }
1839 
1840 /*
1841  * Check if a block is in the excluded regions for block allocation. The
1842  * simple allocator that runs during the replay phase calls this function
1843  * to check whether it is okay to use a block.
1844  */
1845 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1846 {
1847 	int i;
1848 	struct ext4_fc_replay_state *state;
1849 
1850 	state = &EXT4_SB(sb)->s_fc_replay_state;
1851 	for (i = 0; i < state->fc_regions_valid; i++) {
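		/* Entries with a zero inode or length are unused; skip them. */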
1852 		if (state->fc_regions[i].ino == 0 ||
1853 			state->fc_regions[i].len == 0)
1854 			continue;
1855 		if (blk >= state->fc_regions[i].pblk &&
1856 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1857 			return true;
1858 	}
1859 	return false;
1860 }
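/*
 * A minimal usage sketch (not the actual mballoc code path; candidate_blk is
 * a hypothetical local): a replay-time allocator scanning for a free block is
 * expected to reject candidates that fall inside a recorded region, since
 * those blocks must stay where they are for the ADD_RANGE tags that reference
 * them:
 *
 *	if (ext4_fc_replay_check_excluded(sb, candidate_blk))
 *		continue;	// try the next candidate block
 */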
1861 
1862 /* Cleanup function called after replay */
1863 void ext4_fc_replay_cleanup(struct super_block *sb)
1864 {
1865 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1866 
1867 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1868 	kfree(sbi->s_fc_replay_state.fc_regions);
1869 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1870 }
1871 
1872 /*
1873  * Recovery Scan phase handler
1874  *
1875  * This function is called during the scan phase and is responsible
1876  * for doing following things:
1877  * for doing the following things:
1878  * - Make sure the fast commit area has valid tags for replay
1879  * - Count the number of tags that need to be replayed by the replay handler
1880  * - Create a list of excluded blocks for allocation during replay phase
1881  *
1882  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1883  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1884  * to indicate that scan has finished and JBD2 can now start replay phase.
1885  * It returns a negative error to indicate that there was an error. At the end
1886  * On error, a negative error code is returned. At the end of a successful
1887  * scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set to the number
1888  * of tags that need to be replayed during the replay phase.
 */
1889 static int ext4_fc_replay_scan(journal_t *journal,
1890 				struct buffer_head *bh, int off,
1891 				tid_t expected_tid)
1892 {
1893 	struct super_block *sb = journal->j_private;
1894 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1895 	struct ext4_fc_replay_state *state;
1896 	int ret = JBD2_FC_REPLAY_CONTINUE;
1897 	struct ext4_fc_add_range *ext;
1898 	struct ext4_fc_tl *tl;
1899 	struct ext4_fc_tail *tail;
1900 	__u8 *start, *end;
1901 	struct ext4_fc_head *head;
1902 	struct ext4_extent *ex;
1903 
1904 	state = &sbi->s_fc_replay_state;
1905 
1906 	start = (u8 *)bh->b_data;
1907 	end = (u8 *)bh->b_data + journal->j_blocksize - 1;
1908 
1909 	if (state->fc_replay_expected_off == 0) {
1910 		state->fc_cur_tag = 0;
1911 		state->fc_replay_num_tags = 0;
1912 		state->fc_crc = 0;
1913 		state->fc_regions = NULL;
1914 		state->fc_regions_valid = state->fc_regions_used =
1915 			state->fc_regions_size = 0;
1916 		/* Bail out early if the area does not begin with a HEAD tag */
1917 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1918 			!= EXT4_FC_TAG_HEAD)
1919 			return 0;
1920 	}
1921 
1922 	if (off != state->fc_replay_expected_off) {
1923 		ret = -EFSCORRUPTED;
1924 		goto out_err;
1925 	}
1926 
1927 	state->fc_replay_expected_off++;
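	/*
	 * Walk every TLV packed into this block and fold it into the scan
	 * state (running CRC, tag count, excluded regions). Each TLV occupies
	 * sizeof(struct ext4_fc_tl) header bytes plus ext4_fc_tag_len(tl)
	 * value bytes; fc_for_each_tl() (see ext4.h) steps through the buffer
	 * one such TLV at a time.
	 */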
1928 	fc_for_each_tl(start, end, tl) {
1929 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1930 			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1931 		switch (le16_to_cpu(tl->fc_tag)) {
1932 		case EXT4_FC_TAG_ADD_RANGE:
1933 			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1934 			ex = (struct ext4_extent *)&ext->fc_ex;
1935 			ret = ext4_fc_record_regions(sb,
1936 				le32_to_cpu(ext->fc_ino),
1937 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1938 				ext4_ext_get_actual_len(ex));
1939 			if (ret < 0)
1940 				break;
1941 			ret = JBD2_FC_REPLAY_CONTINUE;
1942 			fallthrough;
1943 		case EXT4_FC_TAG_DEL_RANGE:
1944 		case EXT4_FC_TAG_LINK:
1945 		case EXT4_FC_TAG_UNLINK:
1946 		case EXT4_FC_TAG_CREAT:
1947 		case EXT4_FC_TAG_INODE:
1948 		case EXT4_FC_TAG_PAD:
1949 			state->fc_cur_tag++;
1950 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1951 					sizeof(*tl) + ext4_fc_tag_len(tl));
1952 			break;
1953 		case EXT4_FC_TAG_TAIL:
1954 			state->fc_cur_tag++;
1955 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
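			/*
			 * The running CRC covers everything up to, but not
			 * including, the tail's own fc_crc field, so fold in
			 * the tail header before comparing against the
			 * stored checksum.
			 */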
1956 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1957 						sizeof(*tl) +
1958 						offsetof(struct ext4_fc_tail,
1959 						fc_crc));
1960 			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1961 				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1962 				state->fc_replay_num_tags = state->fc_cur_tag;
1963 				state->fc_regions_valid =
1964 					state->fc_regions_used;
1965 			} else {
1966 				ret = state->fc_replay_num_tags ?
1967 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1968 			}
1969 			state->fc_crc = 0;
1970 			break;
1971 		case EXT4_FC_TAG_HEAD:
1972 			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1973 			if (le32_to_cpu(head->fc_features) &
1974 				~EXT4_FC_SUPPORTED_FEATURES) {
1975 				ret = -EOPNOTSUPP;
1976 				break;
1977 			}
1978 			if (le32_to_cpu(head->fc_tid) != expected_tid) {
1979 				ret = JBD2_FC_REPLAY_STOP;
1980 				break;
1981 			}
1982 			state->fc_cur_tag++;
1983 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1984 					sizeof(*tl) + ext4_fc_tag_len(tl));
1985 			break;
1986 		default:
1987 			ret = state->fc_replay_num_tags ?
1988 				JBD2_FC_REPLAY_STOP : -ECANCELED;
1989 		}
1990 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1991 			break;
1992 	}
1993 
1994 out_err:
1995 	trace_ext4_fc_replay_scan(sb, ret, off);
1996 	return ret;
1997 }
1998 
1999 /*
2000  * Main recovery path entry point.
2001  * The meaning of the return codes is the same as for ext4_fc_replay_scan().
2002  */
2003 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2004 				enum passtype pass, int off, tid_t expected_tid)
2005 {
2006 	struct super_block *sb = journal->j_private;
2007 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2008 	struct ext4_fc_tl *tl;
2009 	__u8 *start, *end;
2010 	int ret = JBD2_FC_REPLAY_CONTINUE;
2011 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2012 	struct ext4_fc_tail *tail;
2013 
2014 	if (pass == PASS_SCAN) {
2015 		state->fc_current_pass = PASS_SCAN;
2016 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2017 	}
2018 
2019 	if (state->fc_current_pass != pass) {
2020 		state->fc_current_pass = pass;
2021 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2022 	}
2023 	if (!state->fc_replay_num_tags) {
2024 		jbd_debug(1, "Replay stops\n");
2025 		ext4_fc_set_bitmaps_and_counters(sb);
2026 		return 0;
2027 	}
2028 
2029 #ifdef CONFIG_EXT4_DEBUG
2030 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2031 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2032 		return JBD2_FC_REPLAY_STOP;
2033 	}
2034 #endif
2035 
2036 	start = (u8 *)bh->b_data;
2037 	end = (u8 *)bh->b_data + journal->j_blocksize - 1;
2038 
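	/* Dispatch each TLV in this block to its tag-specific replay handler. */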
2039 	fc_for_each_tl(start, end, tl) {
2040 		if (state->fc_replay_num_tags == 0) {
2041 			ret = JBD2_FC_REPLAY_STOP;
2042 			ext4_fc_set_bitmaps_and_counters(sb);
2043 			break;
2044 		}
2045 		jbd_debug(3, "Replay phase, tag:%s\n",
2046 				tag2str(le16_to_cpu(tl->fc_tag)));
2047 		state->fc_replay_num_tags--;
2048 		switch (le16_to_cpu(tl->fc_tag)) {
2049 		case EXT4_FC_TAG_LINK:
2050 			ret = ext4_fc_replay_link(sb, tl);
2051 			break;
2052 		case EXT4_FC_TAG_UNLINK:
2053 			ret = ext4_fc_replay_unlink(sb, tl);
2054 			break;
2055 		case EXT4_FC_TAG_ADD_RANGE:
2056 			ret = ext4_fc_replay_add_range(sb, tl);
2057 			break;
2058 		case EXT4_FC_TAG_CREAT:
2059 			ret = ext4_fc_replay_create(sb, tl);
2060 			break;
2061 		case EXT4_FC_TAG_DEL_RANGE:
2062 			ret = ext4_fc_replay_del_range(sb, tl);
2063 			break;
2064 		case EXT4_FC_TAG_INODE:
2065 			ret = ext4_fc_replay_inode(sb, tl);
2066 			break;
2067 		case EXT4_FC_TAG_PAD:
2068 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2069 				ext4_fc_tag_len(tl), 0);
2070 			break;
2071 		case EXT4_FC_TAG_TAIL:
2072 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2073 				ext4_fc_tag_len(tl), 0);
2074 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2075 			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2076 			break;
2077 		case EXT4_FC_TAG_HEAD:
2078 			break;
2079 		default:
2080 			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2081 				ext4_fc_tag_len(tl), 0);
2082 			ret = -ECANCELED;
2083 			break;
2084 		}
2085 		if (ret < 0)
2086 			break;
2087 		ret = JBD2_FC_REPLAY_CONTINUE;
2088 	}
2089 	return ret;
2090 }
2091 
2092 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2093 {
2094 	int num_fc_blocks;
2095 
2096 	/*
2097 	 * We set the replay callback even if fast commit is disabled because we
2098 	 * could still have fast commit blocks that need to be replayed even if
2099 	 * fast commit has now been turned off.
2100 	 */
2101 	journal->j_fc_replay_callback = ext4_fc_replay;
2102 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2103 		return;
2104 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2105 	if (!buffer_uptodate(journal->j_sb_buffer) &&
2106 	    ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
2107 			      true)) {
2108 		ext4_msg(sb, KERN_ERR, "I/O error on journal");
2109 		return;
2110 	}
2111 	num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
2112 	if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
2113 					EXT4_NUM_FC_BLKS)) {
2114 		pr_warn("Error while enabling fast commits, turning off.\n");
2115 		ext4_clear_feature_fast_commit(sb);
2116 	}
2117 }
2118 
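/*
 * Human-readable names for the fast commit ineligibility reasons, indexed by
 * the EXT4_FC_REASON_* values; this array must stay in sync with that enum,
 * since ext4_fc_info_show() below reads EXT4_FC_REASON_MAX entries.
 */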
2119 static const char * const fc_ineligible_reasons[] = {
2120 	"Extended attributes changed",
2121 	"Cross rename",
2122 	"Journal flag changed",
2123 	"Insufficient memory",
2124 	"Swap boot",
2125 	"Resize",
2126 	"Dir renamed",
2127 	"Falloc range op",
2128 	"FC Commit Failed"
2129 };
2130 
2131 int ext4_fc_info_show(struct seq_file *seq, void *v)
2132 {
2133 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2134 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2135 	int i;
2136 
2137 	if (v != SEQ_START_TOKEN)
2138 		return 0;
2139 
2140 	seq_printf(seq,
2141 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2142 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2143 		   stats->fc_numblks,
2144 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2145 	seq_puts(seq, "Ineligible reasons:\n");
2146 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2147 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2148 			stats->fc_ineligible_reason_count[i]);
2149 
2150 	return 0;
2151 }
2152 
2153 int __init ext4_fc_init_dentry_cache(void)
2154 {
2155 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2156 					   SLAB_RECLAIM_ACCOUNT);
2157 
2158 	if (ext4_fc_dentry_cachep == NULL)
2159 		return -ENOMEM;
2160 
2161 	return 0;
2162 }
2163