xref: /openbmc/linux/fs/ext4/fast_commit.c (revision 3cea11cd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g. extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called from much higher
111  *    level routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118 
119 #include <trace/events/ext4.h>
/*
 * Slab cache from which the struct ext4_fc_dentry_update nodes queued by
 * __track_dentry_update() are allocated.
 */
static struct kmem_cache *ext4_fc_dentry_cachep;
121 
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124 	BUFFER_TRACE(bh, "");
125 	if (uptodate) {
126 		ext4_debug("%s: Block %lld up-to-date",
127 			   __func__, bh->b_blocknr);
128 		set_buffer_uptodate(bh);
129 	} else {
130 		ext4_debug("%s: Block %lld not up-to-date",
131 			   __func__, bh->b_blocknr);
132 		clear_buffer_uptodate(bh);
133 	}
134 
135 	unlock_buffer(bh);
136 }
137 
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140 	struct ext4_inode_info *ei = EXT4_I(inode);
141 
142 	ei->i_fc_lblk_start = 0;
143 	ei->i_fc_lblk_len = 0;
144 }
145 
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148 	struct ext4_inode_info *ei = EXT4_I(inode);
149 
150 	ext4_fc_reset_inode(inode);
151 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 	INIT_LIST_HEAD(&ei->i_fc_list);
153 	init_waitqueue_head(&ei->i_fc_wait);
154 	atomic_set(&ei->i_fc_updates, 0);
155 	ei->i_fc_committed_subtid = 0;
156 }
157 
158 /*
159  * Inform Ext4's fast about start of an inode update
160  *
161  * This function is called by the high level call VFS callbacks before
162  * performing any inode update. This function blocks if there's an ongoing
163  * fast commit on the inode in question.
164  */
165 void ext4_fc_start_update(struct inode *inode)
166 {
167 	struct ext4_inode_info *ei = EXT4_I(inode);
168 
169 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
170 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
171 		return;
172 
173 restart:
174 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
175 	if (list_empty(&ei->i_fc_list))
176 		goto out;
177 
178 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
179 		wait_queue_head_t *wq;
180 #if (BITS_PER_LONG < 64)
181 		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
182 				EXT4_STATE_FC_COMMITTING);
183 		wq = bit_waitqueue(&ei->i_state_flags,
184 				   EXT4_STATE_FC_COMMITTING);
185 #else
186 		DEFINE_WAIT_BIT(wait, &ei->i_flags,
187 				EXT4_STATE_FC_COMMITTING);
188 		wq = bit_waitqueue(&ei->i_flags,
189 				   EXT4_STATE_FC_COMMITTING);
190 #endif
191 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
192 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
193 		schedule();
194 		finish_wait(wq, &wait.wq_entry);
195 		goto restart;
196 	}
197 out:
198 	atomic_inc(&ei->i_fc_updates);
199 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
200 }
201 
202 /*
203  * Stop inode update and wake up waiting fast commits if any.
204  */
205 void ext4_fc_stop_update(struct inode *inode)
206 {
207 	struct ext4_inode_info *ei = EXT4_I(inode);
208 
209 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
210 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
211 		return;
212 
213 	if (atomic_dec_and_test(&ei->i_fc_updates))
214 		wake_up_all(&ei->i_fc_wait);
215 }
216 
217 /*
218  * Remove inode from fast commit list. If the inode is being committed
219  * we wait until inode commit is done.
220  */
221 void ext4_fc_del(struct inode *inode)
222 {
223 	struct ext4_inode_info *ei = EXT4_I(inode);
224 
225 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
226 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
227 		return;
228 
229 restart:
230 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
231 	if (list_empty(&ei->i_fc_list)) {
232 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
233 		return;
234 	}
235 
236 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
237 		wait_queue_head_t *wq;
238 #if (BITS_PER_LONG < 64)
239 		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
240 				EXT4_STATE_FC_COMMITTING);
241 		wq = bit_waitqueue(&ei->i_state_flags,
242 				   EXT4_STATE_FC_COMMITTING);
243 #else
244 		DEFINE_WAIT_BIT(wait, &ei->i_flags,
245 				EXT4_STATE_FC_COMMITTING);
246 		wq = bit_waitqueue(&ei->i_flags,
247 				   EXT4_STATE_FC_COMMITTING);
248 #endif
249 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
250 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 		schedule();
252 		finish_wait(wq, &wait.wq_entry);
253 		goto restart;
254 	}
255 	if (!list_empty(&ei->i_fc_list))
256 		list_del_init(&ei->i_fc_list);
257 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
258 }
259 
260 /*
261  * Mark file system as fast commit ineligible. This means that next commit
262  * operation would result in a full jbd2 commit.
263  */
264 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
265 {
266 	struct ext4_sb_info *sbi = EXT4_SB(sb);
267 
268 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
269 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
270 		return;
271 
272 	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
273 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
274 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
275 }
276 
277 /*
278  * Start a fast commit ineligible update. Any commits that happen while
279  * such an operation is in progress fall back to full commits.
280  */
281 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
282 {
283 	struct ext4_sb_info *sbi = EXT4_SB(sb);
284 
285 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
286 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
287 		return;
288 
289 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
290 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
291 	atomic_inc(&sbi->s_fc_ineligible_updates);
292 }
293 
294 /*
295  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
296  * to ensure that after stopping the ineligible update, at least one full
297  * commit takes place.
298  */
299 void ext4_fc_stop_ineligible(struct super_block *sb)
300 {
301 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
302 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
303 		return;
304 
305 	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
306 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
307 }
308 
309 static inline int ext4_fc_is_ineligible(struct super_block *sb)
310 {
311 	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
312 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
313 }
314 
315 /*
316  * Generic fast commit tracking function. If this is the first time this we are
317  * called after a full commit, we initialize fast commit fields and then call
318  * __fc_track_fn() with update = 0. If we have already been called after a full
319  * commit, we pass update = 1. Based on that, the track function can determine
320  * if it needs to track a field for the first time or if it needs to just
321  * update the previously tracked value.
322  *
323  * If enqueue is set, this function enqueues the inode in fast commit list.
324  */
325 static int ext4_fc_track_template(
326 	struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
327 	void *args, int enqueue)
328 {
329 	tid_t running_txn_tid;
330 	bool update = false;
331 	struct ext4_inode_info *ei = EXT4_I(inode);
332 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333 	int ret;
334 
335 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
336 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
337 		return -EOPNOTSUPP;
338 
339 	if (ext4_fc_is_ineligible(inode->i_sb))
340 		return -EINVAL;
341 
342 	running_txn_tid = sbi->s_journal ?
343 		sbi->s_journal->j_commit_sequence + 1 : 0;
344 
345 	mutex_lock(&ei->i_fc_lock);
346 	if (running_txn_tid == ei->i_sync_tid) {
347 		update = true;
348 	} else {
349 		ext4_fc_reset_inode(inode);
350 		ei->i_sync_tid = running_txn_tid;
351 	}
352 	ret = __fc_track_fn(inode, args, update);
353 	mutex_unlock(&ei->i_fc_lock);
354 
355 	if (!enqueue)
356 		return ret;
357 
358 	spin_lock(&sbi->s_fc_lock);
359 	if (list_empty(&EXT4_I(inode)->i_fc_list))
360 		list_add_tail(&EXT4_I(inode)->i_fc_list,
361 				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
362 				&sbi->s_fc_q[FC_Q_STAGING] :
363 				&sbi->s_fc_q[FC_Q_MAIN]);
364 	spin_unlock(&sbi->s_fc_lock);
365 
366 	return ret;
367 }
368 
/*
 * Argument bundle passed through ext4_fc_track_template()'s void pointer
 * to __track_dentry_update().
 */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being changed */
	int op;			/* EXT4_FC_TAG_{UNLINK,LINK,CREAT} */
};
373 
374 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
375 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
376 {
377 	struct ext4_fc_dentry_update *node;
378 	struct ext4_inode_info *ei = EXT4_I(inode);
379 	struct __track_dentry_update_args *dentry_update =
380 		(struct __track_dentry_update_args *)arg;
381 	struct dentry *dentry = dentry_update->dentry;
382 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
383 
384 	mutex_unlock(&ei->i_fc_lock);
385 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
386 	if (!node) {
387 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
388 		mutex_lock(&ei->i_fc_lock);
389 		return -ENOMEM;
390 	}
391 
392 	node->fcd_op = dentry_update->op;
393 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
394 	node->fcd_ino = inode->i_ino;
395 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
396 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
397 		if (!node->fcd_name.name) {
398 			kmem_cache_free(ext4_fc_dentry_cachep, node);
399 			ext4_fc_mark_ineligible(inode->i_sb,
400 				EXT4_FC_REASON_MEM);
401 			mutex_lock(&ei->i_fc_lock);
402 			return -ENOMEM;
403 		}
404 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
405 			dentry->d_name.len);
406 	} else {
407 		memcpy(node->fcd_iname, dentry->d_name.name,
408 			dentry->d_name.len);
409 		node->fcd_name.name = node->fcd_iname;
410 	}
411 	node->fcd_name.len = dentry->d_name.len;
412 
413 	spin_lock(&sbi->s_fc_lock);
414 	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
415 		list_add_tail(&node->fcd_list,
416 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
417 	else
418 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
419 	spin_unlock(&sbi->s_fc_lock);
420 	mutex_lock(&ei->i_fc_lock);
421 
422 	return 0;
423 }
424 
425 void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
426 {
427 	struct __track_dentry_update_args args;
428 	int ret;
429 
430 	args.dentry = dentry;
431 	args.op = EXT4_FC_TAG_UNLINK;
432 
433 	ret = ext4_fc_track_template(inode, __track_dentry_update,
434 					(void *)&args, 0);
435 	trace_ext4_fc_track_unlink(inode, dentry, ret);
436 }
437 
438 void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
439 {
440 	struct __track_dentry_update_args args;
441 	int ret;
442 
443 	args.dentry = dentry;
444 	args.op = EXT4_FC_TAG_LINK;
445 
446 	ret = ext4_fc_track_template(inode, __track_dentry_update,
447 					(void *)&args, 0);
448 	trace_ext4_fc_track_link(inode, dentry, ret);
449 }
450 
451 void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
452 {
453 	struct __track_dentry_update_args args;
454 	int ret;
455 
456 	args.dentry = dentry;
457 	args.op = EXT4_FC_TAG_CREAT;
458 
459 	ret = ext4_fc_track_template(inode, __track_dentry_update,
460 					(void *)&args, 0);
461 	trace_ext4_fc_track_create(inode, dentry, ret);
462 }
463 
464 /* __track_fn for inode tracking */
465 static int __track_inode(struct inode *inode, void *arg, bool update)
466 {
467 	if (update)
468 		return -EEXIST;
469 
470 	EXT4_I(inode)->i_fc_lblk_len = 0;
471 
472 	return 0;
473 }
474 
475 void ext4_fc_track_inode(struct inode *inode)
476 {
477 	int ret;
478 
479 	if (S_ISDIR(inode->i_mode))
480 		return;
481 
482 	ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
483 	trace_ext4_fc_track_inode(inode, ret);
484 }
485 
486 struct __track_range_args {
487 	ext4_lblk_t start, end;
488 };
489 
490 /* __track_fn for tracking data updates */
491 static int __track_range(struct inode *inode, void *arg, bool update)
492 {
493 	struct ext4_inode_info *ei = EXT4_I(inode);
494 	ext4_lblk_t oldstart;
495 	struct __track_range_args *__arg =
496 		(struct __track_range_args *)arg;
497 
498 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
499 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
500 		return -ECANCELED;
501 	}
502 
503 	oldstart = ei->i_fc_lblk_start;
504 
505 	if (update && ei->i_fc_lblk_len > 0) {
506 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
507 		ei->i_fc_lblk_len =
508 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
509 				ei->i_fc_lblk_start + 1;
510 	} else {
511 		ei->i_fc_lblk_start = __arg->start;
512 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
513 	}
514 
515 	return 0;
516 }
517 
518 void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
519 			 ext4_lblk_t end)
520 {
521 	struct __track_range_args args;
522 	int ret;
523 
524 	if (S_ISDIR(inode->i_mode))
525 		return;
526 
527 	args.start = start;
528 	args.end = end;
529 
530 	ret = ext4_fc_track_template(inode,  __track_range, &args, 1);
531 
532 	trace_ext4_fc_track_range(inode, start, end, ret);
533 }
534 
535 static void ext4_fc_submit_bh(struct super_block *sb)
536 {
537 	int write_flags = REQ_SYNC;
538 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
539 
540 	if (test_opt(sb, BARRIER))
541 		write_flags |= REQ_FUA | REQ_PREFLUSH;
542 	lock_buffer(bh);
543 	clear_buffer_dirty(bh);
544 	set_buffer_uptodate(bh);
545 	bh->b_end_io = ext4_end_buffer_io_sync;
546 	submit_bh(REQ_OP_WRITE, write_flags, bh);
547 	EXT4_SB(sb)->s_fc_bh = NULL;
548 }
549 
550 /* Ext4 commit path routines */
551 
552 /* memzero and update CRC */
553 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
554 				u32 *crc)
555 {
556 	void *ret;
557 
558 	ret = memset(dst, 0, len);
559 	if (crc)
560 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
561 	return ret;
562 }
563 
564 /*
565  * Allocate len bytes on a fast commit buffer.
566  *
567  * During the commit time this function is used to manage fast commit
568  * block space. We don't split a fast commit log onto different
569  * blocks. So this function makes sure that if there's not enough space
570  * on the current block, the remaining space in the current block is
571  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
572  * new block is from jbd2 and CRC is updated to reflect the padding
573  * we added.
574  */
575 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
576 {
577 	struct ext4_fc_tl *tl;
578 	struct ext4_sb_info *sbi = EXT4_SB(sb);
579 	struct buffer_head *bh;
580 	int bsize = sbi->s_journal->j_blocksize;
581 	int ret, off = sbi->s_fc_bytes % bsize;
582 	int pad_len;
583 
584 	/*
585 	 * After allocating len, we should have space at least for a 0 byte
586 	 * padding.
587 	 */
588 	if (len + sizeof(struct ext4_fc_tl) > bsize)
589 		return NULL;
590 
591 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
592 		/*
593 		 * Only allocate from current buffer if we have enough space for
594 		 * this request AND we have space to add a zero byte padding.
595 		 */
596 		if (!sbi->s_fc_bh) {
597 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
598 			if (ret)
599 				return NULL;
600 			sbi->s_fc_bh = bh;
601 		}
602 		sbi->s_fc_bytes += len;
603 		return sbi->s_fc_bh->b_data + off;
604 	}
605 	/* Need to add PAD tag */
606 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
607 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
608 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
609 	tl->fc_len = cpu_to_le16(pad_len);
610 	if (crc)
611 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
612 	if (pad_len > 0)
613 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
614 	ext4_fc_submit_bh(sb);
615 
616 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
617 	if (ret)
618 		return NULL;
619 	sbi->s_fc_bh = bh;
620 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
621 	return sbi->s_fc_bh->b_data;
622 }
623 
624 /* memcpy to fc reserved space and update CRC */
625 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
626 				int len, u32 *crc)
627 {
628 	if (crc)
629 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
630 	return memcpy(dst, src, len);
631 }
632 
633 /*
634  * Complete a fast commit by writing tail tag.
635  *
636  * Writing tail tag marks the end of a fast commit. In order to guarantee
637  * atomicity, after writing tail tag, even if there's space remaining
638  * in the block, next commit shouldn't use it. That's why tail tag
639  * has the length as that of the remaining space on the block.
640  */
641 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
642 {
643 	struct ext4_sb_info *sbi = EXT4_SB(sb);
644 	struct ext4_fc_tl tl;
645 	struct ext4_fc_tail tail;
646 	int off, bsize = sbi->s_journal->j_blocksize;
647 	u8 *dst;
648 
649 	/*
650 	 * ext4_fc_reserve_space takes care of allocating an extra block if
651 	 * there's no enough space on this block for accommodating this tail.
652 	 */
653 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
654 	if (!dst)
655 		return -ENOSPC;
656 
657 	off = sbi->s_fc_bytes % bsize;
658 
659 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
660 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
661 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
662 
663 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
664 	dst += sizeof(tl);
665 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
666 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
667 	dst += sizeof(tail.fc_tid);
668 	tail.fc_crc = cpu_to_le32(crc);
669 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
670 
671 	ext4_fc_submit_bh(sb);
672 
673 	return 0;
674 }
675 
676 /*
677  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
678  * Returns false if there's not enough space.
679  */
680 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
681 			   u32 *crc)
682 {
683 	struct ext4_fc_tl tl;
684 	u8 *dst;
685 
686 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
687 	if (!dst)
688 		return false;
689 
690 	tl.fc_tag = cpu_to_le16(tag);
691 	tl.fc_len = cpu_to_le16(len);
692 
693 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
694 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
695 
696 	return true;
697 }
698 
699 /* Same as above, but adds dentry tlv. */
700 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
701 					int parent_ino, int ino, int dlen,
702 					const unsigned char *dname,
703 					u32 *crc)
704 {
705 	struct ext4_fc_dentry_info fcd;
706 	struct ext4_fc_tl tl;
707 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
708 					crc);
709 
710 	if (!dst)
711 		return false;
712 
713 	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
714 	fcd.fc_ino = cpu_to_le32(ino);
715 	tl.fc_tag = cpu_to_le16(tag);
716 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
717 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
718 	dst += sizeof(tl);
719 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
720 	dst += sizeof(fcd);
721 	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
722 	dst += dlen;
723 
724 	return true;
725 }
726 
727 /*
728  * Writes inode in the fast commit space under TLV with tag @tag.
729  * Returns 0 on success, error on failure.
730  */
731 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
732 {
733 	struct ext4_inode_info *ei = EXT4_I(inode);
734 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
735 	int ret;
736 	struct ext4_iloc iloc;
737 	struct ext4_fc_inode fc_inode;
738 	struct ext4_fc_tl tl;
739 	u8 *dst;
740 
741 	ret = ext4_get_inode_loc(inode, &iloc);
742 	if (ret)
743 		return ret;
744 
745 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
746 		inode_len += ei->i_extra_isize;
747 
748 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
749 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
750 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
751 
752 	dst = ext4_fc_reserve_space(inode->i_sb,
753 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
754 	if (!dst)
755 		return -ECANCELED;
756 
757 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
758 		return -ECANCELED;
759 	dst += sizeof(tl);
760 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
761 		return -ECANCELED;
762 	dst += sizeof(fc_inode);
763 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
764 					inode_len, crc))
765 		return -ECANCELED;
766 
767 	return 0;
768 }
769 
770 /*
771  * Writes updated data ranges for the inode in question. Updates CRC.
772  * Returns 0 on success, error otherwise.
773  */
774 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
775 {
776 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
777 	struct ext4_inode_info *ei = EXT4_I(inode);
778 	struct ext4_map_blocks map;
779 	struct ext4_fc_add_range fc_ext;
780 	struct ext4_fc_del_range lrange;
781 	struct ext4_extent *ex;
782 	int ret;
783 
784 	mutex_lock(&ei->i_fc_lock);
785 	if (ei->i_fc_lblk_len == 0) {
786 		mutex_unlock(&ei->i_fc_lock);
787 		return 0;
788 	}
789 	old_blk_size = ei->i_fc_lblk_start;
790 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
791 	ei->i_fc_lblk_len = 0;
792 	mutex_unlock(&ei->i_fc_lock);
793 
794 	cur_lblk_off = old_blk_size;
795 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
796 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
797 
798 	while (cur_lblk_off <= new_blk_size) {
799 		map.m_lblk = cur_lblk_off;
800 		map.m_len = new_blk_size - cur_lblk_off + 1;
801 		ret = ext4_map_blocks(NULL, inode, &map, 0);
802 		if (ret < 0)
803 			return -ECANCELED;
804 
805 		if (map.m_len == 0) {
806 			cur_lblk_off++;
807 			continue;
808 		}
809 
810 		if (ret == 0) {
811 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
812 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
813 			lrange.fc_len = cpu_to_le32(map.m_len);
814 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
815 					    sizeof(lrange), (u8 *)&lrange, crc))
816 				return -ENOSPC;
817 		} else {
818 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
819 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
820 			ex->ee_block = cpu_to_le32(map.m_lblk);
821 			ex->ee_len = cpu_to_le16(map.m_len);
822 			ext4_ext_store_pblock(ex, map.m_pblk);
823 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
824 				ext4_ext_mark_unwritten(ex);
825 			else
826 				ext4_ext_mark_initialized(ex);
827 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
828 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
829 				return -ENOSPC;
830 		}
831 
832 		cur_lblk_off += map.m_len;
833 	}
834 
835 	return 0;
836 }
837 
838 
839 /* Submit data for all the fast commit inodes */
840 static int ext4_fc_submit_inode_data_all(journal_t *journal)
841 {
842 	struct super_block *sb = (struct super_block *)(journal->j_private);
843 	struct ext4_sb_info *sbi = EXT4_SB(sb);
844 	struct ext4_inode_info *ei;
845 	struct list_head *pos;
846 	int ret = 0;
847 
848 	spin_lock(&sbi->s_fc_lock);
849 	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
850 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
851 		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
852 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
853 		while (atomic_read(&ei->i_fc_updates)) {
854 			DEFINE_WAIT(wait);
855 
856 			prepare_to_wait(&ei->i_fc_wait, &wait,
857 						TASK_UNINTERRUPTIBLE);
858 			if (atomic_read(&ei->i_fc_updates)) {
859 				spin_unlock(&sbi->s_fc_lock);
860 				schedule();
861 				spin_lock(&sbi->s_fc_lock);
862 			}
863 			finish_wait(&ei->i_fc_wait, &wait);
864 		}
865 		spin_unlock(&sbi->s_fc_lock);
866 		ret = jbd2_submit_inode_data(ei->jinode);
867 		if (ret)
868 			return ret;
869 		spin_lock(&sbi->s_fc_lock);
870 	}
871 	spin_unlock(&sbi->s_fc_lock);
872 
873 	return ret;
874 }
875 
876 /* Wait for completion of data for all the fast commit inodes */
877 static int ext4_fc_wait_inode_data_all(journal_t *journal)
878 {
879 	struct super_block *sb = (struct super_block *)(journal->j_private);
880 	struct ext4_sb_info *sbi = EXT4_SB(sb);
881 	struct ext4_inode_info *pos, *n;
882 	int ret = 0;
883 
884 	spin_lock(&sbi->s_fc_lock);
885 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
886 		if (!ext4_test_inode_state(&pos->vfs_inode,
887 					   EXT4_STATE_FC_COMMITTING))
888 			continue;
889 		spin_unlock(&sbi->s_fc_lock);
890 
891 		ret = jbd2_wait_inode_data(journal, pos->jinode);
892 		if (ret)
893 			return ret;
894 		spin_lock(&sbi->s_fc_lock);
895 	}
896 	spin_unlock(&sbi->s_fc_lock);
897 
898 	return 0;
899 }
900 
901 /* Commit all the directory entry updates */
902 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
903 {
904 	struct super_block *sb = (struct super_block *)(journal->j_private);
905 	struct ext4_sb_info *sbi = EXT4_SB(sb);
906 	struct ext4_fc_dentry_update *fc_dentry;
907 	struct inode *inode;
908 	struct list_head *pos, *n, *fcd_pos, *fcd_n;
909 	struct ext4_inode_info *ei;
910 	int ret;
911 
912 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
913 		return 0;
914 	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
915 		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
916 					fcd_list);
917 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
918 			spin_unlock(&sbi->s_fc_lock);
919 			if (!ext4_fc_add_dentry_tlv(
920 				sb, fc_dentry->fcd_op,
921 				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
922 				fc_dentry->fcd_name.len,
923 				fc_dentry->fcd_name.name, crc)) {
924 				ret = -ENOSPC;
925 				goto lock_and_exit;
926 			}
927 			spin_lock(&sbi->s_fc_lock);
928 			continue;
929 		}
930 
931 		inode = NULL;
932 		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
933 			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
934 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
935 				inode = &ei->vfs_inode;
936 				break;
937 			}
938 		}
939 		/*
940 		 * If we don't find inode in our list, then it was deleted,
941 		 * in which case, we don't need to record it's create tag.
942 		 */
943 		if (!inode)
944 			continue;
945 		spin_unlock(&sbi->s_fc_lock);
946 
947 		/*
948 		 * We first write the inode and then the create dirent. This
949 		 * allows the recovery code to create an unnamed inode first
950 		 * and then link it to a directory entry. This allows us
951 		 * to use namei.c routines almost as is and simplifies
952 		 * the recovery code.
953 		 */
954 		ret = ext4_fc_write_inode(inode, crc);
955 		if (ret)
956 			goto lock_and_exit;
957 
958 		ret = ext4_fc_write_inode_data(inode, crc);
959 		if (ret)
960 			goto lock_and_exit;
961 
962 		if (!ext4_fc_add_dentry_tlv(
963 			sb, fc_dentry->fcd_op,
964 			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
965 			fc_dentry->fcd_name.len,
966 			fc_dentry->fcd_name.name, crc)) {
967 			ret = -ENOSPC;
968 			goto lock_and_exit;
969 		}
970 
971 		spin_lock(&sbi->s_fc_lock);
972 	}
973 	return 0;
974 lock_and_exit:
975 	spin_lock(&sbi->s_fc_lock);
976 	return ret;
977 }
978 
979 static int ext4_fc_perform_commit(journal_t *journal)
980 {
981 	struct super_block *sb = (struct super_block *)(journal->j_private);
982 	struct ext4_sb_info *sbi = EXT4_SB(sb);
983 	struct ext4_inode_info *iter;
984 	struct ext4_fc_head head;
985 	struct list_head *pos;
986 	struct inode *inode;
987 	struct blk_plug plug;
988 	int ret = 0;
989 	u32 crc = 0;
990 
991 	ret = ext4_fc_submit_inode_data_all(journal);
992 	if (ret)
993 		return ret;
994 
995 	ret = ext4_fc_wait_inode_data_all(journal);
996 	if (ret)
997 		return ret;
998 
999 	blk_start_plug(&plug);
1000 	if (sbi->s_fc_bytes == 0) {
1001 		/*
1002 		 * Add a head tag only if this is the first fast commit
1003 		 * in this TID.
1004 		 */
1005 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1006 		head.fc_tid = cpu_to_le32(
1007 			sbi->s_journal->j_running_transaction->t_tid);
1008 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1009 			(u8 *)&head, &crc))
1010 			goto out;
1011 	}
1012 
1013 	spin_lock(&sbi->s_fc_lock);
1014 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1015 	if (ret) {
1016 		spin_unlock(&sbi->s_fc_lock);
1017 		goto out;
1018 	}
1019 
1020 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1021 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1022 		inode = &iter->vfs_inode;
1023 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1024 			continue;
1025 
1026 		spin_unlock(&sbi->s_fc_lock);
1027 		ret = ext4_fc_write_inode_data(inode, &crc);
1028 		if (ret)
1029 			goto out;
1030 		ret = ext4_fc_write_inode(inode, &crc);
1031 		if (ret)
1032 			goto out;
1033 		spin_lock(&sbi->s_fc_lock);
1034 		EXT4_I(inode)->i_fc_committed_subtid =
1035 			atomic_read(&sbi->s_fc_subtid);
1036 	}
1037 	spin_unlock(&sbi->s_fc_lock);
1038 
1039 	ret = ext4_fc_write_tail(sb, crc);
1040 
1041 out:
1042 	blk_finish_plug(&plug);
1043 	return ret;
1044 }
1045 
1046 /*
1047  * The main commit entry point. Performs a fast commit for transaction
1048  * commit_tid if needed. If it's not possible to perform a fast commit
1049  * due to various reasons, we fall back to full commit. Returns 0
1050  * on success, error otherwise.
1051  */
1052 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1053 {
1054 	struct super_block *sb = (struct super_block *)(journal->j_private);
1055 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1056 	int nblks = 0, ret, bsize = journal->j_blocksize;
1057 	int subtid = atomic_read(&sbi->s_fc_subtid);
1058 	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1059 	ktime_t start_time, commit_time;
1060 
1061 	trace_ext4_fc_commit_start(sb);
1062 
1063 	start_time = ktime_get();
1064 
1065 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1066 		(ext4_fc_is_ineligible(sb))) {
1067 		reason = EXT4_FC_REASON_INELIGIBLE;
1068 		goto out;
1069 	}
1070 
1071 restart_fc:
1072 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1073 	if (ret == -EALREADY) {
1074 		/* There was an ongoing commit, check if we need to restart */
1075 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1076 			commit_tid > journal->j_commit_sequence)
1077 			goto restart_fc;
1078 		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1079 		goto out;
1080 	} else if (ret) {
1081 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1082 		reason = EXT4_FC_REASON_FC_START_FAILED;
1083 		goto out;
1084 	}
1085 
1086 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1087 	ret = ext4_fc_perform_commit(journal);
1088 	if (ret < 0) {
1089 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1090 		reason = EXT4_FC_REASON_FC_FAILED;
1091 		goto out;
1092 	}
1093 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1094 	ret = jbd2_fc_wait_bufs(journal, nblks);
1095 	if (ret < 0) {
1096 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1097 		reason = EXT4_FC_REASON_FC_FAILED;
1098 		goto out;
1099 	}
1100 	atomic_inc(&sbi->s_fc_subtid);
1101 	jbd2_fc_end_commit(journal);
1102 out:
1103 	/* Has any ineligible update happened since we started? */
1104 	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1105 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1106 		reason = EXT4_FC_REASON_INELIGIBLE;
1107 	}
1108 
1109 	spin_lock(&sbi->s_fc_lock);
1110 	if (reason != EXT4_FC_REASON_OK &&
1111 		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1112 		sbi->s_fc_stats.fc_ineligible_commits++;
1113 	} else {
1114 		sbi->s_fc_stats.fc_num_commits++;
1115 		sbi->s_fc_stats.fc_numblks += nblks;
1116 	}
1117 	spin_unlock(&sbi->s_fc_lock);
1118 	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1119 	trace_ext4_fc_commit_stop(sb, nblks, reason);
1120 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1121 	/*
1122 	 * weight the commit time higher than the average time so we don't
1123 	 * react too strongly to vast changes in the commit time
1124 	 */
1125 	if (likely(sbi->s_fc_avg_commit_time))
1126 		sbi->s_fc_avg_commit_time = (commit_time +
1127 				sbi->s_fc_avg_commit_time * 3) / 4;
1128 	else
1129 		sbi->s_fc_avg_commit_time = commit_time;
1130 	jbd_debug(1,
1131 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1132 		nblks, reason, subtid);
1133 	if (reason == EXT4_FC_REASON_FC_FAILED)
1134 		return jbd2_fc_end_commit_fallback(journal, commit_tid);
1135 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1136 		reason == EXT4_FC_REASON_INELIGIBLE)
1137 		return jbd2_complete_transaction(journal, commit_tid);
1138 	return 0;
1139 }
1140 
1141 /*
1142  * Fast commit cleanup routine. This is called after every fast commit and
1143  * full commit. full is true if we are called after a full commit.
1144  */
1145 static void ext4_fc_cleanup(journal_t *journal, int full)
1146 {
1147 	struct super_block *sb = journal->j_private;
1148 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1149 	struct ext4_inode_info *iter;
1150 	struct ext4_fc_dentry_update *fc_dentry;
1151 	struct list_head *pos, *n;
1152 
1153 	if (full && sbi->s_fc_bh)
1154 		sbi->s_fc_bh = NULL;
1155 
1156 	jbd2_fc_release_bufs(journal);
1157 
1158 	spin_lock(&sbi->s_fc_lock);
1159 	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1160 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1161 		list_del_init(&iter->i_fc_list);
1162 		ext4_clear_inode_state(&iter->vfs_inode,
1163 				       EXT4_STATE_FC_COMMITTING);
1164 		ext4_fc_reset_inode(&iter->vfs_inode);
1165 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1166 		smp_mb();
1167 #if (BITS_PER_LONG < 64)
1168 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1169 #else
1170 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1171 #endif
1172 	}
1173 
1174 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1175 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1176 					     struct ext4_fc_dentry_update,
1177 					     fcd_list);
1178 		list_del_init(&fc_dentry->fcd_list);
1179 		spin_unlock(&sbi->s_fc_lock);
1180 
1181 		if (fc_dentry->fcd_name.name &&
1182 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1183 			kfree(fc_dentry->fcd_name.name);
1184 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1185 		spin_lock(&sbi->s_fc_lock);
1186 	}
1187 
1188 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1189 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1190 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1191 				&sbi->s_fc_q[FC_Q_STAGING]);
1192 
1193 	sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1194 	sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1195 
1196 	if (full)
1197 		sbi->s_fc_bytes = 0;
1198 	spin_unlock(&sbi->s_fc_lock);
1199 	trace_ext4_fc_stats(sb);
1200 }
1201 
1202 /* Ext4 Replay Path Routines */
1203 
1204 /* Get length of a particular tlv */
1205 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1206 {
1207 	return le16_to_cpu(tl->fc_len);
1208 }
1209 
1210 /* Get a pointer to "value" of a tlv */
1211 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1212 {
1213 	return (u8 *)tl + sizeof(*tl);
1214 }
1215 
/* Decoded dentry tag, passed around by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not filled in by tl_to_darg() */
	char *dname;		/* entry name, points into the tag's value */
};
1221 
1222 static inline void tl_to_darg(struct dentry_info_args *darg,
1223 				struct  ext4_fc_tl *tl)
1224 {
1225 	struct ext4_fc_dentry_info *fcd;
1226 
1227 	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1228 
1229 	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1230 	darg->ino = le32_to_cpu(fcd->fc_ino);
1231 	darg->dname = fcd->fc_dname;
1232 	darg->dname_len = ext4_fc_tag_len(tl) -
1233 			sizeof(struct ext4_fc_dentry_info);
1234 }
1235 
1236 /* Unlink replay function */
1237 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1238 {
1239 	struct inode *inode, *old_parent;
1240 	struct qstr entry;
1241 	struct dentry_info_args darg;
1242 	int ret = 0;
1243 
1244 	tl_to_darg(&darg, tl);
1245 
1246 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1247 			darg.parent_ino, darg.dname_len);
1248 
1249 	entry.name = darg.dname;
1250 	entry.len = darg.dname_len;
1251 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1252 
1253 	if (IS_ERR_OR_NULL(inode)) {
1254 		jbd_debug(1, "Inode %d not found", darg.ino);
1255 		return 0;
1256 	}
1257 
1258 	old_parent = ext4_iget(sb, darg.parent_ino,
1259 				EXT4_IGET_NORMAL);
1260 	if (IS_ERR_OR_NULL(old_parent)) {
1261 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1262 		iput(inode);
1263 		return 0;
1264 	}
1265 
1266 	ret = __ext4_unlink(old_parent, &entry, inode);
1267 	/* -ENOENT ok coz it might not exist anymore. */
1268 	if (ret == -ENOENT)
1269 		ret = 0;
1270 	iput(old_parent);
1271 	iput(inode);
1272 	return ret;
1273 }
1274 
1275 static int ext4_fc_replay_link_internal(struct super_block *sb,
1276 				struct dentry_info_args *darg,
1277 				struct inode *inode)
1278 {
1279 	struct inode *dir = NULL;
1280 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1281 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1282 	int ret = 0;
1283 
1284 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1285 	if (IS_ERR(dir)) {
1286 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1287 		dir = NULL;
1288 		goto out;
1289 	}
1290 
1291 	dentry_dir = d_obtain_alias(dir);
1292 	if (IS_ERR(dentry_dir)) {
1293 		jbd_debug(1, "Failed to obtain dentry");
1294 		dentry_dir = NULL;
1295 		goto out;
1296 	}
1297 
1298 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1299 	if (!dentry_inode) {
1300 		jbd_debug(1, "Inode dentry not created.");
1301 		ret = -ENOMEM;
1302 		goto out;
1303 	}
1304 
1305 	ret = __ext4_link(dir, inode, dentry_inode);
1306 	/*
1307 	 * It's possible that link already existed since data blocks
1308 	 * for the dir in question got persisted before we crashed OR
1309 	 * we replayed this tag and crashed before the entire replay
1310 	 * could complete.
1311 	 */
1312 	if (ret && ret != -EEXIST) {
1313 		jbd_debug(1, "Failed to link\n");
1314 		goto out;
1315 	}
1316 
1317 	ret = 0;
1318 out:
1319 	if (dentry_dir) {
1320 		d_drop(dentry_dir);
1321 		dput(dentry_dir);
1322 	} else if (dir) {
1323 		iput(dir);
1324 	}
1325 	if (dentry_inode) {
1326 		d_drop(dentry_inode);
1327 		dput(dentry_inode);
1328 	}
1329 
1330 	return ret;
1331 }
1332 
1333 /* Link replay function */
1334 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1335 {
1336 	struct inode *inode;
1337 	struct dentry_info_args darg;
1338 	int ret = 0;
1339 
1340 	tl_to_darg(&darg, tl);
1341 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1342 			darg.parent_ino, darg.dname_len);
1343 
1344 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1345 	if (IS_ERR_OR_NULL(inode)) {
1346 		jbd_debug(1, "Inode not found.");
1347 		return 0;
1348 	}
1349 
1350 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1351 	iput(inode);
1352 	return ret;
1353 }
1354 
1355 /*
1356  * Record all the modified inodes during replay. We use this later to setup
1357  * block bitmaps correctly.
1358  */
1359 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1360 {
1361 	struct ext4_fc_replay_state *state;
1362 	int i;
1363 
1364 	state = &EXT4_SB(sb)->s_fc_replay_state;
1365 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1366 		if (state->fc_modified_inodes[i] == ino)
1367 			return 0;
1368 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1369 		state->fc_modified_inodes_size +=
1370 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1371 		state->fc_modified_inodes = krealloc(
1372 					state->fc_modified_inodes, sizeof(int) *
1373 					state->fc_modified_inodes_size,
1374 					GFP_KERNEL);
1375 		if (!state->fc_modified_inodes)
1376 			return -ENOMEM;
1377 	}
1378 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1379 	return 0;
1380 }
1381 
1382 /*
1383  * Inode replay function
1384  */
1385 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1386 {
1387 	struct ext4_fc_inode *fc_inode;
1388 	struct ext4_inode *raw_inode;
1389 	struct ext4_inode *raw_fc_inode;
1390 	struct inode *inode = NULL;
1391 	struct ext4_iloc iloc;
1392 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1393 	struct ext4_extent_header *eh;
1394 
1395 	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1396 
1397 	ino = le32_to_cpu(fc_inode->fc_ino);
1398 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1399 
1400 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1401 	if (!IS_ERR_OR_NULL(inode)) {
1402 		ext4_ext_clear_bb(inode);
1403 		iput(inode);
1404 	}
1405 
1406 	ext4_fc_record_modified_inode(sb, ino);
1407 
1408 	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1409 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1410 	if (ret)
1411 		goto out;
1412 
1413 	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1414 	raw_inode = ext4_raw_inode(&iloc);
1415 
1416 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1417 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1418 		inode_len - offsetof(struct ext4_inode, i_generation));
1419 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1420 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1421 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1422 			memset(eh, 0, sizeof(*eh));
1423 			eh->eh_magic = EXT4_EXT_MAGIC;
1424 			eh->eh_max = cpu_to_le16(
1425 				(sizeof(raw_inode->i_block) -
1426 				 sizeof(struct ext4_extent_header))
1427 				 / sizeof(struct ext4_extent));
1428 		}
1429 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1430 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1431 			sizeof(raw_inode->i_block));
1432 	}
1433 
1434 	/* Immediately update the inode on disk. */
1435 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1436 	if (ret)
1437 		goto out;
1438 	ret = sync_dirty_buffer(iloc.bh);
1439 	if (ret)
1440 		goto out;
1441 	ret = ext4_mark_inode_used(sb, ino);
1442 	if (ret)
1443 		goto out;
1444 
1445 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1446 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1447 	if (IS_ERR_OR_NULL(inode)) {
1448 		jbd_debug(1, "Inode not found.");
1449 		return -EFSCORRUPTED;
1450 	}
1451 
1452 	/*
1453 	 * Our allocator could have made different decisions than before
1454 	 * crashing. This should be fixed but until then, we calculate
1455 	 * the number of blocks the inode.
1456 	 */
1457 	ext4_ext_replay_set_iblocks(inode);
1458 
1459 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1460 	ext4_reset_inode_seed(inode);
1461 
1462 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1463 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1464 	sync_dirty_buffer(iloc.bh);
1465 	brelse(iloc.bh);
1466 out:
1467 	iput(inode);
1468 	if (!ret)
1469 		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1470 
1471 	return 0;
1472 }
1473 
1474 /*
1475  * Dentry create replay function.
1476  *
1477  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1478  * inode for which we are trying to create a dentry here, should already have
1479  * been replayed before we start here.
1480  */
1481 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1482 {
1483 	int ret = 0;
1484 	struct inode *inode = NULL;
1485 	struct inode *dir = NULL;
1486 	struct dentry_info_args darg;
1487 
1488 	tl_to_darg(&darg, tl);
1489 
1490 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1491 			darg.parent_ino, darg.dname_len);
1492 
1493 	/* This takes care of update group descriptor and other metadata */
1494 	ret = ext4_mark_inode_used(sb, darg.ino);
1495 	if (ret)
1496 		goto out;
1497 
1498 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1499 	if (IS_ERR_OR_NULL(inode)) {
1500 		jbd_debug(1, "inode %d not found.", darg.ino);
1501 		inode = NULL;
1502 		ret = -EINVAL;
1503 		goto out;
1504 	}
1505 
1506 	if (S_ISDIR(inode->i_mode)) {
1507 		/*
1508 		 * If we are creating a directory, we need to make sure that the
1509 		 * dot and dot dot dirents are setup properly.
1510 		 */
1511 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1512 		if (IS_ERR_OR_NULL(dir)) {
1513 			jbd_debug(1, "Dir %d not found.", darg.ino);
1514 			goto out;
1515 		}
1516 		ret = ext4_init_new_dir(NULL, dir, inode);
1517 		iput(dir);
1518 		if (ret) {
1519 			ret = 0;
1520 			goto out;
1521 		}
1522 	}
1523 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1524 	if (ret)
1525 		goto out;
1526 	set_nlink(inode, 1);
1527 	ext4_mark_inode_dirty(NULL, inode);
1528 out:
1529 	if (inode)
1530 		iput(inode);
1531 	return ret;
1532 }
1533 
1534 /*
1535  * Record physical disk regions which are in use as per fast commit area. Our
1536  * simple replay phase allocator excludes these regions from allocation.
1537  */
1538 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1539 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1540 {
1541 	struct ext4_fc_replay_state *state;
1542 	struct ext4_fc_alloc_region *region;
1543 
1544 	state = &EXT4_SB(sb)->s_fc_replay_state;
1545 	if (state->fc_regions_used == state->fc_regions_size) {
1546 		state->fc_regions_size +=
1547 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1548 		state->fc_regions = krealloc(
1549 					state->fc_regions,
1550 					state->fc_regions_size *
1551 					sizeof(struct ext4_fc_alloc_region),
1552 					GFP_KERNEL);
1553 		if (!state->fc_regions)
1554 			return -ENOMEM;
1555 	}
1556 	region = &state->fc_regions[state->fc_regions_used++];
1557 	region->ino = ino;
1558 	region->lblk = lblk;
1559 	region->pblk = pblk;
1560 	region->len = len;
1561 
1562 	return 0;
1563 }
1564 
1565 /* Replay add range tag */
1566 static int ext4_fc_replay_add_range(struct super_block *sb,
1567 				struct ext4_fc_tl *tl)
1568 {
1569 	struct ext4_fc_add_range *fc_add_ex;
1570 	struct ext4_extent newex, *ex;
1571 	struct inode *inode;
1572 	ext4_lblk_t start, cur;
1573 	int remaining, len;
1574 	ext4_fsblk_t start_pblk;
1575 	struct ext4_map_blocks map;
1576 	struct ext4_ext_path *path = NULL;
1577 	int ret;
1578 
1579 	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1580 	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1581 
1582 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1583 		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1584 		ext4_ext_get_actual_len(ex));
1585 
1586 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1587 				EXT4_IGET_NORMAL);
1588 	if (IS_ERR_OR_NULL(inode)) {
1589 		jbd_debug(1, "Inode not found.");
1590 		return 0;
1591 	}
1592 
1593 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1594 
1595 	start = le32_to_cpu(ex->ee_block);
1596 	start_pblk = ext4_ext_pblock(ex);
1597 	len = ext4_ext_get_actual_len(ex);
1598 
1599 	cur = start;
1600 	remaining = len;
1601 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1602 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1603 		  inode->i_ino);
1604 
1605 	while (remaining > 0) {
1606 		map.m_lblk = cur;
1607 		map.m_len = remaining;
1608 		map.m_pblk = 0;
1609 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1610 
1611 		if (ret < 0) {
1612 			iput(inode);
1613 			return 0;
1614 		}
1615 
1616 		if (ret == 0) {
1617 			/* Range is not mapped */
1618 			path = ext4_find_extent(inode, cur, NULL, 0);
1619 			if (IS_ERR(path)) {
1620 				iput(inode);
1621 				return 0;
1622 			}
1623 			memset(&newex, 0, sizeof(newex));
1624 			newex.ee_block = cpu_to_le32(cur);
1625 			ext4_ext_store_pblock(
1626 				&newex, start_pblk + cur - start);
1627 			newex.ee_len = cpu_to_le16(map.m_len);
1628 			if (ext4_ext_is_unwritten(ex))
1629 				ext4_ext_mark_unwritten(&newex);
1630 			down_write(&EXT4_I(inode)->i_data_sem);
1631 			ret = ext4_ext_insert_extent(
1632 				NULL, inode, &path, &newex, 0);
1633 			up_write((&EXT4_I(inode)->i_data_sem));
1634 			ext4_ext_drop_refs(path);
1635 			kfree(path);
1636 			if (ret) {
1637 				iput(inode);
1638 				return 0;
1639 			}
1640 			goto next;
1641 		}
1642 
1643 		if (start_pblk + cur - start != map.m_pblk) {
1644 			/*
1645 			 * Logical to physical mapping changed. This can happen
1646 			 * if this range was removed and then reallocated to
1647 			 * map to new physical blocks during a fast commit.
1648 			 */
1649 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1650 					ext4_ext_is_unwritten(ex),
1651 					start_pblk + cur - start);
1652 			if (ret) {
1653 				iput(inode);
1654 				return 0;
1655 			}
1656 			/*
1657 			 * Mark the old blocks as free since they aren't used
1658 			 * anymore. We maintain an array of all the modified
1659 			 * inodes. In case these blocks are still used at either
1660 			 * a different logical range in the same inode or in
1661 			 * some different inode, we will mark them as allocated
1662 			 * at the end of the FC replay using our array of
1663 			 * modified inodes.
1664 			 */
1665 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1666 			goto next;
1667 		}
1668 
1669 		/* Range is mapped and needs a state change */
1670 		jbd_debug(1, "Converting from %d to %d %lld",
1671 				map.m_flags & EXT4_MAP_UNWRITTEN,
1672 			ext4_ext_is_unwritten(ex), map.m_pblk);
1673 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1674 					ext4_ext_is_unwritten(ex), map.m_pblk);
1675 		if (ret) {
1676 			iput(inode);
1677 			return 0;
1678 		}
1679 		/*
1680 		 * We may have split the extent tree while toggling the state.
1681 		 * Try to shrink the extent tree now.
1682 		 */
1683 		ext4_ext_replay_shrink_inode(inode, start + len);
1684 next:
1685 		cur += map.m_len;
1686 		remaining -= map.m_len;
1687 	}
1688 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1689 					sb->s_blocksize_bits);
1690 	iput(inode);
1691 	return 0;
1692 }
1693 
1694 /* Replay DEL_RANGE tag */
1695 static int
1696 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1697 {
1698 	struct inode *inode;
1699 	struct ext4_fc_del_range *lrange;
1700 	struct ext4_map_blocks map;
1701 	ext4_lblk_t cur, remaining;
1702 	int ret;
1703 
1704 	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1705 	cur = le32_to_cpu(lrange->fc_lblk);
1706 	remaining = le32_to_cpu(lrange->fc_len);
1707 
1708 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1709 		le32_to_cpu(lrange->fc_ino), cur, remaining);
1710 
1711 	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1712 	if (IS_ERR_OR_NULL(inode)) {
1713 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1714 		return 0;
1715 	}
1716 
1717 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1718 
1719 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1720 			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1721 			le32_to_cpu(lrange->fc_len));
1722 	while (remaining > 0) {
1723 		map.m_lblk = cur;
1724 		map.m_len = remaining;
1725 
1726 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1727 		if (ret < 0) {
1728 			iput(inode);
1729 			return 0;
1730 		}
1731 		if (ret > 0) {
1732 			remaining -= ret;
1733 			cur += ret;
1734 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1735 		} else {
1736 			remaining -= map.m_len;
1737 			cur += map.m_len;
1738 		}
1739 	}
1740 
1741 	ret = ext4_punch_hole(inode,
1742 		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1743 		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1744 	if (ret)
1745 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1746 	ext4_ext_replay_shrink_inode(inode,
1747 		i_size_read(inode) >> sb->s_blocksize_bits);
1748 	ext4_mark_inode_dirty(NULL, inode);
1749 	iput(inode);
1750 
1751 	return 0;
1752 }
1753 
1754 static inline const char *tag2str(u16 tag)
1755 {
1756 	switch (tag) {
1757 	case EXT4_FC_TAG_LINK:
1758 		return "TAG_ADD_ENTRY";
1759 	case EXT4_FC_TAG_UNLINK:
1760 		return "TAG_DEL_ENTRY";
1761 	case EXT4_FC_TAG_ADD_RANGE:
1762 		return "TAG_ADD_RANGE";
1763 	case EXT4_FC_TAG_CREAT:
1764 		return "TAG_CREAT_DENTRY";
1765 	case EXT4_FC_TAG_DEL_RANGE:
1766 		return "TAG_DEL_RANGE";
1767 	case EXT4_FC_TAG_INODE:
1768 		return "TAG_INODE";
1769 	case EXT4_FC_TAG_PAD:
1770 		return "TAG_PAD";
1771 	case EXT4_FC_TAG_TAIL:
1772 		return "TAG_TAIL";
1773 	case EXT4_FC_TAG_HEAD:
1774 		return "TAG_HEAD";
1775 	default:
1776 		return "TAG_ERROR";
1777 	}
1778 }
1779 
1780 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1781 {
1782 	struct ext4_fc_replay_state *state;
1783 	struct inode *inode;
1784 	struct ext4_ext_path *path = NULL;
1785 	struct ext4_map_blocks map;
1786 	int i, ret, j;
1787 	ext4_lblk_t cur, end;
1788 
1789 	state = &EXT4_SB(sb)->s_fc_replay_state;
1790 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1791 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1792 			EXT4_IGET_NORMAL);
1793 		if (IS_ERR_OR_NULL(inode)) {
1794 			jbd_debug(1, "Inode %d not found.",
1795 				state->fc_modified_inodes[i]);
1796 			continue;
1797 		}
1798 		cur = 0;
1799 		end = EXT_MAX_BLOCKS;
1800 		while (cur < end) {
1801 			map.m_lblk = cur;
1802 			map.m_len = end - cur;
1803 
1804 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1805 			if (ret < 0)
1806 				break;
1807 
1808 			if (ret > 0) {
1809 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1810 				if (!IS_ERR_OR_NULL(path)) {
1811 					for (j = 0; j < path->p_depth; j++)
1812 						ext4_mb_mark_bb(inode->i_sb,
1813 							path[j].p_block, 1, 1);
1814 					ext4_ext_drop_refs(path);
1815 					kfree(path);
1816 				}
1817 				cur += ret;
1818 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1819 							map.m_len, 1);
1820 			} else {
1821 				cur = cur + (map.m_len ? map.m_len : 1);
1822 			}
1823 		}
1824 		iput(inode);
1825 	}
1826 }
1827 
1828 /*
1829  * Check if block is in excluded regions for block allocation. The simple
1830  * allocator that runs during replay phase is calls this function to see
1831  * if it is okay to use a block.
1832  */
1833 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1834 {
1835 	int i;
1836 	struct ext4_fc_replay_state *state;
1837 
1838 	state = &EXT4_SB(sb)->s_fc_replay_state;
1839 	for (i = 0; i < state->fc_regions_valid; i++) {
1840 		if (state->fc_regions[i].ino == 0 ||
1841 			state->fc_regions[i].len == 0)
1842 			continue;
1843 		if (blk >= state->fc_regions[i].pblk &&
1844 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1845 			return true;
1846 	}
1847 	return false;
1848 }
1849 
1850 /* Cleanup function called after replay */
1851 void ext4_fc_replay_cleanup(struct super_block *sb)
1852 {
1853 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1854 
1855 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1856 	kfree(sbi->s_fc_replay_state.fc_regions);
1857 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1858 }
1859 
1860 /*
1861  * Recovery Scan phase handler
1862  *
1863  * This function is called during the scan phase and is responsible
1864  * for doing following things:
1865  * - Make sure the fast commit area has valid tags for replay
1866  * - Count number of tags that need to be replayed by the replay handler
1867  * - Verify CRC
1868  * - Create a list of excluded blocks for allocation during replay phase
1869  *
1870  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1871  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1872  * to indicate that scan has finished and JBD2 can now start replay phase.
1873  * It returns a negative error to indicate that there was an error. At the end
1874  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1875  * to indicate the number of tags that need to replayed during the replay phase.
1876  */
1877 static int ext4_fc_replay_scan(journal_t *journal,
1878 				struct buffer_head *bh, int off,
1879 				tid_t expected_tid)
1880 {
1881 	struct super_block *sb = journal->j_private;
1882 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1883 	struct ext4_fc_replay_state *state;
1884 	int ret = JBD2_FC_REPLAY_CONTINUE;
1885 	struct ext4_fc_add_range *ext;
1886 	struct ext4_fc_tl *tl;
1887 	struct ext4_fc_tail *tail;
1888 	__u8 *start, *end;
1889 	struct ext4_fc_head *head;
1890 	struct ext4_extent *ex;
1891 
1892 	state = &sbi->s_fc_replay_state;
1893 
1894 	start = (u8 *)bh->b_data;
1895 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1896 
1897 	if (state->fc_replay_expected_off == 0) {
1898 		state->fc_cur_tag = 0;
1899 		state->fc_replay_num_tags = 0;
1900 		state->fc_crc = 0;
1901 		state->fc_regions = NULL;
1902 		state->fc_regions_valid = state->fc_regions_used =
1903 			state->fc_regions_size = 0;
1904 		/* Check if we can stop early */
1905 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1906 			!= EXT4_FC_TAG_HEAD)
1907 			return 0;
1908 	}
1909 
1910 	if (off != state->fc_replay_expected_off) {
1911 		ret = -EFSCORRUPTED;
1912 		goto out_err;
1913 	}
1914 
1915 	state->fc_replay_expected_off++;
1916 	fc_for_each_tl(start, end, tl) {
1917 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1918 			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1919 		switch (le16_to_cpu(tl->fc_tag)) {
1920 		case EXT4_FC_TAG_ADD_RANGE:
1921 			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1922 			ex = (struct ext4_extent *)&ext->fc_ex;
1923 			ret = ext4_fc_record_regions(sb,
1924 				le32_to_cpu(ext->fc_ino),
1925 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1926 				ext4_ext_get_actual_len(ex));
1927 			if (ret < 0)
1928 				break;
1929 			ret = JBD2_FC_REPLAY_CONTINUE;
1930 			fallthrough;
1931 		case EXT4_FC_TAG_DEL_RANGE:
1932 		case EXT4_FC_TAG_LINK:
1933 		case EXT4_FC_TAG_UNLINK:
1934 		case EXT4_FC_TAG_CREAT:
1935 		case EXT4_FC_TAG_INODE:
1936 		case EXT4_FC_TAG_PAD:
1937 			state->fc_cur_tag++;
1938 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1939 					sizeof(*tl) + ext4_fc_tag_len(tl));
1940 			break;
1941 		case EXT4_FC_TAG_TAIL:
1942 			state->fc_cur_tag++;
1943 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1944 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1945 						sizeof(*tl) +
1946 						offsetof(struct ext4_fc_tail,
1947 						fc_crc));
1948 			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1949 				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1950 				state->fc_replay_num_tags = state->fc_cur_tag;
1951 				state->fc_regions_valid =
1952 					state->fc_regions_used;
1953 			} else {
1954 				ret = state->fc_replay_num_tags ?
1955 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1956 			}
1957 			state->fc_crc = 0;
1958 			break;
1959 		case EXT4_FC_TAG_HEAD:
1960 			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1961 			if (le32_to_cpu(head->fc_features) &
1962 				~EXT4_FC_SUPPORTED_FEATURES) {
1963 				ret = -EOPNOTSUPP;
1964 				break;
1965 			}
1966 			if (le32_to_cpu(head->fc_tid) != expected_tid) {
1967 				ret = JBD2_FC_REPLAY_STOP;
1968 				break;
1969 			}
1970 			state->fc_cur_tag++;
1971 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1972 					sizeof(*tl) + ext4_fc_tag_len(tl));
1973 			break;
1974 		default:
1975 			ret = state->fc_replay_num_tags ?
1976 				JBD2_FC_REPLAY_STOP : -ECANCELED;
1977 		}
1978 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1979 			break;
1980 	}
1981 
1982 out_err:
1983 	trace_ext4_fc_replay_scan(sb, ret, off);
1984 	return ret;
1985 }
1986 
1987 /*
1988  * Main recovery path entry point.
1989  * The meaning of return codes is similar as above.
1990  */
1991 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
1992 				enum passtype pass, int off, tid_t expected_tid)
1993 {
1994 	struct super_block *sb = journal->j_private;
1995 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1996 	struct ext4_fc_tl *tl;
1997 	__u8 *start, *end;
1998 	int ret = JBD2_FC_REPLAY_CONTINUE;
1999 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2000 	struct ext4_fc_tail *tail;
2001 
2002 	if (pass == PASS_SCAN) {
2003 		state->fc_current_pass = PASS_SCAN;
2004 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2005 	}
2006 
2007 	if (state->fc_current_pass != pass) {
2008 		state->fc_current_pass = pass;
2009 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2010 	}
2011 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2012 		jbd_debug(1, "Replay stops\n");
2013 		ext4_fc_set_bitmaps_and_counters(sb);
2014 		return 0;
2015 	}
2016 
2017 #ifdef CONFIG_EXT4_DEBUG
2018 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2019 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2020 		return JBD2_FC_REPLAY_STOP;
2021 	}
2022 #endif
2023 
2024 	start = (u8 *)bh->b_data;
2025 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2026 
2027 	fc_for_each_tl(start, end, tl) {
2028 		if (state->fc_replay_num_tags == 0) {
2029 			ret = JBD2_FC_REPLAY_STOP;
2030 			ext4_fc_set_bitmaps_and_counters(sb);
2031 			break;
2032 		}
2033 		jbd_debug(3, "Replay phase, tag:%s\n",
2034 				tag2str(le16_to_cpu(tl->fc_tag)));
2035 		state->fc_replay_num_tags--;
2036 		switch (le16_to_cpu(tl->fc_tag)) {
2037 		case EXT4_FC_TAG_LINK:
2038 			ret = ext4_fc_replay_link(sb, tl);
2039 			break;
2040 		case EXT4_FC_TAG_UNLINK:
2041 			ret = ext4_fc_replay_unlink(sb, tl);
2042 			break;
2043 		case EXT4_FC_TAG_ADD_RANGE:
2044 			ret = ext4_fc_replay_add_range(sb, tl);
2045 			break;
2046 		case EXT4_FC_TAG_CREAT:
2047 			ret = ext4_fc_replay_create(sb, tl);
2048 			break;
2049 		case EXT4_FC_TAG_DEL_RANGE:
2050 			ret = ext4_fc_replay_del_range(sb, tl);
2051 			break;
2052 		case EXT4_FC_TAG_INODE:
2053 			ret = ext4_fc_replay_inode(sb, tl);
2054 			break;
2055 		case EXT4_FC_TAG_PAD:
2056 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2057 				ext4_fc_tag_len(tl), 0);
2058 			break;
2059 		case EXT4_FC_TAG_TAIL:
2060 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2061 				ext4_fc_tag_len(tl), 0);
2062 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2063 			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2064 			break;
2065 		case EXT4_FC_TAG_HEAD:
2066 			break;
2067 		default:
2068 			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2069 				ext4_fc_tag_len(tl), 0);
2070 			ret = -ECANCELED;
2071 			break;
2072 		}
2073 		if (ret < 0)
2074 			break;
2075 		ret = JBD2_FC_REPLAY_CONTINUE;
2076 	}
2077 	return ret;
2078 }
2079 
2080 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2081 {
2082 	int num_fc_blocks;
2083 
2084 	/*
2085 	 * We set replay callback even if fast commit disabled because we may
2086 	 * could still have fast commit blocks that need to be replayed even if
2087 	 * fast commit has now been turned off.
2088 	 */
2089 	journal->j_fc_replay_callback = ext4_fc_replay;
2090 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2091 		return;
2092 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2093 	if (!buffer_uptodate(journal->j_sb_buffer)
2094 		&& ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
2095 					true)) {
2096 		ext4_msg(sb, KERN_ERR, "I/O error on journal");
2097 		return;
2098 	}
2099 	num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
2100 	if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
2101 					EXT4_NUM_FC_BLKS)) {
2102 		pr_warn("Error while enabling fast commits, turning off.");
2103 		ext4_clear_feature_fast_commit(sb);
2104 	}
2105 }
2106 
/*
 * Human-readable descriptions of the reasons a fast commit can be
 * ineligible. ext4_fc_info_show() indexes this table with every value in
 * [0, EXT4_FC_REASON_MAX) and prints entry i next to
 * fc_ineligible_reason_count[i], so the order and number of strings here
 * must stay in step with those reason codes.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
2118 
2119 int ext4_fc_info_show(struct seq_file *seq, void *v)
2120 {
2121 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2122 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2123 	int i;
2124 
2125 	if (v != SEQ_START_TOKEN)
2126 		return 0;
2127 
2128 	seq_printf(seq,
2129 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2130 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2131 		   stats->fc_numblks,
2132 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2133 	seq_puts(seq, "Ineligible reasons:\n");
2134 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2135 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2136 			stats->fc_ineligible_reason_count[i]);
2137 
2138 	return 0;
2139 }
2140 
2141 int __init ext4_fc_init_dentry_cache(void)
2142 {
2143 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2144 					   SLAB_RECLAIM_ACCOUNT);
2145 
2146 	if (ext4_fc_dentry_cachep == NULL)
2147 		return -ENOMEM;
2148 
2149 	return 0;
2150 }
2151