xref: /openbmc/linux/fs/ext4/fast_commit.c (revision f5ad1c74)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
69  * attributes). Fast commit ineligiblity is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at much higher
111  *    routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118 
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121 
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124 	BUFFER_TRACE(bh, "");
125 	if (uptodate) {
126 		ext4_debug("%s: Block %lld up-to-date",
127 			   __func__, bh->b_blocknr);
128 		set_buffer_uptodate(bh);
129 	} else {
130 		ext4_debug("%s: Block %lld not up-to-date",
131 			   __func__, bh->b_blocknr);
132 		clear_buffer_uptodate(bh);
133 	}
134 
135 	unlock_buffer(bh);
136 }
137 
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140 	struct ext4_inode_info *ei = EXT4_I(inode);
141 
142 	ei->i_fc_lblk_start = 0;
143 	ei->i_fc_lblk_len = 0;
144 }
145 
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148 	struct ext4_inode_info *ei = EXT4_I(inode);
149 
150 	ext4_fc_reset_inode(inode);
151 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 	INIT_LIST_HEAD(&ei->i_fc_list);
153 	init_waitqueue_head(&ei->i_fc_wait);
154 	atomic_set(&ei->i_fc_updates, 0);
155 }
156 
157 /* This function must be called with sbi->s_fc_lock held. */
158 static void ext4_fc_wait_committing_inode(struct inode *inode)
159 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
160 {
161 	wait_queue_head_t *wq;
162 	struct ext4_inode_info *ei = EXT4_I(inode);
163 
164 #if (BITS_PER_LONG < 64)
165 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
166 			EXT4_STATE_FC_COMMITTING);
167 	wq = bit_waitqueue(&ei->i_state_flags,
168 				EXT4_STATE_FC_COMMITTING);
169 #else
170 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
171 			EXT4_STATE_FC_COMMITTING);
172 	wq = bit_waitqueue(&ei->i_flags,
173 				EXT4_STATE_FC_COMMITTING);
174 #endif
175 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
176 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
177 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
178 	schedule();
179 	finish_wait(wq, &wait.wq_entry);
180 }
181 
182 /*
183  * Inform Ext4's fast about start of an inode update
184  *
185  * This function is called by the high level call VFS callbacks before
186  * performing any inode update. This function blocks if there's an ongoing
187  * fast commit on the inode in question.
188  */
189 void ext4_fc_start_update(struct inode *inode)
190 {
191 	struct ext4_inode_info *ei = EXT4_I(inode);
192 
193 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
194 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
195 		return;
196 
197 restart:
198 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
199 	if (list_empty(&ei->i_fc_list))
200 		goto out;
201 
202 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
203 		ext4_fc_wait_committing_inode(inode);
204 		goto restart;
205 	}
206 out:
207 	atomic_inc(&ei->i_fc_updates);
208 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
209 }
210 
211 /*
212  * Stop inode update and wake up waiting fast commits if any.
213  */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216 	struct ext4_inode_info *ei = EXT4_I(inode);
217 
218 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220 		return;
221 
222 	if (atomic_dec_and_test(&ei->i_fc_updates))
223 		wake_up_all(&ei->i_fc_wait);
224 }
225 
226 /*
227  * Remove inode from fast commit list. If the inode is being committed
228  * we wait until inode commit is done.
229  */
230 void ext4_fc_del(struct inode *inode)
231 {
232 	struct ext4_inode_info *ei = EXT4_I(inode);
233 
234 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
235 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
236 		return;
237 
238 restart:
239 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
240 	if (list_empty(&ei->i_fc_list)) {
241 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
242 		return;
243 	}
244 
245 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
246 		ext4_fc_wait_committing_inode(inode);
247 		goto restart;
248 	}
249 	list_del_init(&ei->i_fc_list);
250 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 }
252 
253 /*
254  * Mark file system as fast commit ineligible. This means that next commit
255  * operation would result in a full jbd2 commit.
256  */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259 	struct ext4_sb_info *sbi = EXT4_SB(sb);
260 
261 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263 		return;
264 
265 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
266 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
267 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269 
270 /*
271  * Start a fast commit ineligible update. Any commits that happen while
272  * such an operation is in progress fall back to full commits.
273  */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276 	struct ext4_sb_info *sbi = EXT4_SB(sb);
277 
278 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280 		return;
281 
282 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
283 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284 	atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286 
287 /*
288  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289  * to ensure that after stopping the ineligible update, at least one full
290  * commit takes place.
291  */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296 		return;
297 
298 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
299 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
301 
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304 	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
305 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
306 }
307 
308 /*
309  * Generic fast commit tracking function. If this is the first time this we are
310  * called after a full commit, we initialize fast commit fields and then call
311  * __fc_track_fn() with update = 0. If we have already been called after a full
312  * commit, we pass update = 1. Based on that, the track function can determine
313  * if it needs to track a field for the first time or if it needs to just
314  * update the previously tracked value.
315  *
316  * If enqueue is set, this function enqueues the inode in fast commit list.
317  */
318 static int ext4_fc_track_template(
319 	handle_t *handle, struct inode *inode,
320 	int (*__fc_track_fn)(struct inode *, void *, bool),
321 	void *args, int enqueue)
322 {
323 	bool update = false;
324 	struct ext4_inode_info *ei = EXT4_I(inode);
325 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
326 	tid_t tid = 0;
327 	int ret;
328 
329 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
330 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
331 		return -EOPNOTSUPP;
332 
333 	if (ext4_fc_is_ineligible(inode->i_sb))
334 		return -EINVAL;
335 
336 	tid = handle->h_transaction->t_tid;
337 	mutex_lock(&ei->i_fc_lock);
338 	if (tid == ei->i_sync_tid) {
339 		update = true;
340 	} else {
341 		ext4_fc_reset_inode(inode);
342 		ei->i_sync_tid = tid;
343 	}
344 	ret = __fc_track_fn(inode, args, update);
345 	mutex_unlock(&ei->i_fc_lock);
346 
347 	if (!enqueue)
348 		return ret;
349 
350 	spin_lock(&sbi->s_fc_lock);
351 	if (list_empty(&EXT4_I(inode)->i_fc_list))
352 		list_add_tail(&EXT4_I(inode)->i_fc_list,
353 				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
354 				&sbi->s_fc_q[FC_Q_STAGING] :
355 				&sbi->s_fc_q[FC_Q_MAIN]);
356 	spin_unlock(&sbi->s_fc_lock);
357 
358 	return ret;
359 }
360 
/* Argument bundle handed to __track_dentry_update() through a void pointer. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being changed */
	int op;			/* fast commit tag recorded for the change */
};
365 
366 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
367 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
368 {
369 	struct ext4_fc_dentry_update *node;
370 	struct ext4_inode_info *ei = EXT4_I(inode);
371 	struct __track_dentry_update_args *dentry_update =
372 		(struct __track_dentry_update_args *)arg;
373 	struct dentry *dentry = dentry_update->dentry;
374 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
375 
376 	mutex_unlock(&ei->i_fc_lock);
377 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
378 	if (!node) {
379 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
380 		mutex_lock(&ei->i_fc_lock);
381 		return -ENOMEM;
382 	}
383 
384 	node->fcd_op = dentry_update->op;
385 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
386 	node->fcd_ino = inode->i_ino;
387 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
388 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
389 		if (!node->fcd_name.name) {
390 			kmem_cache_free(ext4_fc_dentry_cachep, node);
391 			ext4_fc_mark_ineligible(inode->i_sb,
392 				EXT4_FC_REASON_NOMEM);
393 			mutex_lock(&ei->i_fc_lock);
394 			return -ENOMEM;
395 		}
396 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
397 			dentry->d_name.len);
398 	} else {
399 		memcpy(node->fcd_iname, dentry->d_name.name,
400 			dentry->d_name.len);
401 		node->fcd_name.name = node->fcd_iname;
402 	}
403 	node->fcd_name.len = dentry->d_name.len;
404 
405 	spin_lock(&sbi->s_fc_lock);
406 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
407 		list_add_tail(&node->fcd_list,
408 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
409 	else
410 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
411 	spin_unlock(&sbi->s_fc_lock);
412 	mutex_lock(&ei->i_fc_lock);
413 
414 	return 0;
415 }
416 
417 void __ext4_fc_track_unlink(handle_t *handle,
418 		struct inode *inode, struct dentry *dentry)
419 {
420 	struct __track_dentry_update_args args;
421 	int ret;
422 
423 	args.dentry = dentry;
424 	args.op = EXT4_FC_TAG_UNLINK;
425 
426 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427 					(void *)&args, 0);
428 	trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430 
431 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
432 {
433 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
434 }
435 
436 void __ext4_fc_track_link(handle_t *handle,
437 	struct inode *inode, struct dentry *dentry)
438 {
439 	struct __track_dentry_update_args args;
440 	int ret;
441 
442 	args.dentry = dentry;
443 	args.op = EXT4_FC_TAG_LINK;
444 
445 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446 					(void *)&args, 0);
447 	trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449 
450 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
451 {
452 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
453 }
454 
455 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
456 {
457 	struct __track_dentry_update_args args;
458 	struct inode *inode = d_inode(dentry);
459 	int ret;
460 
461 	args.dentry = dentry;
462 	args.op = EXT4_FC_TAG_CREAT;
463 
464 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465 					(void *)&args, 0);
466 	trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468 
469 /* __track_fn for inode tracking */
470 static int __track_inode(struct inode *inode, void *arg, bool update)
471 {
472 	if (update)
473 		return -EEXIST;
474 
475 	EXT4_I(inode)->i_fc_lblk_len = 0;
476 
477 	return 0;
478 }
479 
480 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
481 {
482 	int ret;
483 
484 	if (S_ISDIR(inode->i_mode))
485 		return;
486 
487 	if (ext4_should_journal_data(inode)) {
488 		ext4_fc_mark_ineligible(inode->i_sb,
489 					EXT4_FC_REASON_INODE_JOURNAL_DATA);
490 		return;
491 	}
492 
493 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
494 	trace_ext4_fc_track_inode(inode, ret);
495 }
496 
497 struct __track_range_args {
498 	ext4_lblk_t start, end;
499 };
500 
501 /* __track_fn for tracking data updates */
502 static int __track_range(struct inode *inode, void *arg, bool update)
503 {
504 	struct ext4_inode_info *ei = EXT4_I(inode);
505 	ext4_lblk_t oldstart;
506 	struct __track_range_args *__arg =
507 		(struct __track_range_args *)arg;
508 
509 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
510 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
511 		return -ECANCELED;
512 	}
513 
514 	oldstart = ei->i_fc_lblk_start;
515 
516 	if (update && ei->i_fc_lblk_len > 0) {
517 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
518 		ei->i_fc_lblk_len =
519 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
520 				ei->i_fc_lblk_start + 1;
521 	} else {
522 		ei->i_fc_lblk_start = __arg->start;
523 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
524 	}
525 
526 	return 0;
527 }
528 
529 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
530 			 ext4_lblk_t end)
531 {
532 	struct __track_range_args args;
533 	int ret;
534 
535 	if (S_ISDIR(inode->i_mode))
536 		return;
537 
538 	args.start = start;
539 	args.end = end;
540 
541 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
542 
543 	trace_ext4_fc_track_range(inode, start, end, ret);
544 }
545 
546 static void ext4_fc_submit_bh(struct super_block *sb)
547 {
548 	int write_flags = REQ_SYNC;
549 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
550 
551 	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
552 	if (test_opt(sb, BARRIER))
553 		write_flags |= REQ_FUA | REQ_PREFLUSH;
554 	lock_buffer(bh);
555 	set_buffer_dirty(bh);
556 	set_buffer_uptodate(bh);
557 	bh->b_end_io = ext4_end_buffer_io_sync;
558 	submit_bh(REQ_OP_WRITE, write_flags, bh);
559 	EXT4_SB(sb)->s_fc_bh = NULL;
560 }
561 
562 /* Ext4 commit path routines */
563 
564 /* memzero and update CRC */
565 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
566 				u32 *crc)
567 {
568 	void *ret;
569 
570 	ret = memset(dst, 0, len);
571 	if (crc)
572 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
573 	return ret;
574 }
575 
576 /*
577  * Allocate len bytes on a fast commit buffer.
578  *
579  * During the commit time this function is used to manage fast commit
580  * block space. We don't split a fast commit log onto different
581  * blocks. So this function makes sure that if there's not enough space
582  * on the current block, the remaining space in the current block is
583  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
584  * new block is from jbd2 and CRC is updated to reflect the padding
585  * we added.
586  */
587 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
588 {
589 	struct ext4_fc_tl *tl;
590 	struct ext4_sb_info *sbi = EXT4_SB(sb);
591 	struct buffer_head *bh;
592 	int bsize = sbi->s_journal->j_blocksize;
593 	int ret, off = sbi->s_fc_bytes % bsize;
594 	int pad_len;
595 
596 	/*
597 	 * After allocating len, we should have space at least for a 0 byte
598 	 * padding.
599 	 */
600 	if (len + sizeof(struct ext4_fc_tl) > bsize)
601 		return NULL;
602 
603 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
604 		/*
605 		 * Only allocate from current buffer if we have enough space for
606 		 * this request AND we have space to add a zero byte padding.
607 		 */
608 		if (!sbi->s_fc_bh) {
609 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
610 			if (ret)
611 				return NULL;
612 			sbi->s_fc_bh = bh;
613 		}
614 		sbi->s_fc_bytes += len;
615 		return sbi->s_fc_bh->b_data + off;
616 	}
617 	/* Need to add PAD tag */
618 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
619 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
620 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
621 	tl->fc_len = cpu_to_le16(pad_len);
622 	if (crc)
623 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
624 	if (pad_len > 0)
625 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
626 	ext4_fc_submit_bh(sb);
627 
628 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
629 	if (ret)
630 		return NULL;
631 	sbi->s_fc_bh = bh;
632 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
633 	return sbi->s_fc_bh->b_data;
634 }
635 
636 /* memcpy to fc reserved space and update CRC */
637 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
638 				int len, u32 *crc)
639 {
640 	if (crc)
641 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
642 	return memcpy(dst, src, len);
643 }
644 
645 /*
646  * Complete a fast commit by writing tail tag.
647  *
648  * Writing tail tag marks the end of a fast commit. In order to guarantee
649  * atomicity, after writing tail tag, even if there's space remaining
650  * in the block, next commit shouldn't use it. That's why tail tag
651  * has the length as that of the remaining space on the block.
652  */
653 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
654 {
655 	struct ext4_sb_info *sbi = EXT4_SB(sb);
656 	struct ext4_fc_tl tl;
657 	struct ext4_fc_tail tail;
658 	int off, bsize = sbi->s_journal->j_blocksize;
659 	u8 *dst;
660 
661 	/*
662 	 * ext4_fc_reserve_space takes care of allocating an extra block if
663 	 * there's no enough space on this block for accommodating this tail.
664 	 */
665 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
666 	if (!dst)
667 		return -ENOSPC;
668 
669 	off = sbi->s_fc_bytes % bsize;
670 
671 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
672 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
673 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
674 
675 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
676 	dst += sizeof(tl);
677 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
678 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
679 	dst += sizeof(tail.fc_tid);
680 	tail.fc_crc = cpu_to_le32(crc);
681 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
682 
683 	ext4_fc_submit_bh(sb);
684 
685 	return 0;
686 }
687 
688 /*
689  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
690  * Returns false if there's not enough space.
691  */
692 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
693 			   u32 *crc)
694 {
695 	struct ext4_fc_tl tl;
696 	u8 *dst;
697 
698 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
699 	if (!dst)
700 		return false;
701 
702 	tl.fc_tag = cpu_to_le16(tag);
703 	tl.fc_len = cpu_to_le16(len);
704 
705 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
706 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
707 
708 	return true;
709 }
710 
711 /* Same as above, but adds dentry tlv. */
712 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
713 					int parent_ino, int ino, int dlen,
714 					const unsigned char *dname,
715 					u32 *crc)
716 {
717 	struct ext4_fc_dentry_info fcd;
718 	struct ext4_fc_tl tl;
719 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
720 					crc);
721 
722 	if (!dst)
723 		return false;
724 
725 	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
726 	fcd.fc_ino = cpu_to_le32(ino);
727 	tl.fc_tag = cpu_to_le16(tag);
728 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
729 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
730 	dst += sizeof(tl);
731 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
732 	dst += sizeof(fcd);
733 	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
734 	dst += dlen;
735 
736 	return true;
737 }
738 
739 /*
740  * Writes inode in the fast commit space under TLV with tag @tag.
741  * Returns 0 on success, error on failure.
742  */
743 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
744 {
745 	struct ext4_inode_info *ei = EXT4_I(inode);
746 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
747 	int ret;
748 	struct ext4_iloc iloc;
749 	struct ext4_fc_inode fc_inode;
750 	struct ext4_fc_tl tl;
751 	u8 *dst;
752 
753 	ret = ext4_get_inode_loc(inode, &iloc);
754 	if (ret)
755 		return ret;
756 
757 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
758 		inode_len += ei->i_extra_isize;
759 
760 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
761 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
762 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
763 
764 	dst = ext4_fc_reserve_space(inode->i_sb,
765 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
766 	if (!dst)
767 		return -ECANCELED;
768 
769 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
770 		return -ECANCELED;
771 	dst += sizeof(tl);
772 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
773 		return -ECANCELED;
774 	dst += sizeof(fc_inode);
775 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
776 					inode_len, crc))
777 		return -ECANCELED;
778 
779 	return 0;
780 }
781 
782 /*
783  * Writes updated data ranges for the inode in question. Updates CRC.
784  * Returns 0 on success, error otherwise.
785  */
786 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
787 {
788 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
789 	struct ext4_inode_info *ei = EXT4_I(inode);
790 	struct ext4_map_blocks map;
791 	struct ext4_fc_add_range fc_ext;
792 	struct ext4_fc_del_range lrange;
793 	struct ext4_extent *ex;
794 	int ret;
795 
796 	mutex_lock(&ei->i_fc_lock);
797 	if (ei->i_fc_lblk_len == 0) {
798 		mutex_unlock(&ei->i_fc_lock);
799 		return 0;
800 	}
801 	old_blk_size = ei->i_fc_lblk_start;
802 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
803 	ei->i_fc_lblk_len = 0;
804 	mutex_unlock(&ei->i_fc_lock);
805 
806 	cur_lblk_off = old_blk_size;
807 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
808 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
809 
810 	while (cur_lblk_off <= new_blk_size) {
811 		map.m_lblk = cur_lblk_off;
812 		map.m_len = new_blk_size - cur_lblk_off + 1;
813 		ret = ext4_map_blocks(NULL, inode, &map, 0);
814 		if (ret < 0)
815 			return -ECANCELED;
816 
817 		if (map.m_len == 0) {
818 			cur_lblk_off++;
819 			continue;
820 		}
821 
822 		if (ret == 0) {
823 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
824 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
825 			lrange.fc_len = cpu_to_le32(map.m_len);
826 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
827 					    sizeof(lrange), (u8 *)&lrange, crc))
828 				return -ENOSPC;
829 		} else {
830 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
831 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
832 			ex->ee_block = cpu_to_le32(map.m_lblk);
833 			ex->ee_len = cpu_to_le16(map.m_len);
834 			ext4_ext_store_pblock(ex, map.m_pblk);
835 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
836 				ext4_ext_mark_unwritten(ex);
837 			else
838 				ext4_ext_mark_initialized(ex);
839 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
840 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
841 				return -ENOSPC;
842 		}
843 
844 		cur_lblk_off += map.m_len;
845 	}
846 
847 	return 0;
848 }
849 
850 
851 /* Submit data for all the fast commit inodes */
852 static int ext4_fc_submit_inode_data_all(journal_t *journal)
853 {
854 	struct super_block *sb = (struct super_block *)(journal->j_private);
855 	struct ext4_sb_info *sbi = EXT4_SB(sb);
856 	struct ext4_inode_info *ei;
857 	struct list_head *pos;
858 	int ret = 0;
859 
860 	spin_lock(&sbi->s_fc_lock);
861 	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
862 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
863 		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
864 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
865 		while (atomic_read(&ei->i_fc_updates)) {
866 			DEFINE_WAIT(wait);
867 
868 			prepare_to_wait(&ei->i_fc_wait, &wait,
869 						TASK_UNINTERRUPTIBLE);
870 			if (atomic_read(&ei->i_fc_updates)) {
871 				spin_unlock(&sbi->s_fc_lock);
872 				schedule();
873 				spin_lock(&sbi->s_fc_lock);
874 			}
875 			finish_wait(&ei->i_fc_wait, &wait);
876 		}
877 		spin_unlock(&sbi->s_fc_lock);
878 		ret = jbd2_submit_inode_data(ei->jinode);
879 		if (ret)
880 			return ret;
881 		spin_lock(&sbi->s_fc_lock);
882 	}
883 	spin_unlock(&sbi->s_fc_lock);
884 
885 	return ret;
886 }
887 
888 /* Wait for completion of data for all the fast commit inodes */
889 static int ext4_fc_wait_inode_data_all(journal_t *journal)
890 {
891 	struct super_block *sb = (struct super_block *)(journal->j_private);
892 	struct ext4_sb_info *sbi = EXT4_SB(sb);
893 	struct ext4_inode_info *pos, *n;
894 	int ret = 0;
895 
896 	spin_lock(&sbi->s_fc_lock);
897 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
898 		if (!ext4_test_inode_state(&pos->vfs_inode,
899 					   EXT4_STATE_FC_COMMITTING))
900 			continue;
901 		spin_unlock(&sbi->s_fc_lock);
902 
903 		ret = jbd2_wait_inode_data(journal, pos->jinode);
904 		if (ret)
905 			return ret;
906 		spin_lock(&sbi->s_fc_lock);
907 	}
908 	spin_unlock(&sbi->s_fc_lock);
909 
910 	return 0;
911 }
912 
913 /* Commit all the directory entry updates */
914 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
915 __acquires(&sbi->s_fc_lock)
916 __releases(&sbi->s_fc_lock)
917 {
918 	struct super_block *sb = (struct super_block *)(journal->j_private);
919 	struct ext4_sb_info *sbi = EXT4_SB(sb);
920 	struct ext4_fc_dentry_update *fc_dentry;
921 	struct inode *inode;
922 	struct list_head *pos, *n, *fcd_pos, *fcd_n;
923 	struct ext4_inode_info *ei;
924 	int ret;
925 
926 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
927 		return 0;
928 	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
929 		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
930 					fcd_list);
931 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
932 			spin_unlock(&sbi->s_fc_lock);
933 			if (!ext4_fc_add_dentry_tlv(
934 				sb, fc_dentry->fcd_op,
935 				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
936 				fc_dentry->fcd_name.len,
937 				fc_dentry->fcd_name.name, crc)) {
938 				ret = -ENOSPC;
939 				goto lock_and_exit;
940 			}
941 			spin_lock(&sbi->s_fc_lock);
942 			continue;
943 		}
944 
945 		inode = NULL;
946 		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
947 			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
948 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
949 				inode = &ei->vfs_inode;
950 				break;
951 			}
952 		}
953 		/*
954 		 * If we don't find inode in our list, then it was deleted,
955 		 * in which case, we don't need to record it's create tag.
956 		 */
957 		if (!inode)
958 			continue;
959 		spin_unlock(&sbi->s_fc_lock);
960 
961 		/*
962 		 * We first write the inode and then the create dirent. This
963 		 * allows the recovery code to create an unnamed inode first
964 		 * and then link it to a directory entry. This allows us
965 		 * to use namei.c routines almost as is and simplifies
966 		 * the recovery code.
967 		 */
968 		ret = ext4_fc_write_inode(inode, crc);
969 		if (ret)
970 			goto lock_and_exit;
971 
972 		ret = ext4_fc_write_inode_data(inode, crc);
973 		if (ret)
974 			goto lock_and_exit;
975 
976 		if (!ext4_fc_add_dentry_tlv(
977 			sb, fc_dentry->fcd_op,
978 			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
979 			fc_dentry->fcd_name.len,
980 			fc_dentry->fcd_name.name, crc)) {
981 			ret = -ENOSPC;
982 			goto lock_and_exit;
983 		}
984 
985 		spin_lock(&sbi->s_fc_lock);
986 	}
987 	return 0;
988 lock_and_exit:
989 	spin_lock(&sbi->s_fc_lock);
990 	return ret;
991 }
992 
993 static int ext4_fc_perform_commit(journal_t *journal)
994 {
995 	struct super_block *sb = (struct super_block *)(journal->j_private);
996 	struct ext4_sb_info *sbi = EXT4_SB(sb);
997 	struct ext4_inode_info *iter;
998 	struct ext4_fc_head head;
999 	struct list_head *pos;
1000 	struct inode *inode;
1001 	struct blk_plug plug;
1002 	int ret = 0;
1003 	u32 crc = 0;
1004 
1005 	ret = ext4_fc_submit_inode_data_all(journal);
1006 	if (ret)
1007 		return ret;
1008 
1009 	ret = ext4_fc_wait_inode_data_all(journal);
1010 	if (ret)
1011 		return ret;
1012 
1013 	/*
1014 	 * If file system device is different from journal device, issue a cache
1015 	 * flush before we start writing fast commit blocks.
1016 	 */
1017 	if (journal->j_fs_dev != journal->j_dev)
1018 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1019 
1020 	blk_start_plug(&plug);
1021 	if (sbi->s_fc_bytes == 0) {
1022 		/*
1023 		 * Add a head tag only if this is the first fast commit
1024 		 * in this TID.
1025 		 */
1026 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1027 		head.fc_tid = cpu_to_le32(
1028 			sbi->s_journal->j_running_transaction->t_tid);
1029 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1030 			(u8 *)&head, &crc))
1031 			goto out;
1032 	}
1033 
1034 	spin_lock(&sbi->s_fc_lock);
1035 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1036 	if (ret) {
1037 		spin_unlock(&sbi->s_fc_lock);
1038 		goto out;
1039 	}
1040 
1041 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1042 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1043 		inode = &iter->vfs_inode;
1044 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1045 			continue;
1046 
1047 		spin_unlock(&sbi->s_fc_lock);
1048 		ret = ext4_fc_write_inode_data(inode, &crc);
1049 		if (ret)
1050 			goto out;
1051 		ret = ext4_fc_write_inode(inode, &crc);
1052 		if (ret)
1053 			goto out;
1054 		spin_lock(&sbi->s_fc_lock);
1055 	}
1056 	spin_unlock(&sbi->s_fc_lock);
1057 
1058 	ret = ext4_fc_write_tail(sb, crc);
1059 
1060 out:
1061 	blk_finish_plug(&plug);
1062 	return ret;
1063 }
1064 
1065 /*
1066  * The main commit entry point. Performs a fast commit for transaction
1067  * commit_tid if needed. If it's not possible to perform a fast commit
1068  * due to various reasons, we fall back to full commit. Returns 0
1069  * on success, error otherwise.
1070  */
1071 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1072 {
1073 	struct super_block *sb = (struct super_block *)(journal->j_private);
1074 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1075 	int nblks = 0, ret, bsize = journal->j_blocksize;
1076 	int subtid = atomic_read(&sbi->s_fc_subtid);
1077 	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1078 	ktime_t start_time, commit_time;
1079 
1080 	trace_ext4_fc_commit_start(sb);
1081 
1082 	start_time = ktime_get();
1083 
1084 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1085 		(ext4_fc_is_ineligible(sb))) {
1086 		reason = EXT4_FC_REASON_INELIGIBLE;
1087 		goto out;
1088 	}
1089 
1090 restart_fc:
1091 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1092 	if (ret == -EALREADY) {
1093 		/* There was an ongoing commit, check if we need to restart */
1094 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1095 			commit_tid > journal->j_commit_sequence)
1096 			goto restart_fc;
1097 		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1098 		goto out;
1099 	} else if (ret) {
1100 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1101 		reason = EXT4_FC_REASON_FC_START_FAILED;
1102 		goto out;
1103 	}
1104 
1105 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1106 	ret = ext4_fc_perform_commit(journal);
1107 	if (ret < 0) {
1108 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1109 		reason = EXT4_FC_REASON_FC_FAILED;
1110 		goto out;
1111 	}
1112 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1113 	ret = jbd2_fc_wait_bufs(journal, nblks);
1114 	if (ret < 0) {
1115 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1116 		reason = EXT4_FC_REASON_FC_FAILED;
1117 		goto out;
1118 	}
1119 	atomic_inc(&sbi->s_fc_subtid);
1120 	jbd2_fc_end_commit(journal);
1121 out:
1122 	/* Has any ineligible update happened since we started? */
1123 	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1124 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1125 		reason = EXT4_FC_REASON_INELIGIBLE;
1126 	}
1127 
1128 	spin_lock(&sbi->s_fc_lock);
1129 	if (reason != EXT4_FC_REASON_OK &&
1130 		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1131 		sbi->s_fc_stats.fc_ineligible_commits++;
1132 	} else {
1133 		sbi->s_fc_stats.fc_num_commits++;
1134 		sbi->s_fc_stats.fc_numblks += nblks;
1135 	}
1136 	spin_unlock(&sbi->s_fc_lock);
1137 	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1138 	trace_ext4_fc_commit_stop(sb, nblks, reason);
1139 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1140 	/*
1141 	 * weight the commit time higher than the average time so we don't
1142 	 * react too strongly to vast changes in the commit time
1143 	 */
1144 	if (likely(sbi->s_fc_avg_commit_time))
1145 		sbi->s_fc_avg_commit_time = (commit_time +
1146 				sbi->s_fc_avg_commit_time * 3) / 4;
1147 	else
1148 		sbi->s_fc_avg_commit_time = commit_time;
1149 	jbd_debug(1,
1150 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1151 		nblks, reason, subtid);
1152 	if (reason == EXT4_FC_REASON_FC_FAILED)
1153 		return jbd2_fc_end_commit_fallback(journal);
1154 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1155 		reason == EXT4_FC_REASON_INELIGIBLE)
1156 		return jbd2_complete_transaction(journal, commit_tid);
1157 	return 0;
1158 }
1159 
1160 /*
1161  * Fast commit cleanup routine. This is called after every fast commit and
1162  * full commit. full is true if we are called after a full commit.
1163  */
1164 static void ext4_fc_cleanup(journal_t *journal, int full)
1165 {
1166 	struct super_block *sb = journal->j_private;
1167 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1168 	struct ext4_inode_info *iter;
1169 	struct ext4_fc_dentry_update *fc_dentry;
1170 	struct list_head *pos, *n;
1171 
1172 	if (full && sbi->s_fc_bh)
1173 		sbi->s_fc_bh = NULL;
1174 
1175 	jbd2_fc_release_bufs(journal);
1176 
1177 	spin_lock(&sbi->s_fc_lock);
1178 	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1179 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1180 		list_del_init(&iter->i_fc_list);
1181 		ext4_clear_inode_state(&iter->vfs_inode,
1182 				       EXT4_STATE_FC_COMMITTING);
1183 		ext4_fc_reset_inode(&iter->vfs_inode);
1184 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1185 		smp_mb();
1186 #if (BITS_PER_LONG < 64)
1187 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1188 #else
1189 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1190 #endif
1191 	}
1192 
1193 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1194 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1195 					     struct ext4_fc_dentry_update,
1196 					     fcd_list);
1197 		list_del_init(&fc_dentry->fcd_list);
1198 		spin_unlock(&sbi->s_fc_lock);
1199 
1200 		if (fc_dentry->fcd_name.name &&
1201 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1202 			kfree(fc_dentry->fcd_name.name);
1203 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1204 		spin_lock(&sbi->s_fc_lock);
1205 	}
1206 
1207 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1208 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1209 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1210 				&sbi->s_fc_q[FC_Q_STAGING]);
1211 
1212 	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1213 	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1214 
1215 	if (full)
1216 		sbi->s_fc_bytes = 0;
1217 	spin_unlock(&sbi->s_fc_lock);
1218 	trace_ext4_fc_stats(sb);
1219 }
1220 
1221 /* Ext4 Replay Path Routines */
1222 
1223 /* Get length of a particular tlv */
1224 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1225 {
1226 	return le16_to_cpu(tl->fc_len);
1227 }
1228 
1229 /* Get a pointer to "value" of a tlv */
1230 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1231 {
1232 	return (u8 *)tl + sizeof(*tl);
1233 }
1234 
/* Decoded form of a dentry tlv, shared by the dentry replay routines. */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name */
};
1240 
1241 static inline void tl_to_darg(struct dentry_info_args *darg,
1242 				struct  ext4_fc_tl *tl)
1243 {
1244 	struct ext4_fc_dentry_info *fcd;
1245 
1246 	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1247 
1248 	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1249 	darg->ino = le32_to_cpu(fcd->fc_ino);
1250 	darg->dname = fcd->fc_dname;
1251 	darg->dname_len = ext4_fc_tag_len(tl) -
1252 			sizeof(struct ext4_fc_dentry_info);
1253 }
1254 
1255 /* Unlink replay function */
1256 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1257 {
1258 	struct inode *inode, *old_parent;
1259 	struct qstr entry;
1260 	struct dentry_info_args darg;
1261 	int ret = 0;
1262 
1263 	tl_to_darg(&darg, tl);
1264 
1265 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1266 			darg.parent_ino, darg.dname_len);
1267 
1268 	entry.name = darg.dname;
1269 	entry.len = darg.dname_len;
1270 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1271 
1272 	if (IS_ERR_OR_NULL(inode)) {
1273 		jbd_debug(1, "Inode %d not found", darg.ino);
1274 		return 0;
1275 	}
1276 
1277 	old_parent = ext4_iget(sb, darg.parent_ino,
1278 				EXT4_IGET_NORMAL);
1279 	if (IS_ERR_OR_NULL(old_parent)) {
1280 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1281 		iput(inode);
1282 		return 0;
1283 	}
1284 
1285 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1286 	/* -ENOENT ok coz it might not exist anymore. */
1287 	if (ret == -ENOENT)
1288 		ret = 0;
1289 	iput(old_parent);
1290 	iput(inode);
1291 	return ret;
1292 }
1293 
1294 static int ext4_fc_replay_link_internal(struct super_block *sb,
1295 				struct dentry_info_args *darg,
1296 				struct inode *inode)
1297 {
1298 	struct inode *dir = NULL;
1299 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1300 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1301 	int ret = 0;
1302 
1303 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1304 	if (IS_ERR(dir)) {
1305 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1306 		dir = NULL;
1307 		goto out;
1308 	}
1309 
1310 	dentry_dir = d_obtain_alias(dir);
1311 	if (IS_ERR(dentry_dir)) {
1312 		jbd_debug(1, "Failed to obtain dentry");
1313 		dentry_dir = NULL;
1314 		goto out;
1315 	}
1316 
1317 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1318 	if (!dentry_inode) {
1319 		jbd_debug(1, "Inode dentry not created.");
1320 		ret = -ENOMEM;
1321 		goto out;
1322 	}
1323 
1324 	ret = __ext4_link(dir, inode, dentry_inode);
1325 	/*
1326 	 * It's possible that link already existed since data blocks
1327 	 * for the dir in question got persisted before we crashed OR
1328 	 * we replayed this tag and crashed before the entire replay
1329 	 * could complete.
1330 	 */
1331 	if (ret && ret != -EEXIST) {
1332 		jbd_debug(1, "Failed to link\n");
1333 		goto out;
1334 	}
1335 
1336 	ret = 0;
1337 out:
1338 	if (dentry_dir) {
1339 		d_drop(dentry_dir);
1340 		dput(dentry_dir);
1341 	} else if (dir) {
1342 		iput(dir);
1343 	}
1344 	if (dentry_inode) {
1345 		d_drop(dentry_inode);
1346 		dput(dentry_inode);
1347 	}
1348 
1349 	return ret;
1350 }
1351 
1352 /* Link replay function */
1353 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1354 {
1355 	struct inode *inode;
1356 	struct dentry_info_args darg;
1357 	int ret = 0;
1358 
1359 	tl_to_darg(&darg, tl);
1360 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1361 			darg.parent_ino, darg.dname_len);
1362 
1363 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1364 	if (IS_ERR_OR_NULL(inode)) {
1365 		jbd_debug(1, "Inode not found.");
1366 		return 0;
1367 	}
1368 
1369 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1370 	iput(inode);
1371 	return ret;
1372 }
1373 
1374 /*
1375  * Record all the modified inodes during replay. We use this later to setup
1376  * block bitmaps correctly.
1377  */
1378 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1379 {
1380 	struct ext4_fc_replay_state *state;
1381 	int i;
1382 
1383 	state = &EXT4_SB(sb)->s_fc_replay_state;
1384 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1385 		if (state->fc_modified_inodes[i] == ino)
1386 			return 0;
1387 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1388 		state->fc_modified_inodes_size +=
1389 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1390 		state->fc_modified_inodes = krealloc(
1391 					state->fc_modified_inodes, sizeof(int) *
1392 					state->fc_modified_inodes_size,
1393 					GFP_KERNEL);
1394 		if (!state->fc_modified_inodes)
1395 			return -ENOMEM;
1396 	}
1397 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1398 	return 0;
1399 }
1400 
1401 /*
1402  * Inode replay function
1403  */
1404 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1405 {
1406 	struct ext4_fc_inode *fc_inode;
1407 	struct ext4_inode *raw_inode;
1408 	struct ext4_inode *raw_fc_inode;
1409 	struct inode *inode = NULL;
1410 	struct ext4_iloc iloc;
1411 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1412 	struct ext4_extent_header *eh;
1413 
1414 	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1415 
1416 	ino = le32_to_cpu(fc_inode->fc_ino);
1417 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1418 
1419 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1420 	if (!IS_ERR_OR_NULL(inode)) {
1421 		ext4_ext_clear_bb(inode);
1422 		iput(inode);
1423 	}
1424 
1425 	ext4_fc_record_modified_inode(sb, ino);
1426 
1427 	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1428 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1429 	if (ret)
1430 		goto out;
1431 
1432 	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1433 	raw_inode = ext4_raw_inode(&iloc);
1434 
1435 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1436 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1437 		inode_len - offsetof(struct ext4_inode, i_generation));
1438 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1439 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1440 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1441 			memset(eh, 0, sizeof(*eh));
1442 			eh->eh_magic = EXT4_EXT_MAGIC;
1443 			eh->eh_max = cpu_to_le16(
1444 				(sizeof(raw_inode->i_block) -
1445 				 sizeof(struct ext4_extent_header))
1446 				 / sizeof(struct ext4_extent));
1447 		}
1448 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1449 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1450 			sizeof(raw_inode->i_block));
1451 	}
1452 
1453 	/* Immediately update the inode on disk. */
1454 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1455 	if (ret)
1456 		goto out;
1457 	ret = sync_dirty_buffer(iloc.bh);
1458 	if (ret)
1459 		goto out;
1460 	ret = ext4_mark_inode_used(sb, ino);
1461 	if (ret)
1462 		goto out;
1463 
1464 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1465 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1466 	if (IS_ERR_OR_NULL(inode)) {
1467 		jbd_debug(1, "Inode not found.");
1468 		return -EFSCORRUPTED;
1469 	}
1470 
1471 	/*
1472 	 * Our allocator could have made different decisions than before
1473 	 * crashing. This should be fixed but until then, we calculate
1474 	 * the number of blocks the inode.
1475 	 */
1476 	ext4_ext_replay_set_iblocks(inode);
1477 
1478 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1479 	ext4_reset_inode_seed(inode);
1480 
1481 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1482 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1483 	sync_dirty_buffer(iloc.bh);
1484 	brelse(iloc.bh);
1485 out:
1486 	iput(inode);
1487 	if (!ret)
1488 		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1489 
1490 	return 0;
1491 }
1492 
1493 /*
1494  * Dentry create replay function.
1495  *
1496  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1497  * inode for which we are trying to create a dentry here, should already have
1498  * been replayed before we start here.
1499  */
1500 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1501 {
1502 	int ret = 0;
1503 	struct inode *inode = NULL;
1504 	struct inode *dir = NULL;
1505 	struct dentry_info_args darg;
1506 
1507 	tl_to_darg(&darg, tl);
1508 
1509 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1510 			darg.parent_ino, darg.dname_len);
1511 
1512 	/* This takes care of update group descriptor and other metadata */
1513 	ret = ext4_mark_inode_used(sb, darg.ino);
1514 	if (ret)
1515 		goto out;
1516 
1517 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1518 	if (IS_ERR_OR_NULL(inode)) {
1519 		jbd_debug(1, "inode %d not found.", darg.ino);
1520 		inode = NULL;
1521 		ret = -EINVAL;
1522 		goto out;
1523 	}
1524 
1525 	if (S_ISDIR(inode->i_mode)) {
1526 		/*
1527 		 * If we are creating a directory, we need to make sure that the
1528 		 * dot and dot dot dirents are setup properly.
1529 		 */
1530 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1531 		if (IS_ERR_OR_NULL(dir)) {
1532 			jbd_debug(1, "Dir %d not found.", darg.ino);
1533 			goto out;
1534 		}
1535 		ret = ext4_init_new_dir(NULL, dir, inode);
1536 		iput(dir);
1537 		if (ret) {
1538 			ret = 0;
1539 			goto out;
1540 		}
1541 	}
1542 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1543 	if (ret)
1544 		goto out;
1545 	set_nlink(inode, 1);
1546 	ext4_mark_inode_dirty(NULL, inode);
1547 out:
1548 	if (inode)
1549 		iput(inode);
1550 	return ret;
1551 }
1552 
1553 /*
1554  * Record physical disk regions which are in use as per fast commit area. Our
1555  * simple replay phase allocator excludes these regions from allocation.
1556  */
1557 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1558 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1559 {
1560 	struct ext4_fc_replay_state *state;
1561 	struct ext4_fc_alloc_region *region;
1562 
1563 	state = &EXT4_SB(sb)->s_fc_replay_state;
1564 	if (state->fc_regions_used == state->fc_regions_size) {
1565 		state->fc_regions_size +=
1566 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1567 		state->fc_regions = krealloc(
1568 					state->fc_regions,
1569 					state->fc_regions_size *
1570 					sizeof(struct ext4_fc_alloc_region),
1571 					GFP_KERNEL);
1572 		if (!state->fc_regions)
1573 			return -ENOMEM;
1574 	}
1575 	region = &state->fc_regions[state->fc_regions_used++];
1576 	region->ino = ino;
1577 	region->lblk = lblk;
1578 	region->pblk = pblk;
1579 	region->len = len;
1580 
1581 	return 0;
1582 }
1583 
1584 /* Replay add range tag */
1585 static int ext4_fc_replay_add_range(struct super_block *sb,
1586 				struct ext4_fc_tl *tl)
1587 {
1588 	struct ext4_fc_add_range *fc_add_ex;
1589 	struct ext4_extent newex, *ex;
1590 	struct inode *inode;
1591 	ext4_lblk_t start, cur;
1592 	int remaining, len;
1593 	ext4_fsblk_t start_pblk;
1594 	struct ext4_map_blocks map;
1595 	struct ext4_ext_path *path = NULL;
1596 	int ret;
1597 
1598 	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1599 	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1600 
1601 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1602 		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1603 		ext4_ext_get_actual_len(ex));
1604 
1605 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1606 				EXT4_IGET_NORMAL);
1607 	if (IS_ERR_OR_NULL(inode)) {
1608 		jbd_debug(1, "Inode not found.");
1609 		return 0;
1610 	}
1611 
1612 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1613 
1614 	start = le32_to_cpu(ex->ee_block);
1615 	start_pblk = ext4_ext_pblock(ex);
1616 	len = ext4_ext_get_actual_len(ex);
1617 
1618 	cur = start;
1619 	remaining = len;
1620 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1621 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1622 		  inode->i_ino);
1623 
1624 	while (remaining > 0) {
1625 		map.m_lblk = cur;
1626 		map.m_len = remaining;
1627 		map.m_pblk = 0;
1628 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1629 
1630 		if (ret < 0) {
1631 			iput(inode);
1632 			return 0;
1633 		}
1634 
1635 		if (ret == 0) {
1636 			/* Range is not mapped */
1637 			path = ext4_find_extent(inode, cur, NULL, 0);
1638 			if (IS_ERR(path)) {
1639 				iput(inode);
1640 				return 0;
1641 			}
1642 			memset(&newex, 0, sizeof(newex));
1643 			newex.ee_block = cpu_to_le32(cur);
1644 			ext4_ext_store_pblock(
1645 				&newex, start_pblk + cur - start);
1646 			newex.ee_len = cpu_to_le16(map.m_len);
1647 			if (ext4_ext_is_unwritten(ex))
1648 				ext4_ext_mark_unwritten(&newex);
1649 			down_write(&EXT4_I(inode)->i_data_sem);
1650 			ret = ext4_ext_insert_extent(
1651 				NULL, inode, &path, &newex, 0);
1652 			up_write((&EXT4_I(inode)->i_data_sem));
1653 			ext4_ext_drop_refs(path);
1654 			kfree(path);
1655 			if (ret) {
1656 				iput(inode);
1657 				return 0;
1658 			}
1659 			goto next;
1660 		}
1661 
1662 		if (start_pblk + cur - start != map.m_pblk) {
1663 			/*
1664 			 * Logical to physical mapping changed. This can happen
1665 			 * if this range was removed and then reallocated to
1666 			 * map to new physical blocks during a fast commit.
1667 			 */
1668 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1669 					ext4_ext_is_unwritten(ex),
1670 					start_pblk + cur - start);
1671 			if (ret) {
1672 				iput(inode);
1673 				return 0;
1674 			}
1675 			/*
1676 			 * Mark the old blocks as free since they aren't used
1677 			 * anymore. We maintain an array of all the modified
1678 			 * inodes. In case these blocks are still used at either
1679 			 * a different logical range in the same inode or in
1680 			 * some different inode, we will mark them as allocated
1681 			 * at the end of the FC replay using our array of
1682 			 * modified inodes.
1683 			 */
1684 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1685 			goto next;
1686 		}
1687 
1688 		/* Range is mapped and needs a state change */
1689 		jbd_debug(1, "Converting from %d to %d %lld",
1690 				map.m_flags & EXT4_MAP_UNWRITTEN,
1691 			ext4_ext_is_unwritten(ex), map.m_pblk);
1692 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1693 					ext4_ext_is_unwritten(ex), map.m_pblk);
1694 		if (ret) {
1695 			iput(inode);
1696 			return 0;
1697 		}
1698 		/*
1699 		 * We may have split the extent tree while toggling the state.
1700 		 * Try to shrink the extent tree now.
1701 		 */
1702 		ext4_ext_replay_shrink_inode(inode, start + len);
1703 next:
1704 		cur += map.m_len;
1705 		remaining -= map.m_len;
1706 	}
1707 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1708 					sb->s_blocksize_bits);
1709 	iput(inode);
1710 	return 0;
1711 }
1712 
1713 /* Replay DEL_RANGE tag */
1714 static int
1715 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1716 {
1717 	struct inode *inode;
1718 	struct ext4_fc_del_range *lrange;
1719 	struct ext4_map_blocks map;
1720 	ext4_lblk_t cur, remaining;
1721 	int ret;
1722 
1723 	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1724 	cur = le32_to_cpu(lrange->fc_lblk);
1725 	remaining = le32_to_cpu(lrange->fc_len);
1726 
1727 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1728 		le32_to_cpu(lrange->fc_ino), cur, remaining);
1729 
1730 	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1731 	if (IS_ERR_OR_NULL(inode)) {
1732 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1733 		return 0;
1734 	}
1735 
1736 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1737 
1738 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1739 			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1740 			le32_to_cpu(lrange->fc_len));
1741 	while (remaining > 0) {
1742 		map.m_lblk = cur;
1743 		map.m_len = remaining;
1744 
1745 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1746 		if (ret < 0) {
1747 			iput(inode);
1748 			return 0;
1749 		}
1750 		if (ret > 0) {
1751 			remaining -= ret;
1752 			cur += ret;
1753 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1754 		} else {
1755 			remaining -= map.m_len;
1756 			cur += map.m_len;
1757 		}
1758 	}
1759 
1760 	ret = ext4_punch_hole(inode,
1761 		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1762 		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1763 	if (ret)
1764 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1765 	ext4_ext_replay_shrink_inode(inode,
1766 		i_size_read(inode) >> sb->s_blocksize_bits);
1767 	ext4_mark_inode_dirty(NULL, inode);
1768 	iput(inode);
1769 
1770 	return 0;
1771 }
1772 
1773 static inline const char *tag2str(u16 tag)
1774 {
1775 	switch (tag) {
1776 	case EXT4_FC_TAG_LINK:
1777 		return "TAG_ADD_ENTRY";
1778 	case EXT4_FC_TAG_UNLINK:
1779 		return "TAG_DEL_ENTRY";
1780 	case EXT4_FC_TAG_ADD_RANGE:
1781 		return "TAG_ADD_RANGE";
1782 	case EXT4_FC_TAG_CREAT:
1783 		return "TAG_CREAT_DENTRY";
1784 	case EXT4_FC_TAG_DEL_RANGE:
1785 		return "TAG_DEL_RANGE";
1786 	case EXT4_FC_TAG_INODE:
1787 		return "TAG_INODE";
1788 	case EXT4_FC_TAG_PAD:
1789 		return "TAG_PAD";
1790 	case EXT4_FC_TAG_TAIL:
1791 		return "TAG_TAIL";
1792 	case EXT4_FC_TAG_HEAD:
1793 		return "TAG_HEAD";
1794 	default:
1795 		return "TAG_ERROR";
1796 	}
1797 }
1798 
1799 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1800 {
1801 	struct ext4_fc_replay_state *state;
1802 	struct inode *inode;
1803 	struct ext4_ext_path *path = NULL;
1804 	struct ext4_map_blocks map;
1805 	int i, ret, j;
1806 	ext4_lblk_t cur, end;
1807 
1808 	state = &EXT4_SB(sb)->s_fc_replay_state;
1809 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1810 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1811 			EXT4_IGET_NORMAL);
1812 		if (IS_ERR_OR_NULL(inode)) {
1813 			jbd_debug(1, "Inode %d not found.",
1814 				state->fc_modified_inodes[i]);
1815 			continue;
1816 		}
1817 		cur = 0;
1818 		end = EXT_MAX_BLOCKS;
1819 		while (cur < end) {
1820 			map.m_lblk = cur;
1821 			map.m_len = end - cur;
1822 
1823 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1824 			if (ret < 0)
1825 				break;
1826 
1827 			if (ret > 0) {
1828 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1829 				if (!IS_ERR_OR_NULL(path)) {
1830 					for (j = 0; j < path->p_depth; j++)
1831 						ext4_mb_mark_bb(inode->i_sb,
1832 							path[j].p_block, 1, 1);
1833 					ext4_ext_drop_refs(path);
1834 					kfree(path);
1835 				}
1836 				cur += ret;
1837 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1838 							map.m_len, 1);
1839 			} else {
1840 				cur = cur + (map.m_len ? map.m_len : 1);
1841 			}
1842 		}
1843 		iput(inode);
1844 	}
1845 }
1846 
1847 /*
1848  * Check if block is in excluded regions for block allocation. The simple
1849  * allocator that runs during replay phase is calls this function to see
1850  * if it is okay to use a block.
1851  */
1852 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1853 {
1854 	int i;
1855 	struct ext4_fc_replay_state *state;
1856 
1857 	state = &EXT4_SB(sb)->s_fc_replay_state;
1858 	for (i = 0; i < state->fc_regions_valid; i++) {
1859 		if (state->fc_regions[i].ino == 0 ||
1860 			state->fc_regions[i].len == 0)
1861 			continue;
1862 		if (blk >= state->fc_regions[i].pblk &&
1863 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1864 			return true;
1865 	}
1866 	return false;
1867 }
1868 
1869 /* Cleanup function called after replay */
1870 void ext4_fc_replay_cleanup(struct super_block *sb)
1871 {
1872 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1873 
1874 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1875 	kfree(sbi->s_fc_replay_state.fc_regions);
1876 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1877 }
1878 
1879 /*
1880  * Recovery Scan phase handler
1881  *
1882  * This function is called during the scan phase and is responsible
1883  * for doing following things:
1884  * - Make sure the fast commit area has valid tags for replay
1885  * - Count number of tags that need to be replayed by the replay handler
1886  * - Verify CRC
1887  * - Create a list of excluded blocks for allocation during replay phase
1888  *
1889  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1890  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1891  * to indicate that scan has finished and JBD2 can now start replay phase.
1892  * It returns a negative error to indicate that there was an error. At the end
1893  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1894  * to indicate the number of tags that need to replayed during the replay phase.
1895  */
1896 static int ext4_fc_replay_scan(journal_t *journal,
1897 				struct buffer_head *bh, int off,
1898 				tid_t expected_tid)
1899 {
1900 	struct super_block *sb = journal->j_private;
1901 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1902 	struct ext4_fc_replay_state *state;
1903 	int ret = JBD2_FC_REPLAY_CONTINUE;
1904 	struct ext4_fc_add_range *ext;
1905 	struct ext4_fc_tl *tl;
1906 	struct ext4_fc_tail *tail;
1907 	__u8 *start, *end;
1908 	struct ext4_fc_head *head;
1909 	struct ext4_extent *ex;
1910 
1911 	state = &sbi->s_fc_replay_state;
1912 
1913 	start = (u8 *)bh->b_data;
1914 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1915 
1916 	if (state->fc_replay_expected_off == 0) {
1917 		state->fc_cur_tag = 0;
1918 		state->fc_replay_num_tags = 0;
1919 		state->fc_crc = 0;
1920 		state->fc_regions = NULL;
1921 		state->fc_regions_valid = state->fc_regions_used =
1922 			state->fc_regions_size = 0;
1923 		/* Check if we can stop early */
1924 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1925 			!= EXT4_FC_TAG_HEAD)
1926 			return 0;
1927 	}
1928 
1929 	if (off != state->fc_replay_expected_off) {
1930 		ret = -EFSCORRUPTED;
1931 		goto out_err;
1932 	}
1933 
1934 	state->fc_replay_expected_off++;
1935 	fc_for_each_tl(start, end, tl) {
1936 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1937 			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1938 		switch (le16_to_cpu(tl->fc_tag)) {
1939 		case EXT4_FC_TAG_ADD_RANGE:
1940 			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1941 			ex = (struct ext4_extent *)&ext->fc_ex;
1942 			ret = ext4_fc_record_regions(sb,
1943 				le32_to_cpu(ext->fc_ino),
1944 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1945 				ext4_ext_get_actual_len(ex));
1946 			if (ret < 0)
1947 				break;
1948 			ret = JBD2_FC_REPLAY_CONTINUE;
1949 			fallthrough;
1950 		case EXT4_FC_TAG_DEL_RANGE:
1951 		case EXT4_FC_TAG_LINK:
1952 		case EXT4_FC_TAG_UNLINK:
1953 		case EXT4_FC_TAG_CREAT:
1954 		case EXT4_FC_TAG_INODE:
1955 		case EXT4_FC_TAG_PAD:
1956 			state->fc_cur_tag++;
1957 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1958 					sizeof(*tl) + ext4_fc_tag_len(tl));
1959 			break;
1960 		case EXT4_FC_TAG_TAIL:
1961 			state->fc_cur_tag++;
1962 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1963 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1964 						sizeof(*tl) +
1965 						offsetof(struct ext4_fc_tail,
1966 						fc_crc));
1967 			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1968 				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1969 				state->fc_replay_num_tags = state->fc_cur_tag;
1970 				state->fc_regions_valid =
1971 					state->fc_regions_used;
1972 			} else {
1973 				ret = state->fc_replay_num_tags ?
1974 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1975 			}
1976 			state->fc_crc = 0;
1977 			break;
1978 		case EXT4_FC_TAG_HEAD:
1979 			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1980 			if (le32_to_cpu(head->fc_features) &
1981 				~EXT4_FC_SUPPORTED_FEATURES) {
1982 				ret = -EOPNOTSUPP;
1983 				break;
1984 			}
1985 			if (le32_to_cpu(head->fc_tid) != expected_tid) {
1986 				ret = JBD2_FC_REPLAY_STOP;
1987 				break;
1988 			}
1989 			state->fc_cur_tag++;
1990 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1991 					sizeof(*tl) + ext4_fc_tag_len(tl));
1992 			break;
1993 		default:
1994 			ret = state->fc_replay_num_tags ?
1995 				JBD2_FC_REPLAY_STOP : -ECANCELED;
1996 		}
1997 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1998 			break;
1999 	}
2000 
2001 out_err:
2002 	trace_ext4_fc_replay_scan(sb, ret, off);
2003 	return ret;
2004 }
2005 
2006 /*
2007  * Main recovery path entry point.
2008  * The meaning of return codes is similar as above.
2009  */
2010 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2011 				enum passtype pass, int off, tid_t expected_tid)
2012 {
2013 	struct super_block *sb = journal->j_private;
2014 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2015 	struct ext4_fc_tl *tl;
2016 	__u8 *start, *end;
2017 	int ret = JBD2_FC_REPLAY_CONTINUE;
2018 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2019 	struct ext4_fc_tail *tail;
2020 
2021 	if (pass == PASS_SCAN) {
2022 		state->fc_current_pass = PASS_SCAN;
2023 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2024 	}
2025 
2026 	if (state->fc_current_pass != pass) {
2027 		state->fc_current_pass = pass;
2028 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2029 	}
2030 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2031 		jbd_debug(1, "Replay stops\n");
2032 		ext4_fc_set_bitmaps_and_counters(sb);
2033 		return 0;
2034 	}
2035 
2036 #ifdef CONFIG_EXT4_DEBUG
2037 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2038 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2039 		return JBD2_FC_REPLAY_STOP;
2040 	}
2041 #endif
2042 
2043 	start = (u8 *)bh->b_data;
2044 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2045 
2046 	fc_for_each_tl(start, end, tl) {
2047 		if (state->fc_replay_num_tags == 0) {
2048 			ret = JBD2_FC_REPLAY_STOP;
2049 			ext4_fc_set_bitmaps_and_counters(sb);
2050 			break;
2051 		}
2052 		jbd_debug(3, "Replay phase, tag:%s\n",
2053 				tag2str(le16_to_cpu(tl->fc_tag)));
2054 		state->fc_replay_num_tags--;
2055 		switch (le16_to_cpu(tl->fc_tag)) {
2056 		case EXT4_FC_TAG_LINK:
2057 			ret = ext4_fc_replay_link(sb, tl);
2058 			break;
2059 		case EXT4_FC_TAG_UNLINK:
2060 			ret = ext4_fc_replay_unlink(sb, tl);
2061 			break;
2062 		case EXT4_FC_TAG_ADD_RANGE:
2063 			ret = ext4_fc_replay_add_range(sb, tl);
2064 			break;
2065 		case EXT4_FC_TAG_CREAT:
2066 			ret = ext4_fc_replay_create(sb, tl);
2067 			break;
2068 		case EXT4_FC_TAG_DEL_RANGE:
2069 			ret = ext4_fc_replay_del_range(sb, tl);
2070 			break;
2071 		case EXT4_FC_TAG_INODE:
2072 			ret = ext4_fc_replay_inode(sb, tl);
2073 			break;
2074 		case EXT4_FC_TAG_PAD:
2075 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2076 				ext4_fc_tag_len(tl), 0);
2077 			break;
2078 		case EXT4_FC_TAG_TAIL:
2079 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2080 				ext4_fc_tag_len(tl), 0);
2081 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2082 			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2083 			break;
2084 		case EXT4_FC_TAG_HEAD:
2085 			break;
2086 		default:
2087 			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2088 				ext4_fc_tag_len(tl), 0);
2089 			ret = -ECANCELED;
2090 			break;
2091 		}
2092 		if (ret < 0)
2093 			break;
2094 		ret = JBD2_FC_REPLAY_CONTINUE;
2095 	}
2096 	return ret;
2097 }
2098 
2099 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2100 {
2101 	/*
2102 	 * We set replay callback even if fast commit disabled because we may
2103 	 * could still have fast commit blocks that need to be replayed even if
2104 	 * fast commit has now been turned off.
2105 	 */
2106 	journal->j_fc_replay_callback = ext4_fc_replay;
2107 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2108 		return;
2109 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2110 }
2111 
/*
 * Human readable names for the reasons a fast commit was ineligible, as
 * printed by ext4_fc_info_show(). The table is indexed with the same index
 * as stats->fc_ineligible_reason_count[] (0 .. EXT4_FC_REASON_MAX - 1), so
 * the order of the entries must follow that numbering.
 *
 * Both the strings and the pointer slots are const: the table is never
 * meant to be modified at run time.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2124 
2125 int ext4_fc_info_show(struct seq_file *seq, void *v)
2126 {
2127 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2128 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2129 	int i;
2130 
2131 	if (v != SEQ_START_TOKEN)
2132 		return 0;
2133 
2134 	seq_printf(seq,
2135 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2136 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2137 		   stats->fc_numblks,
2138 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2139 	seq_puts(seq, "Ineligible reasons:\n");
2140 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2141 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2142 			stats->fc_ineligible_reason_count[i]);
2143 
2144 	return 0;
2145 }
2146 
2147 int __init ext4_fc_init_dentry_cache(void)
2148 {
2149 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2150 					   SLAB_RECLAIM_ACCOUNT);
2151 
2152 	if (ext4_fc_dentry_cachep == NULL)
2153 		return -ENOMEM;
2154 
2155 	return 0;
2156 }
2157