// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and is instead derived during
 *				  replay.
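 *
 * As an illustration, every entry in the fast commit area begins with a
 * 4-byte tag-length header (struct ext4_fc_tl, defined in fast_commit.h):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *
 * An EXT4_FC_TAG_ADD_RANGE entry, for example, is this header followed by
 * a struct ext4_fc_add_range as its value.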
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update(), as sketched below.
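 *
 * As a rough sketch, an inode update in the VFS callbacks is bracketed
 * like this:
 *
 *	ext4_fc_start_update(inode);	// blocks while a fast commit is
 *					// committing this inode
 *	// ... perform the inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit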
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least one valid tail present. Every fast
 * commit operation writes one tail, which means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
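 *
 * For reference, the tail's value (struct ext4_fc_tail, written out by
 * ext4_fc_write_tail() below) carries exactly the two fields described
 * above:
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID of the transaction after which this
 *				// fast commit should be applied
 *		__le32 fc_crc;	// CRC of the fast commit contents
 *	};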
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during the
 * replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then try to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>

static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
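
/*
 * For illustration: within one transaction (same i_sync_tid), the first
 * tracking call for an inode runs __fc_track_fn() with update = false and
 * every subsequent call runs it with update = true, so a track function
 * such as __track_range() below can either record a fresh value or merge
 * into the previously tracked one.
 */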

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
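
/*
 * Worked example for the merge above: if the inode already tracks blocks
 * 10..19 (i_fc_lblk_start = 10, i_fc_lblk_len = 10) and a new update
 * touches blocks 5..12, the tracked range becomes start = min(10, 5) = 5
 * and len = max(19, 12) - 5 + 1 = 15, i.e. blocks 5..19.
 */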

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
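
/*
 * Worked example for the padding above, assuming a 4096-byte journal block
 * and the 4-byte struct ext4_fc_tl: with s_fc_bytes at offset 4080, a
 * request for 32 bytes fails the "4096 - 4080 - 1 > 32 + 4" test, so the
 * remaining 4096 - 4080 - 1 - 4 = 11 bytes are covered by an
 * EXT4_FC_TAG_PAD tag and the request is served from a fresh block.
 */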

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing a tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space on the block as well.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	jbd_debug(1, "Fast commit ended with status = %d", status);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status);
}
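
/*
 * Worked example for the 3:1 weighting above: with a running average of
 * 100 time units and a new commit that took 200, the average becomes
 * (200 + 100 * 3) / 4 = 125, so a single outlier only moves the average
 * by a quarter of the difference.
 */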

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * for various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * Weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}
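
/*
 * For reference, the value decoded above is laid out as a struct
 * ext4_fc_dentry_info (fc_parent_ino, fc_ino) immediately followed by the
 * dentry name, which is why dname_len is fc_len minus the size of that
 * struct.
 */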

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks occupied by the inode.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use, as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}
1595 
1596 /* Replay add range tag */
1597 static int ext4_fc_replay_add_range(struct super_block *sb,
1598 				    struct ext4_fc_tl *tl, u8 *val)
1599 {
1600 	struct ext4_fc_add_range fc_add_ex;
1601 	struct ext4_extent newex, *ex;
1602 	struct inode *inode;
1603 	ext4_lblk_t start, cur;
1604 	int remaining, len;
1605 	ext4_fsblk_t start_pblk;
1606 	struct ext4_map_blocks map;
1607 	struct ext4_ext_path *path = NULL;
1608 	int ret;
1609 
1610 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1611 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1612 
1613 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1614 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1615 		ext4_ext_get_actual_len(ex));
1616 
1617 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1618 	if (IS_ERR(inode)) {
1619 		jbd_debug(1, "Inode not found.");
1620 		return 0;
1621 	}
1622 
1623 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1624 
1625 	start = le32_to_cpu(ex->ee_block);
1626 	start_pblk = ext4_ext_pblock(ex);
1627 	len = ext4_ext_get_actual_len(ex);
1628 
1629 	cur = start;
1630 	remaining = len;
1631 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1632 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1633 		  inode->i_ino);
1634 
1635 	while (remaining > 0) {
1636 		map.m_lblk = cur;
1637 		map.m_len = remaining;
1638 		map.m_pblk = 0;
1639 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1640 
1641 		if (ret < 0) {
1642 			iput(inode);
1643 			return 0;
1644 		}
1645 
1646 		if (ret == 0) {
1647 			/* Range is not mapped */
1648 			path = ext4_find_extent(inode, cur, NULL, 0);
1649 			if (IS_ERR(path)) {
1650 				iput(inode);
1651 				return 0;
1652 			}
1653 			memset(&newex, 0, sizeof(newex));
1654 			newex.ee_block = cpu_to_le32(cur);
1655 			ext4_ext_store_pblock(
1656 				&newex, start_pblk + cur - start);
1657 			newex.ee_len = cpu_to_le16(map.m_len);
1658 			if (ext4_ext_is_unwritten(ex))
1659 				ext4_ext_mark_unwritten(&newex);
1660 			down_write(&EXT4_I(inode)->i_data_sem);
1661 			ret = ext4_ext_insert_extent(
1662 				NULL, inode, &path, &newex, 0);
1663 			up_write((&EXT4_I(inode)->i_data_sem));
1664 			ext4_ext_drop_refs(path);
1665 			kfree(path);
1666 			if (ret) {
1667 				iput(inode);
1668 				return 0;
1669 			}
1670 			goto next;
1671 		}
1672 
1673 		if (start_pblk + cur - start != map.m_pblk) {
1674 			/*
1675 			 * Logical to physical mapping changed. This can happen
1676 			 * if this range was removed and then reallocated to
1677 			 * map to new physical blocks during a fast commit.
1678 			 */
1679 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1680 					ext4_ext_is_unwritten(ex),
1681 					start_pblk + cur - start);
1682 			if (ret) {
1683 				iput(inode);
1684 				return 0;
1685 			}
1686 			/*
1687 			 * Mark the old blocks as free since they aren't used
1688 			 * anymore. We maintain an array of all the modified
1689 			 * inodes. In case these blocks are still used at either
1690 			 * a different logical range in the same inode or in
1691 			 * some different inode, we will mark them as allocated
1692 			 * at the end of the FC replay using our array of
1693 			 * modified inodes.
1694 			 */
1695 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1696 			goto next;
1697 		}
1698 
1699 		/* Range is mapped and needs a state change */
1700 		jbd_debug(1, "Converting from %ld to %d %lld",
1701 				map.m_flags & EXT4_MAP_UNWRITTEN,
1702 			ext4_ext_is_unwritten(ex), map.m_pblk);
1703 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1704 					ext4_ext_is_unwritten(ex), map.m_pblk);
1705 		if (ret) {
1706 			iput(inode);
1707 			return 0;
1708 		}
1709 		/*
1710 		 * We may have split the extent tree while toggling the state.
1711 		 * Try to shrink the extent tree now.
1712 		 */
1713 		ext4_ext_replay_shrink_inode(inode, start + len);
1714 next:
1715 		cur += map.m_len;
1716 		remaining -= map.m_len;
1717 	}
1718 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1719 					sb->s_blocksize_bits);
1720 	iput(inode);
1721 	return 0;
1722 }
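
/*
 * Illustrative sketch (not part of this file): the layout of the
 * ADD_RANGE payload that ext4_fc_replay_add_range() consumes. The struct
 * names below are local stand-ins for the ext4_fc_add_range and
 * ext4_extent definitions, and a little-endian host is assumed so the
 * le*_to_cpu() conversions are omitted.
 */
#if 0	/* standalone user-space sketch, not built with the kernel */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fc_extent {			/* 12-byte on-disk extent */
	uint32_t ee_block;		/* first logical block */
	uint16_t ee_len;		/* > 32768 flags an unwritten extent */
	uint16_t ee_start_hi;		/* high 16 bits of physical block */
	uint32_t ee_start_lo;		/* low 32 bits of physical block */
};

struct fc_add_range {			/* TLV value for EXT4_FC_TAG_ADD_RANGE */
	uint32_t fc_ino;
	struct fc_extent fc_ex;
};

static void decode_add_range(const uint8_t *val)
{
	struct fc_add_range ar;
	uint64_t pblk;
	uint16_t len;
	int unwritten;

	memcpy(&ar, val, sizeof(ar));	/* val may be unaligned, hence memcpy */
	pblk = ((uint64_t)ar.fc_ex.ee_start_hi << 32) | ar.fc_ex.ee_start_lo;
	unwritten = ar.fc_ex.ee_len > 32768;
	len = unwritten ? ar.fc_ex.ee_len - 32768 : ar.fc_ex.ee_len;
	printf("ino %u: lblk %u -> pblk %llu, len %u%s\n",
	       ar.fc_ino, ar.fc_ex.ee_block, (unsigned long long)pblk, len,
	       unwritten ? " (unwritten)" : "");
}
#endif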
1723 
1724 /* Replay DEL_RANGE tag */
1725 static int
1726 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1727 			 u8 *val)
1728 {
1729 	struct inode *inode;
1730 	struct ext4_fc_del_range lrange;
1731 	struct ext4_map_blocks map;
1732 	ext4_lblk_t cur, remaining;
1733 	int ret;
1734 
1735 	memcpy(&lrange, val, sizeof(lrange));
1736 	cur = le32_to_cpu(lrange.fc_lblk);
1737 	remaining = le32_to_cpu(lrange.fc_len);
1738 
1739 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1740 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1741 
1742 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1743 	if (IS_ERR(inode)) {
1744 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1745 		return 0;
1746 	}
1747 
1748 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret) {
		iput(inode);
		return 0;
	}
1749 
1750 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1751 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1752 			le32_to_cpu(lrange.fc_len));
1753 	while (remaining > 0) {
1754 		map.m_lblk = cur;
1755 		map.m_len = remaining;
1756 
1757 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1758 		if (ret < 0) {
1759 			iput(inode);
1760 			return 0;
1761 		}
1762 		if (ret > 0) {
1763 			remaining -= ret;
1764 			cur += ret;
1765 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1766 		} else {
1767 			remaining -= map.m_len;
1768 			cur += map.m_len;
1769 		}
1770 	}
1771 
1772 	down_write(&EXT4_I(inode)->i_data_sem);
1773 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1774 				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
1775 	up_write(&EXT4_I(inode)->i_data_sem);
1776 	if (ret) {
1777 		iput(inode);
1778 		return 0;
1779 	}
1780 	ext4_ext_replay_shrink_inode(inode,
1781 		i_size_read(inode) >> sb->s_blocksize_bits);
1782 	ext4_mark_inode_dirty(NULL, inode);
1783 	iput(inode);
1784 
1785 	return 0;
1786 }
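
/*
 * Minimal user-space model of the cur/remaining walk used by both replay
 * helpers above. mock_map() is a hypothetical stand-in for
 * ext4_map_blocks(): it returns the length of the mapped chunk at lblk
 * (> 0), or 0 for a hole while still reporting the hole's length.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdint.h>
#include <stdio.h>

static int mock_map(uint32_t lblk, uint32_t max, uint32_t *chunk)
{
	/* Pretend blocks 0..9 are mapped, 10..19 are a hole, 20.. mapped. */
	if (lblk < 10) {
		*chunk = (10 - lblk < max) ? 10 - lblk : max;
		return *chunk;
	}
	if (lblk < 20) {
		*chunk = (20 - lblk < max) ? 20 - lblk : max;
		return 0;
	}
	*chunk = max;
	return *chunk;
}

int main(void)
{
	uint32_t cur = 5, remaining = 25, chunk;

	while (remaining > 0) {
		int ret = mock_map(cur, remaining, &chunk);

		printf("lblk %u..%u: %s\n", cur, cur + chunk - 1,
		       ret > 0 ? "mapped -> free the blocks" : "hole -> skip");
		cur += chunk;
		remaining -= chunk;
	}
	return 0;
}
#endif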
1787 
1788 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1789 {
1790 	struct ext4_fc_replay_state *state;
1791 	struct inode *inode;
1792 	struct ext4_ext_path *path = NULL;
1793 	struct ext4_map_blocks map;
1794 	int i, ret, j;
1795 	ext4_lblk_t cur, end;
1796 
1797 	state = &EXT4_SB(sb)->s_fc_replay_state;
1798 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1799 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1800 			EXT4_IGET_NORMAL);
1801 		if (IS_ERR(inode)) {
1802 			jbd_debug(1, "Inode %d not found.",
1803 				state->fc_modified_inodes[i]);
1804 			continue;
1805 		}
1806 		cur = 0;
1807 		end = EXT_MAX_BLOCKS;
1808 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1809 			iput(inode);
1810 			continue;
1811 		}
1812 		while (cur < end) {
1813 			map.m_lblk = cur;
1814 			map.m_len = end - cur;
1815 
1816 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1817 			if (ret < 0)
1818 				break;
1819 
1820 			if (ret > 0) {
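				/*
				 * The range is mapped: mark the extent tree
				 * blocks on the lookup path and then the data
				 * blocks themselves as in use in the bitmaps.
				 */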
1821 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1822 				if (!IS_ERR(path)) {
1823 					for (j = 0; j < path->p_depth; j++)
1824 						ext4_mb_mark_bb(inode->i_sb,
1825 							path[j].p_block, 1, 1);
1826 					ext4_ext_drop_refs(path);
1827 					kfree(path);
1828 				}
1829 				cur += ret;
1830 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1831 							map.m_len, 1);
1832 			} else {
1833 				cur = cur + (map.m_len ? map.m_len : 1);
1834 			}
1835 		}
1836 		iput(inode);
1837 	}
1838 }
1839 
1840 /*
1841  * Check if a block is in the excluded regions for block allocation. The
1842  * simple allocator that runs during the replay phase calls this function
1843  * to see if it is okay to use a block.
1844  */
1845 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1846 {
1847 	int i;
1848 	struct ext4_fc_replay_state *state;
1849 
1850 	state = &EXT4_SB(sb)->s_fc_replay_state;
1851 	for (i = 0; i < state->fc_regions_valid; i++) {
1852 		if (state->fc_regions[i].ino == 0 ||
1853 			state->fc_regions[i].len == 0)
1854 			continue;
1855 		if (blk >= state->fc_regions[i].pblk &&
1856 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1857 			return true;
1858 	}
1859 	return false;
1860 }
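
/*
 * User-space model of the check above, with the region fields (ino, pblk,
 * len) mirrored from how ext4_fc_replay_check_excluded() reads them. A
 * slot with ino == 0 or len == 0 is unused, and the interval is half-open:
 * [pblk, pblk + len).
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdbool.h>
#include <stdint.h>

struct region { uint32_t ino; uint64_t pblk; uint32_t len; };

static bool blk_excluded(const struct region *r, int nr, uint64_t blk)
{
	for (int i = 0; i < nr; i++) {
		if (r[i].ino == 0 || r[i].len == 0)
			continue;		/* unused slot */
		if (blk >= r[i].pblk && blk < r[i].pblk + r[i].len)
			return true;
	}
	return false;
}

int main(void)
{
	struct region r[2] = {
		{ .ino = 12, .pblk = 1000, .len = 8 },
		{ .ino = 13, .pblk = 2048, .len = 1 },
	};

	/* 1007 is inside the first region, 1008 is just past its end. */
	return blk_excluded(r, 2, 1007) && !blk_excluded(r, 2, 1008) ? 0 : 1;
}
#endif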
1861 
1862 /* Cleanup function called after replay */
1863 void ext4_fc_replay_cleanup(struct super_block *sb)
1864 {
1865 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1866 
1867 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1868 	kfree(sbi->s_fc_replay_state.fc_regions);
1869 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1870 }
1871 
1872 /*
1873  * Recovery Scan phase handler
1874  *
1875  * This function is called during the scan phase and is responsible
1876  * for doing the following things:
1877  * - Make sure the fast commit area has valid tags for replay
1878  * - Count number of tags that need to be replayed by the replay handler
1879  * - Verify CRC
1880  * - Create a list of excluded blocks for allocation during replay phase
1881  *
1882  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1883  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1884  * to indicate that scan has finished and JBD2 can now start replay phase.
1885  * It returns a negative error to indicate that there was an error. At the end
1886  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1887  * to indicate the number of tags that need to be replayed during the replay phase.
1888  */
1889 static int ext4_fc_replay_scan(journal_t *journal,
1890 				struct buffer_head *bh, int off,
1891 				tid_t expected_tid)
1892 {
1893 	struct super_block *sb = journal->j_private;
1894 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1895 	struct ext4_fc_replay_state *state;
1896 	int ret = JBD2_FC_REPLAY_CONTINUE;
1897 	struct ext4_fc_add_range ext;
1898 	struct ext4_fc_tl tl;
1899 	struct ext4_fc_tail tail;
1900 	__u8 *start, *end, *cur, *val;
1901 	struct ext4_fc_head head;
1902 	struct ext4_extent *ex;
1903 
1904 	state = &sbi->s_fc_replay_state;
1905 
1906 	start = (u8 *)bh->b_data;
1907 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1908 
1909 	if (state->fc_replay_expected_off == 0) {
1910 		state->fc_cur_tag = 0;
1911 		state->fc_replay_num_tags = 0;
1912 		state->fc_crc = 0;
1913 		state->fc_regions = NULL;
1914 		state->fc_regions_valid = state->fc_regions_used =
1915 			state->fc_regions_size = 0;
1916 		/* Check if we can stop early */
1917 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1918 			!= EXT4_FC_TAG_HEAD)
1919 			return 0;
1920 	}
1921 
1922 	if (off != state->fc_replay_expected_off) {
1923 		ret = -EFSCORRUPTED;
1924 		goto out_err;
1925 	}
1926 
1927 	state->fc_replay_expected_off++;
1928 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1929 		memcpy(&tl, cur, sizeof(tl));
1930 		val = cur + sizeof(tl);
1931 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1932 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1933 		switch (le16_to_cpu(tl.fc_tag)) {
1934 		case EXT4_FC_TAG_ADD_RANGE:
1935 			memcpy(&ext, val, sizeof(ext));
1936 			ex = (struct ext4_extent *)&ext.fc_ex;
1937 			ret = ext4_fc_record_regions(sb,
1938 				le32_to_cpu(ext.fc_ino),
1939 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1940 				ext4_ext_get_actual_len(ex));
1941 			if (ret < 0)
1942 				break;
1943 			ret = JBD2_FC_REPLAY_CONTINUE;
1944 			fallthrough;
1945 		case EXT4_FC_TAG_DEL_RANGE:
1946 		case EXT4_FC_TAG_LINK:
1947 		case EXT4_FC_TAG_UNLINK:
1948 		case EXT4_FC_TAG_CREAT:
1949 		case EXT4_FC_TAG_INODE:
1950 		case EXT4_FC_TAG_PAD:
1951 			state->fc_cur_tag++;
1952 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1953 					sizeof(tl) + le16_to_cpu(tl.fc_len));
1954 			break;
1955 		case EXT4_FC_TAG_TAIL:
1956 			state->fc_cur_tag++;
1957 			memcpy(&tail, val, sizeof(tail));
1958 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1959 						sizeof(tl) +
1960 						offsetof(struct ext4_fc_tail,
1961 						fc_crc));
1962 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1963 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1964 				state->fc_replay_num_tags = state->fc_cur_tag;
1965 				state->fc_regions_valid =
1966 					state->fc_regions_used;
1967 			} else {
1968 				ret = state->fc_replay_num_tags ?
1969 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1970 			}
1971 			state->fc_crc = 0;
1972 			break;
1973 		case EXT4_FC_TAG_HEAD:
1974 			memcpy(&head, val, sizeof(head));
1975 			if (le32_to_cpu(head.fc_features) &
1976 				~EXT4_FC_SUPPORTED_FEATURES) {
1977 				ret = -EOPNOTSUPP;
1978 				break;
1979 			}
1980 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
1981 				ret = JBD2_FC_REPLAY_STOP;
1982 				break;
1983 			}
1984 			state->fc_cur_tag++;
1985 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1986 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
1987 			break;
1988 		default:
1989 			ret = state->fc_replay_num_tags ?
1990 				JBD2_FC_REPLAY_STOP : -ECANCELED;
1991 		}
1992 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1993 			break;
1994 	}
1995 
1996 out_err:
1997 	trace_ext4_fc_replay_scan(sb, ret, off);
1998 	return ret;
1999 }
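
/*
 * Sketch of the tail validation performed in the scan loop above, to make
 * the offsetof() explicit: the CRC stored in the tail cannot cover itself,
 * so only the bytes up to fc_crc are checksummed. crc_step() is a toy
 * stand-in for ext4_chksum()/crc32c, and a little-endian host is assumed.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct fc_tl { uint16_t fc_tag; uint16_t fc_len; };
struct fc_tail { uint32_t fc_tid; uint32_t fc_crc; };

/* Toy incremental checksum standing in for ext4_chksum()/crc32c. */
static uint32_t crc_step(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--)
		crc = (crc << 1) ^ *p++;
	return crc;
}

static int tail_valid(uint32_t crc, const uint8_t *cur, uint32_t expected_tid)
{
	struct fc_tail tail;

	memcpy(&tail, cur + sizeof(struct fc_tl), sizeof(tail));
	/*
	 * Checksum the tag/length header plus the tail up to, but not
	 * including, fc_crc -- exactly what the offsetof() in the scan
	 * loop arranges.
	 */
	crc = crc_step(crc, cur,
		       sizeof(struct fc_tl) + offsetof(struct fc_tail, fc_crc));
	return tail.fc_tid == expected_tid && tail.fc_crc == crc;
}
#endif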
2000 
2001 /*
2002  * Main recovery path entry point.
2003  * The return codes have the same meaning as for ext4_fc_replay_scan() above.
2004  */
2005 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2006 				enum passtype pass, int off, tid_t expected_tid)
2007 {
2008 	struct super_block *sb = journal->j_private;
2009 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2010 	struct ext4_fc_tl tl;
2011 	__u8 *start, *end, *cur, *val;
2012 	int ret = JBD2_FC_REPLAY_CONTINUE;
2013 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2014 	struct ext4_fc_tail tail;
2015 
2016 	if (pass == PASS_SCAN) {
2017 		state->fc_current_pass = PASS_SCAN;
2018 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2019 	}
2020 
2021 	if (state->fc_current_pass != pass) {
2022 		state->fc_current_pass = pass;
2023 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2024 	}
2025 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2026 		jbd_debug(1, "Replay stops\n");
2027 		ext4_fc_set_bitmaps_and_counters(sb);
2028 		return 0;
2029 	}
2030 
2031 #ifdef CONFIG_EXT4_DEBUG
2032 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2033 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2034 		return JBD2_FC_REPLAY_STOP;
2035 	}
2036 #endif
2037 
2038 	start = (u8 *)bh->b_data;
2039 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2040 
2041 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2042 		memcpy(&tl, cur, sizeof(tl));
2043 		val = cur + sizeof(tl);
2044 
2045 		if (state->fc_replay_num_tags == 0) {
2046 			ret = JBD2_FC_REPLAY_STOP;
2047 			ext4_fc_set_bitmaps_and_counters(sb);
2048 			break;
2049 		}
2050 		jbd_debug(3, "Replay phase, tag:%s\n",
2051 				tag2str(le16_to_cpu(tl.fc_tag)));
2052 		state->fc_replay_num_tags--;
2053 		switch (le16_to_cpu(tl.fc_tag)) {
2054 		case EXT4_FC_TAG_LINK:
2055 			ret = ext4_fc_replay_link(sb, &tl, val);
2056 			break;
2057 		case EXT4_FC_TAG_UNLINK:
2058 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2059 			break;
2060 		case EXT4_FC_TAG_ADD_RANGE:
2061 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2062 			break;
2063 		case EXT4_FC_TAG_CREAT:
2064 			ret = ext4_fc_replay_create(sb, &tl, val);
2065 			break;
2066 		case EXT4_FC_TAG_DEL_RANGE:
2067 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2068 			break;
2069 		case EXT4_FC_TAG_INODE:
2070 			ret = ext4_fc_replay_inode(sb, &tl, val);
2071 			break;
2072 		case EXT4_FC_TAG_PAD:
2073 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2074 					     le16_to_cpu(tl.fc_len), 0);
2075 			break;
2076 		case EXT4_FC_TAG_TAIL:
2077 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2078 					     le16_to_cpu(tl.fc_len), 0);
2079 			memcpy(&tail, val, sizeof(tail));
2080 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2081 			break;
2082 		case EXT4_FC_TAG_HEAD:
2083 			break;
2084 		default:
2085 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2086 					     le16_to_cpu(tl.fc_len), 0);
2087 			ret = -ECANCELED;
2088 			break;
2089 		}
2090 		if (ret < 0)
2091 			break;
2092 		ret = JBD2_FC_REPLAY_CONTINUE;
2093 	}
2094 	return ret;
2095 }
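
/*
 * Simplified model of how jbd2 recovery drives this callback: one SCAN
 * pass over every fast commit block, then a REPLAY pass. The callback
 * signature and the CONTINUE value are local stand-ins for the jbd2
 * definitions, shown only to illustrate the two-pass protocol.
 */
#if 0	/* standalone sketch, not built with the kernel */
enum pass { SCAN, REPLAY };
#define FC_CONTINUE	1	/* stand-in for JBD2_FC_REPLAY_CONTINUE */

static void recover(int nblocks, void **bhs, unsigned int expected_tid,
		    int (*cb)(void *bh, enum pass p, int off, unsigned int tid))
{
	int off;

	/* Pass 1: validate TLVs and CRCs, count the replayable tags. */
	for (off = 0; off < nblocks; off++)
		if (cb(bhs[off], SCAN, off, expected_tid) != FC_CONTINUE)
			break;

	/* Pass 2: apply the tags counted during the scan, block by block. */
	for (off = 0; off < nblocks; off++)
		if (cb(bhs[off], REPLAY, off, expected_tid) != FC_CONTINUE)
			break;
}
#endif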
2096 
2097 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2098 {
2099 	/*
2100 	 * We set the replay callback even if fast commit is disabled because
2101 	 * we could still have fast commit blocks that need to be replayed even
2102 	 * if fast commit has now been turned off.
2103 	 */
2104 	journal->j_fc_replay_callback = ext4_fc_replay;
2105 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2106 		return;
2107 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2108 }
2109 
2110 static const char *fc_ineligible_reasons[] = {
2111 	"Extended attributes changed",
2112 	"Cross rename",
2113 	"Journal flag changed",
2114 	"Insufficient memory",
2115 	"Swap boot",
2116 	"Resize",
2117 	"Dir renamed",
2118 	"Falloc range op",
2119 	"Data journalling",
2120 	"FC Commit Failed"
2121 };
2122 
2123 int ext4_fc_info_show(struct seq_file *seq, void *v)
2124 {
2125 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2126 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2127 	int i;
2128 
2129 	if (v != SEQ_START_TOKEN)
2130 		return 0;
2131 
2132 	seq_printf(seq,
2133 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2134 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2135 		   stats->fc_numblks,
2136 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2137 	seq_puts(seq, "Ineligible reasons:\n");
2138 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2139 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2140 			stats->fc_ineligible_reason_count[i]);
2141 
2142 	return 0;
2143 }
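
/*
 * Example of the resulting output, following the format strings above
 * (values illustrative):
 *
 *	fc stats:
 *	120 commits
 *	4 ineligible
 *	960 numblks
 *	1250us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	"Cross rename":	0
 *	...
 */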
2144 
2145 int __init ext4_fc_init_dentry_cache(void)
2146 {
2147 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2148 					   SLAB_RECLAIM_ACCOUNT);
2149 
2150 	if (ext4_fc_dentry_cachep == NULL)
2151 		return -ENOMEM;
2152 
2153 	return 0;
2154 }
2155 
2156 void ext4_fc_destroy_dentry_cache(void)
2157 {
2158 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2159 }
2160