1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 /* 16 * Ext4 Fast Commits 17 * ----------------- 18 * 19 * Ext4 fast commits implement fine grained journalling for Ext4. 20 * 21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 23 * TLV during the recovery phase. For the scenarios for which we currently 24 * don't have replay code, fast commit falls back to full commits. 25 * Fast commits record delta in one of the following three categories. 26 * 27 * (A) Directory entry updates: 28 * 29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 30 * - EXT4_FC_TAG_LINK - records directory entry link 31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 32 * 33 * (B) File specific data range updates: 34 * 35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 37 * 38 * (C) Inode metadata (mtime / ctime etc): 39 * 40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 41 * during recovery. Note that iblocks field is 42 * not replayed and instead derived during 43 * replay. 44 * Commit Operation 45 * ---------------- 46 * With fast commits, we maintain all the directory entry operations in the 47 * order in which they are issued in an in-memory queue. This queue is flushed 48 * to disk during the commit operation. We also maintain a list of inodes 49 * that need to be committed during a fast commit in another in memory queue of 50 * inodes. During the commit operation, we commit in the following order: 51 * 52 * [1] Lock inodes for any further data updates by setting COMMITTING state 53 * [2] Submit data buffers of all the inodes 54 * [3] Wait for [2] to complete 55 * [4] Commit all the directory entry updates in the fast commit space 56 * [5] Commit all the changed inode structures 57 * [6] Write tail tag (this tag ensures the atomicity, please read the following 58 * section for more details). 59 * [7] Wait for [4], [5] and [6] to complete. 60 * 61 * All the inode updates must call ext4_fc_start_update() before starting an 62 * update. If such an ongoing update is present, fast commit waits for it to 63 * complete. The completion of such an update is marked by 64 * ext4_fc_stop_update(). 65 * 66 * Fast Commit Ineligibility 67 * ------------------------- 68 * 69 * Not all operations are supported by fast commits today (e.g extended 70 * attributes). Fast commit ineligibility is marked by calling 71 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back 72 * to full commit. 73 * 74 * Atomicity of commits 75 * -------------------- 76 * In order to guarantee atomicity during the commit operation, fast commit 77 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail 78 * tag contains CRC of the contents and TID of the transaction after which 79 * this fast commit should be applied. Recovery code replays fast commit 80 * logs only if there's at least 1 valid tail present. For every fast commit 81 * operation, there is 1 tail. This means, we may end up with multiple tails 82 * in the fast commit space. Here's an example: 83 * 84 * - Create a new file A and remove existing file B 85 * - fsync() 86 * - Append contents to file A 87 * - Truncate file A 88 * - fsync() 89 * 90 * The fast commit space at the end of above operations would look like this: 91 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] 92 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| 93 * 94 * Replay code should thus check for all the valid tails in the FC area. 95 * 96 * Fast Commit Replay Idempotence 97 * ------------------------------ 98 * 99 * Fast commits tags are idempotent in nature provided the recovery code follows 100 * certain rules. The guiding principle that the commit path follows while 101 * committing is that it stores the result of a particular operation instead of 102 * storing the procedure. 103 * 104 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 105 * was associated with inode 10. During fast commit, instead of storing this 106 * operation as a procedure "rename a to b", we store the resulting file system 107 * state as a "series" of outcomes: 108 * 109 * - Link dirent b to inode 10 110 * - Unlink dirent a 111 * - Inode <10> with valid refcount 112 * 113 * Now when recovery code runs, it needs "enforce" this state on the file 114 * system. This is what guarantees idempotence of fast commit replay. 115 * 116 * Let's take an example of a procedure that is not idempotent and see how fast 117 * commits make it idempotent. Consider following sequence of operations: 118 * 119 * rm A; mv B A; read A 120 * (x) (y) (z) 121 * 122 * (x), (y) and (z) are the points at which we can crash. If we store this 123 * sequence of operations as is then the replay is not idempotent. Let's say 124 * while in replay, we crash at (z). During the second replay, file A (which was 125 * actually created as a result of "mv B A" operation) would get deleted. Thus, 126 * file named A would be absent when we try to read A. So, this sequence of 127 * operations is not idempotent. However, as mentioned above, instead of storing 128 * the procedure fast commits store the outcome of each procedure. Thus the fast 129 * commit log for above procedure would be as follows: 130 * 131 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 132 * inode 11 before the replay) 133 * 134 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 135 * (w) (x) (y) (z) 136 * 137 * If we crash at (z), we will have file A linked to inode 11. During the second 138 * replay, we will remove file A (inode 11). But we will create it back and make 139 * it point to inode 11. We won't find B, so we'll just skip that step. At this 140 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 141 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 142 * similarly. Thus, by converting a non-idempotent procedure into a series of 143 * idempotent outcomes, fast commits ensured idempotence during the replay. 144 * 145 * TODOs 146 * ----- 147 * 148 * 0) Fast commit replay path hardening: Fast commit replay code should use 149 * journal handles to make sure all the updates it does during the replay 150 * path are atomic. With that if we crash during fast commit replay, after 151 * trying to do recovery again, we will find a file system where fast commit 152 * area is invalid (because new full commit would be found). In order to deal 153 * with that, fast commit replay code should ensure that the "FC_REPLAY" 154 * superblock state is persisted before starting the replay, so that after 155 * the crash, fast commit recovery code can look at that flag and perform 156 * fast commit recovery even if that area is invalidated by later full 157 * commits. 158 * 159 * 1) Fast commit's commit path locks the entire file system during fast 160 * commit. This has significant performance penalty. Instead of that, we 161 * should use ext4_fc_start/stop_update functions to start inode level 162 * updates from ext4_journal_start/stop. Once we do that we can drop file 163 * system locking during commit path. 164 * 165 * 2) Handle more ineligible cases. 166 */ 167 168 #include <trace/events/ext4.h> 169 static struct kmem_cache *ext4_fc_dentry_cachep; 170 171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 172 { 173 BUFFER_TRACE(bh, ""); 174 if (uptodate) { 175 ext4_debug("%s: Block %lld up-to-date", 176 __func__, bh->b_blocknr); 177 set_buffer_uptodate(bh); 178 } else { 179 ext4_debug("%s: Block %lld not up-to-date", 180 __func__, bh->b_blocknr); 181 clear_buffer_uptodate(bh); 182 } 183 184 unlock_buffer(bh); 185 } 186 187 static inline void ext4_fc_reset_inode(struct inode *inode) 188 { 189 struct ext4_inode_info *ei = EXT4_I(inode); 190 191 ei->i_fc_lblk_start = 0; 192 ei->i_fc_lblk_len = 0; 193 } 194 195 void ext4_fc_init_inode(struct inode *inode) 196 { 197 struct ext4_inode_info *ei = EXT4_I(inode); 198 199 ext4_fc_reset_inode(inode); 200 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 201 INIT_LIST_HEAD(&ei->i_fc_list); 202 INIT_LIST_HEAD(&ei->i_fc_dilist); 203 init_waitqueue_head(&ei->i_fc_wait); 204 atomic_set(&ei->i_fc_updates, 0); 205 } 206 207 /* This function must be called with sbi->s_fc_lock held. */ 208 static void ext4_fc_wait_committing_inode(struct inode *inode) 209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) 210 { 211 wait_queue_head_t *wq; 212 struct ext4_inode_info *ei = EXT4_I(inode); 213 214 #if (BITS_PER_LONG < 64) 215 DEFINE_WAIT_BIT(wait, &ei->i_state_flags, 216 EXT4_STATE_FC_COMMITTING); 217 wq = bit_waitqueue(&ei->i_state_flags, 218 EXT4_STATE_FC_COMMITTING); 219 #else 220 DEFINE_WAIT_BIT(wait, &ei->i_flags, 221 EXT4_STATE_FC_COMMITTING); 222 wq = bit_waitqueue(&ei->i_flags, 223 EXT4_STATE_FC_COMMITTING); 224 #endif 225 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); 226 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 227 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 228 schedule(); 229 finish_wait(wq, &wait.wq_entry); 230 } 231 232 /* 233 * Inform Ext4's fast about start of an inode update 234 * 235 * This function is called by the high level call VFS callbacks before 236 * performing any inode update. This function blocks if there's an ongoing 237 * fast commit on the inode in question. 238 */ 239 void ext4_fc_start_update(struct inode *inode) 240 { 241 struct ext4_inode_info *ei = EXT4_I(inode); 242 243 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 244 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 245 return; 246 247 restart: 248 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 249 if (list_empty(&ei->i_fc_list)) 250 goto out; 251 252 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 253 ext4_fc_wait_committing_inode(inode); 254 goto restart; 255 } 256 out: 257 atomic_inc(&ei->i_fc_updates); 258 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 259 } 260 261 /* 262 * Stop inode update and wake up waiting fast commits if any. 263 */ 264 void ext4_fc_stop_update(struct inode *inode) 265 { 266 struct ext4_inode_info *ei = EXT4_I(inode); 267 268 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 269 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 270 return; 271 272 if (atomic_dec_and_test(&ei->i_fc_updates)) 273 wake_up_all(&ei->i_fc_wait); 274 } 275 276 /* 277 * Remove inode from fast commit list. If the inode is being committed 278 * we wait until inode commit is done. 279 */ 280 void ext4_fc_del(struct inode *inode) 281 { 282 struct ext4_inode_info *ei = EXT4_I(inode); 283 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 284 struct ext4_fc_dentry_update *fc_dentry; 285 286 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 287 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 288 return; 289 290 restart: 291 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 292 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { 293 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 294 return; 295 } 296 297 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 298 ext4_fc_wait_committing_inode(inode); 299 goto restart; 300 } 301 302 if (!list_empty(&ei->i_fc_list)) 303 list_del_init(&ei->i_fc_list); 304 305 /* 306 * Since this inode is getting removed, let's also remove all FC 307 * dentry create references, since it is not needed to log it anyways. 308 */ 309 if (list_empty(&ei->i_fc_dilist)) { 310 spin_unlock(&sbi->s_fc_lock); 311 return; 312 } 313 314 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); 315 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); 316 list_del_init(&fc_dentry->fcd_list); 317 list_del_init(&fc_dentry->fcd_dilist); 318 319 WARN_ON(!list_empty(&ei->i_fc_dilist)); 320 spin_unlock(&sbi->s_fc_lock); 321 322 if (fc_dentry->fcd_name.name && 323 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 324 kfree(fc_dentry->fcd_name.name); 325 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 326 327 return; 328 } 329 330 /* 331 * Mark file system as fast commit ineligible, and record latest 332 * ineligible transaction tid. This means until the recorded 333 * transaction, commit operation would result in a full jbd2 commit. 334 */ 335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) 336 { 337 struct ext4_sb_info *sbi = EXT4_SB(sb); 338 tid_t tid; 339 340 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 341 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 342 return; 343 344 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 345 if (handle && !IS_ERR(handle)) 346 tid = handle->h_transaction->t_tid; 347 else { 348 read_lock(&sbi->s_journal->j_state_lock); 349 tid = sbi->s_journal->j_running_transaction ? 350 sbi->s_journal->j_running_transaction->t_tid : 0; 351 read_unlock(&sbi->s_journal->j_state_lock); 352 } 353 spin_lock(&sbi->s_fc_lock); 354 if (sbi->s_fc_ineligible_tid < tid) 355 sbi->s_fc_ineligible_tid = tid; 356 spin_unlock(&sbi->s_fc_lock); 357 WARN_ON(reason >= EXT4_FC_REASON_MAX); 358 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 359 } 360 361 /* 362 * Generic fast commit tracking function. If this is the first time this we are 363 * called after a full commit, we initialize fast commit fields and then call 364 * __fc_track_fn() with update = 0. If we have already been called after a full 365 * commit, we pass update = 1. Based on that, the track function can determine 366 * if it needs to track a field for the first time or if it needs to just 367 * update the previously tracked value. 368 * 369 * If enqueue is set, this function enqueues the inode in fast commit list. 370 */ 371 static int ext4_fc_track_template( 372 handle_t *handle, struct inode *inode, 373 int (*__fc_track_fn)(struct inode *, void *, bool), 374 void *args, int enqueue) 375 { 376 bool update = false; 377 struct ext4_inode_info *ei = EXT4_I(inode); 378 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 379 tid_t tid = 0; 380 int ret; 381 382 tid = handle->h_transaction->t_tid; 383 mutex_lock(&ei->i_fc_lock); 384 if (tid == ei->i_sync_tid) { 385 update = true; 386 } else { 387 ext4_fc_reset_inode(inode); 388 ei->i_sync_tid = tid; 389 } 390 ret = __fc_track_fn(inode, args, update); 391 mutex_unlock(&ei->i_fc_lock); 392 393 if (!enqueue) 394 return ret; 395 396 spin_lock(&sbi->s_fc_lock); 397 if (list_empty(&EXT4_I(inode)->i_fc_list)) 398 list_add_tail(&EXT4_I(inode)->i_fc_list, 399 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 400 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 401 &sbi->s_fc_q[FC_Q_STAGING] : 402 &sbi->s_fc_q[FC_Q_MAIN]); 403 spin_unlock(&sbi->s_fc_lock); 404 405 return ret; 406 } 407 408 struct __track_dentry_update_args { 409 struct dentry *dentry; 410 int op; 411 }; 412 413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ 414 static int __track_dentry_update(struct inode *inode, void *arg, bool update) 415 { 416 struct ext4_fc_dentry_update *node; 417 struct ext4_inode_info *ei = EXT4_I(inode); 418 struct __track_dentry_update_args *dentry_update = 419 (struct __track_dentry_update_args *)arg; 420 struct dentry *dentry = dentry_update->dentry; 421 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 422 423 mutex_unlock(&ei->i_fc_lock); 424 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 425 if (!node) { 426 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); 427 mutex_lock(&ei->i_fc_lock); 428 return -ENOMEM; 429 } 430 431 node->fcd_op = dentry_update->op; 432 node->fcd_parent = dentry->d_parent->d_inode->i_ino; 433 node->fcd_ino = inode->i_ino; 434 if (dentry->d_name.len > DNAME_INLINE_LEN) { 435 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 436 if (!node->fcd_name.name) { 437 kmem_cache_free(ext4_fc_dentry_cachep, node); 438 ext4_fc_mark_ineligible(inode->i_sb, 439 EXT4_FC_REASON_NOMEM, NULL); 440 mutex_lock(&ei->i_fc_lock); 441 return -ENOMEM; 442 } 443 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 444 dentry->d_name.len); 445 } else { 446 memcpy(node->fcd_iname, dentry->d_name.name, 447 dentry->d_name.len); 448 node->fcd_name.name = node->fcd_iname; 449 } 450 node->fcd_name.len = dentry->d_name.len; 451 INIT_LIST_HEAD(&node->fcd_dilist); 452 spin_lock(&sbi->s_fc_lock); 453 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 454 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 455 list_add_tail(&node->fcd_list, 456 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 457 else 458 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 459 460 /* 461 * This helps us keep a track of all fc_dentry updates which is part of 462 * this ext4 inode. So in case the inode is getting unlinked, before 463 * even we get a chance to fsync, we could remove all fc_dentry 464 * references while evicting the inode in ext4_fc_del(). 465 * Also with this, we don't need to loop over all the inodes in 466 * sbi->s_fc_q to get the corresponding inode in 467 * ext4_fc_commit_dentry_updates(). 468 */ 469 if (dentry_update->op == EXT4_FC_TAG_CREAT) { 470 WARN_ON(!list_empty(&ei->i_fc_dilist)); 471 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); 472 } 473 spin_unlock(&sbi->s_fc_lock); 474 mutex_lock(&ei->i_fc_lock); 475 476 return 0; 477 } 478 479 void __ext4_fc_track_unlink(handle_t *handle, 480 struct inode *inode, struct dentry *dentry) 481 { 482 struct __track_dentry_update_args args; 483 int ret; 484 485 args.dentry = dentry; 486 args.op = EXT4_FC_TAG_UNLINK; 487 488 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 489 (void *)&args, 0); 490 trace_ext4_fc_track_unlink(inode, dentry, ret); 491 } 492 493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 494 { 495 struct inode *inode = d_inode(dentry); 496 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 497 498 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 499 (sbi->s_mount_state & EXT4_FC_REPLAY)) 500 return; 501 502 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 503 return; 504 505 __ext4_fc_track_unlink(handle, inode, dentry); 506 } 507 508 void __ext4_fc_track_link(handle_t *handle, 509 struct inode *inode, struct dentry *dentry) 510 { 511 struct __track_dentry_update_args args; 512 int ret; 513 514 args.dentry = dentry; 515 args.op = EXT4_FC_TAG_LINK; 516 517 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 518 (void *)&args, 0); 519 trace_ext4_fc_track_link(inode, dentry, ret); 520 } 521 522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 523 { 524 struct inode *inode = d_inode(dentry); 525 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 526 527 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 528 (sbi->s_mount_state & EXT4_FC_REPLAY)) 529 return; 530 531 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 532 return; 533 534 __ext4_fc_track_link(handle, inode, dentry); 535 } 536 537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 538 struct dentry *dentry) 539 { 540 struct __track_dentry_update_args args; 541 int ret; 542 543 args.dentry = dentry; 544 args.op = EXT4_FC_TAG_CREAT; 545 546 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 547 (void *)&args, 0); 548 trace_ext4_fc_track_create(inode, dentry, ret); 549 } 550 551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 552 { 553 struct inode *inode = d_inode(dentry); 554 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 555 556 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 557 (sbi->s_mount_state & EXT4_FC_REPLAY)) 558 return; 559 560 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 561 return; 562 563 __ext4_fc_track_create(handle, inode, dentry); 564 } 565 566 /* __track_fn for inode tracking */ 567 static int __track_inode(struct inode *inode, void *arg, bool update) 568 { 569 if (update) 570 return -EEXIST; 571 572 EXT4_I(inode)->i_fc_lblk_len = 0; 573 574 return 0; 575 } 576 577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 578 { 579 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 580 int ret; 581 582 if (S_ISDIR(inode->i_mode)) 583 return; 584 585 if (ext4_should_journal_data(inode)) { 586 ext4_fc_mark_ineligible(inode->i_sb, 587 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 588 return; 589 } 590 591 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 592 (sbi->s_mount_state & EXT4_FC_REPLAY)) 593 return; 594 595 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 596 return; 597 598 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 599 trace_ext4_fc_track_inode(inode, ret); 600 } 601 602 struct __track_range_args { 603 ext4_lblk_t start, end; 604 }; 605 606 /* __track_fn for tracking data updates */ 607 static int __track_range(struct inode *inode, void *arg, bool update) 608 { 609 struct ext4_inode_info *ei = EXT4_I(inode); 610 ext4_lblk_t oldstart; 611 struct __track_range_args *__arg = 612 (struct __track_range_args *)arg; 613 614 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 615 ext4_debug("Special inode %ld being modified\n", inode->i_ino); 616 return -ECANCELED; 617 } 618 619 oldstart = ei->i_fc_lblk_start; 620 621 if (update && ei->i_fc_lblk_len > 0) { 622 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 623 ei->i_fc_lblk_len = 624 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 625 ei->i_fc_lblk_start + 1; 626 } else { 627 ei->i_fc_lblk_start = __arg->start; 628 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 629 } 630 631 return 0; 632 } 633 634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 635 ext4_lblk_t end) 636 { 637 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 638 struct __track_range_args args; 639 int ret; 640 641 if (S_ISDIR(inode->i_mode)) 642 return; 643 644 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 645 (sbi->s_mount_state & EXT4_FC_REPLAY)) 646 return; 647 648 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 649 return; 650 651 args.start = start; 652 args.end = end; 653 654 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 655 656 trace_ext4_fc_track_range(inode, start, end, ret); 657 } 658 659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 660 { 661 int write_flags = REQ_SYNC; 662 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 663 664 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 665 if (test_opt(sb, BARRIER) && is_tail) 666 write_flags |= REQ_FUA | REQ_PREFLUSH; 667 lock_buffer(bh); 668 set_buffer_dirty(bh); 669 set_buffer_uptodate(bh); 670 bh->b_end_io = ext4_end_buffer_io_sync; 671 submit_bh(REQ_OP_WRITE, write_flags, bh); 672 EXT4_SB(sb)->s_fc_bh = NULL; 673 } 674 675 /* Ext4 commit path routines */ 676 677 /* memzero and update CRC */ 678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, 679 u32 *crc) 680 { 681 void *ret; 682 683 ret = memset(dst, 0, len); 684 if (crc) 685 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); 686 return ret; 687 } 688 689 /* 690 * Allocate len bytes on a fast commit buffer. 691 * 692 * During the commit time this function is used to manage fast commit 693 * block space. We don't split a fast commit log onto different 694 * blocks. So this function makes sure that if there's not enough space 695 * on the current block, the remaining space in the current block is 696 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 697 * new block is from jbd2 and CRC is updated to reflect the padding 698 * we added. 699 */ 700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 701 { 702 struct ext4_fc_tl *tl; 703 struct ext4_sb_info *sbi = EXT4_SB(sb); 704 struct buffer_head *bh; 705 int bsize = sbi->s_journal->j_blocksize; 706 int ret, off = sbi->s_fc_bytes % bsize; 707 int pad_len; 708 709 /* 710 * After allocating len, we should have space at least for a 0 byte 711 * padding. 712 */ 713 if (len + sizeof(struct ext4_fc_tl) > bsize) 714 return NULL; 715 716 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { 717 /* 718 * Only allocate from current buffer if we have enough space for 719 * this request AND we have space to add a zero byte padding. 720 */ 721 if (!sbi->s_fc_bh) { 722 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 723 if (ret) 724 return NULL; 725 sbi->s_fc_bh = bh; 726 } 727 sbi->s_fc_bytes += len; 728 return sbi->s_fc_bh->b_data + off; 729 } 730 /* Need to add PAD tag */ 731 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); 732 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 733 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); 734 tl->fc_len = cpu_to_le16(pad_len); 735 if (crc) 736 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); 737 if (pad_len > 0) 738 ext4_fc_memzero(sb, tl + 1, pad_len, crc); 739 ext4_fc_submit_bh(sb, false); 740 741 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 742 if (ret) 743 return NULL; 744 sbi->s_fc_bh = bh; 745 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; 746 return sbi->s_fc_bh->b_data; 747 } 748 749 /* memcpy to fc reserved space and update CRC */ 750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, 751 int len, u32 *crc) 752 { 753 if (crc) 754 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); 755 return memcpy(dst, src, len); 756 } 757 758 /* 759 * Complete a fast commit by writing tail tag. 760 * 761 * Writing tail tag marks the end of a fast commit. In order to guarantee 762 * atomicity, after writing tail tag, even if there's space remaining 763 * in the block, next commit shouldn't use it. That's why tail tag 764 * has the length as that of the remaining space on the block. 765 */ 766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 767 { 768 struct ext4_sb_info *sbi = EXT4_SB(sb); 769 struct ext4_fc_tl tl; 770 struct ext4_fc_tail tail; 771 int off, bsize = sbi->s_journal->j_blocksize; 772 u8 *dst; 773 774 /* 775 * ext4_fc_reserve_space takes care of allocating an extra block if 776 * there's no enough space on this block for accommodating this tail. 777 */ 778 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 779 if (!dst) 780 return -ENOSPC; 781 782 off = sbi->s_fc_bytes % bsize; 783 784 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 785 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 786 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 787 788 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 789 dst += sizeof(tl); 790 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 791 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 792 dst += sizeof(tail.fc_tid); 793 tail.fc_crc = cpu_to_le32(crc); 794 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 795 796 ext4_fc_submit_bh(sb, true); 797 798 return 0; 799 } 800 801 /* 802 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 803 * Returns false if there's not enough space. 804 */ 805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 806 u32 *crc) 807 { 808 struct ext4_fc_tl tl; 809 u8 *dst; 810 811 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 812 if (!dst) 813 return false; 814 815 tl.fc_tag = cpu_to_le16(tag); 816 tl.fc_len = cpu_to_le16(len); 817 818 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 819 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 820 821 return true; 822 } 823 824 /* Same as above, but adds dentry tlv. */ 825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 826 struct ext4_fc_dentry_update *fc_dentry) 827 { 828 struct ext4_fc_dentry_info fcd; 829 struct ext4_fc_tl tl; 830 int dlen = fc_dentry->fcd_name.len; 831 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, 832 crc); 833 834 if (!dst) 835 return false; 836 837 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 838 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 839 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 840 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 841 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 842 dst += sizeof(tl); 843 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); 844 dst += sizeof(fcd); 845 ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); 846 847 return true; 848 } 849 850 /* 851 * Writes inode in the fast commit space under TLV with tag @tag. 852 * Returns 0 on success, error on failure. 853 */ 854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 855 { 856 struct ext4_inode_info *ei = EXT4_I(inode); 857 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 858 int ret; 859 struct ext4_iloc iloc; 860 struct ext4_fc_inode fc_inode; 861 struct ext4_fc_tl tl; 862 u8 *dst; 863 864 ret = ext4_get_inode_loc(inode, &iloc); 865 if (ret) 866 return ret; 867 868 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 869 inode_len = EXT4_INODE_SIZE(inode->i_sb); 870 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 871 inode_len += ei->i_extra_isize; 872 873 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 874 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 875 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 876 877 dst = ext4_fc_reserve_space(inode->i_sb, 878 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); 879 if (!dst) 880 return -ECANCELED; 881 882 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) 883 return -ECANCELED; 884 dst += sizeof(tl); 885 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) 886 return -ECANCELED; 887 dst += sizeof(fc_inode); 888 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), 889 inode_len, crc)) 890 return -ECANCELED; 891 892 return 0; 893 } 894 895 /* 896 * Writes updated data ranges for the inode in question. Updates CRC. 897 * Returns 0 on success, error otherwise. 898 */ 899 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 900 { 901 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 902 struct ext4_inode_info *ei = EXT4_I(inode); 903 struct ext4_map_blocks map; 904 struct ext4_fc_add_range fc_ext; 905 struct ext4_fc_del_range lrange; 906 struct ext4_extent *ex; 907 int ret; 908 909 mutex_lock(&ei->i_fc_lock); 910 if (ei->i_fc_lblk_len == 0) { 911 mutex_unlock(&ei->i_fc_lock); 912 return 0; 913 } 914 old_blk_size = ei->i_fc_lblk_start; 915 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 916 ei->i_fc_lblk_len = 0; 917 mutex_unlock(&ei->i_fc_lock); 918 919 cur_lblk_off = old_blk_size; 920 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 921 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 922 923 while (cur_lblk_off <= new_blk_size) { 924 map.m_lblk = cur_lblk_off; 925 map.m_len = new_blk_size - cur_lblk_off + 1; 926 ret = ext4_map_blocks(NULL, inode, &map, 0); 927 if (ret < 0) 928 return -ECANCELED; 929 930 if (map.m_len == 0) { 931 cur_lblk_off++; 932 continue; 933 } 934 935 if (ret == 0) { 936 lrange.fc_ino = cpu_to_le32(inode->i_ino); 937 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 938 lrange.fc_len = cpu_to_le32(map.m_len); 939 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 940 sizeof(lrange), (u8 *)&lrange, crc)) 941 return -ENOSPC; 942 } else { 943 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 944 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 945 946 /* Limit the number of blocks in one extent */ 947 map.m_len = min(max, map.m_len); 948 949 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 950 ex = (struct ext4_extent *)&fc_ext.fc_ex; 951 ex->ee_block = cpu_to_le32(map.m_lblk); 952 ex->ee_len = cpu_to_le16(map.m_len); 953 ext4_ext_store_pblock(ex, map.m_pblk); 954 if (map.m_flags & EXT4_MAP_UNWRITTEN) 955 ext4_ext_mark_unwritten(ex); 956 else 957 ext4_ext_mark_initialized(ex); 958 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 959 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 960 return -ENOSPC; 961 } 962 963 cur_lblk_off += map.m_len; 964 } 965 966 return 0; 967 } 968 969 970 /* Submit data for all the fast commit inodes */ 971 static int ext4_fc_submit_inode_data_all(journal_t *journal) 972 { 973 struct super_block *sb = (struct super_block *)(journal->j_private); 974 struct ext4_sb_info *sbi = EXT4_SB(sb); 975 struct ext4_inode_info *ei; 976 int ret = 0; 977 978 spin_lock(&sbi->s_fc_lock); 979 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 980 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 981 while (atomic_read(&ei->i_fc_updates)) { 982 DEFINE_WAIT(wait); 983 984 prepare_to_wait(&ei->i_fc_wait, &wait, 985 TASK_UNINTERRUPTIBLE); 986 if (atomic_read(&ei->i_fc_updates)) { 987 spin_unlock(&sbi->s_fc_lock); 988 schedule(); 989 spin_lock(&sbi->s_fc_lock); 990 } 991 finish_wait(&ei->i_fc_wait, &wait); 992 } 993 spin_unlock(&sbi->s_fc_lock); 994 ret = jbd2_submit_inode_data(ei->jinode); 995 if (ret) 996 return ret; 997 spin_lock(&sbi->s_fc_lock); 998 } 999 spin_unlock(&sbi->s_fc_lock); 1000 1001 return ret; 1002 } 1003 1004 /* Wait for completion of data for all the fast commit inodes */ 1005 static int ext4_fc_wait_inode_data_all(journal_t *journal) 1006 { 1007 struct super_block *sb = (struct super_block *)(journal->j_private); 1008 struct ext4_sb_info *sbi = EXT4_SB(sb); 1009 struct ext4_inode_info *pos, *n; 1010 int ret = 0; 1011 1012 spin_lock(&sbi->s_fc_lock); 1013 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1014 if (!ext4_test_inode_state(&pos->vfs_inode, 1015 EXT4_STATE_FC_COMMITTING)) 1016 continue; 1017 spin_unlock(&sbi->s_fc_lock); 1018 1019 ret = jbd2_wait_inode_data(journal, pos->jinode); 1020 if (ret) 1021 return ret; 1022 spin_lock(&sbi->s_fc_lock); 1023 } 1024 spin_unlock(&sbi->s_fc_lock); 1025 1026 return 0; 1027 } 1028 1029 /* Commit all the directory entry updates */ 1030 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 1031 __acquires(&sbi->s_fc_lock) 1032 __releases(&sbi->s_fc_lock) 1033 { 1034 struct super_block *sb = (struct super_block *)(journal->j_private); 1035 struct ext4_sb_info *sbi = EXT4_SB(sb); 1036 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 1037 struct inode *inode; 1038 struct ext4_inode_info *ei; 1039 int ret; 1040 1041 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 1042 return 0; 1043 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 1044 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1045 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 1046 spin_unlock(&sbi->s_fc_lock); 1047 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1048 ret = -ENOSPC; 1049 goto lock_and_exit; 1050 } 1051 spin_lock(&sbi->s_fc_lock); 1052 continue; 1053 } 1054 /* 1055 * With fcd_dilist we need not loop in sbi->s_fc_q to get the 1056 * corresponding inode pointer 1057 */ 1058 WARN_ON(list_empty(&fc_dentry->fcd_dilist)); 1059 ei = list_first_entry(&fc_dentry->fcd_dilist, 1060 struct ext4_inode_info, i_fc_dilist); 1061 inode = &ei->vfs_inode; 1062 WARN_ON(inode->i_ino != fc_dentry->fcd_ino); 1063 1064 spin_unlock(&sbi->s_fc_lock); 1065 1066 /* 1067 * We first write the inode and then the create dirent. This 1068 * allows the recovery code to create an unnamed inode first 1069 * and then link it to a directory entry. This allows us 1070 * to use namei.c routines almost as is and simplifies 1071 * the recovery code. 1072 */ 1073 ret = ext4_fc_write_inode(inode, crc); 1074 if (ret) 1075 goto lock_and_exit; 1076 1077 ret = ext4_fc_write_inode_data(inode, crc); 1078 if (ret) 1079 goto lock_and_exit; 1080 1081 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1082 ret = -ENOSPC; 1083 goto lock_and_exit; 1084 } 1085 1086 spin_lock(&sbi->s_fc_lock); 1087 } 1088 return 0; 1089 lock_and_exit: 1090 spin_lock(&sbi->s_fc_lock); 1091 return ret; 1092 } 1093 1094 static int ext4_fc_perform_commit(journal_t *journal) 1095 { 1096 struct super_block *sb = (struct super_block *)(journal->j_private); 1097 struct ext4_sb_info *sbi = EXT4_SB(sb); 1098 struct ext4_inode_info *iter; 1099 struct ext4_fc_head head; 1100 struct inode *inode; 1101 struct blk_plug plug; 1102 int ret = 0; 1103 u32 crc = 0; 1104 1105 ret = ext4_fc_submit_inode_data_all(journal); 1106 if (ret) 1107 return ret; 1108 1109 ret = ext4_fc_wait_inode_data_all(journal); 1110 if (ret) 1111 return ret; 1112 1113 /* 1114 * If file system device is different from journal device, issue a cache 1115 * flush before we start writing fast commit blocks. 1116 */ 1117 if (journal->j_fs_dev != journal->j_dev) 1118 blkdev_issue_flush(journal->j_fs_dev); 1119 1120 blk_start_plug(&plug); 1121 if (sbi->s_fc_bytes == 0) { 1122 /* 1123 * Add a head tag only if this is the first fast commit 1124 * in this TID. 1125 */ 1126 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1127 head.fc_tid = cpu_to_le32( 1128 sbi->s_journal->j_running_transaction->t_tid); 1129 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1130 (u8 *)&head, &crc)) { 1131 ret = -ENOSPC; 1132 goto out; 1133 } 1134 } 1135 1136 spin_lock(&sbi->s_fc_lock); 1137 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1138 if (ret) { 1139 spin_unlock(&sbi->s_fc_lock); 1140 goto out; 1141 } 1142 1143 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1144 inode = &iter->vfs_inode; 1145 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1146 continue; 1147 1148 spin_unlock(&sbi->s_fc_lock); 1149 ret = ext4_fc_write_inode_data(inode, &crc); 1150 if (ret) 1151 goto out; 1152 ret = ext4_fc_write_inode(inode, &crc); 1153 if (ret) 1154 goto out; 1155 spin_lock(&sbi->s_fc_lock); 1156 } 1157 spin_unlock(&sbi->s_fc_lock); 1158 1159 ret = ext4_fc_write_tail(sb, crc); 1160 1161 out: 1162 blk_finish_plug(&plug); 1163 return ret; 1164 } 1165 1166 static void ext4_fc_update_stats(struct super_block *sb, int status, 1167 u64 commit_time, int nblks) 1168 { 1169 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1170 1171 jbd_debug(1, "Fast commit ended with status = %d", status); 1172 if (status == EXT4_FC_STATUS_OK) { 1173 stats->fc_num_commits++; 1174 stats->fc_numblks += nblks; 1175 if (likely(stats->s_fc_avg_commit_time)) 1176 stats->s_fc_avg_commit_time = 1177 (commit_time + 1178 stats->s_fc_avg_commit_time * 3) / 4; 1179 else 1180 stats->s_fc_avg_commit_time = commit_time; 1181 } else if (status == EXT4_FC_STATUS_FAILED || 1182 status == EXT4_FC_STATUS_INELIGIBLE) { 1183 if (status == EXT4_FC_STATUS_FAILED) 1184 stats->fc_failed_commits++; 1185 stats->fc_ineligible_commits++; 1186 } else { 1187 stats->fc_skipped_commits++; 1188 } 1189 trace_ext4_fc_commit_stop(sb, nblks, status); 1190 } 1191 1192 /* 1193 * The main commit entry point. Performs a fast commit for transaction 1194 * commit_tid if needed. If it's not possible to perform a fast commit 1195 * due to various reasons, we fall back to full commit. Returns 0 1196 * on success, error otherwise. 1197 */ 1198 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1199 { 1200 struct super_block *sb = (struct super_block *)(journal->j_private); 1201 struct ext4_sb_info *sbi = EXT4_SB(sb); 1202 int nblks = 0, ret, bsize = journal->j_blocksize; 1203 int subtid = atomic_read(&sbi->s_fc_subtid); 1204 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1205 ktime_t start_time, commit_time; 1206 1207 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1208 return jbd2_complete_transaction(journal, commit_tid); 1209 1210 trace_ext4_fc_commit_start(sb); 1211 1212 start_time = ktime_get(); 1213 1214 restart_fc: 1215 ret = jbd2_fc_begin_commit(journal, commit_tid); 1216 if (ret == -EALREADY) { 1217 /* There was an ongoing commit, check if we need to restart */ 1218 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1219 commit_tid > journal->j_commit_sequence) 1220 goto restart_fc; 1221 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); 1222 return 0; 1223 } else if (ret) { 1224 /* 1225 * Commit couldn't start. Just update stats and perform a 1226 * full commit. 1227 */ 1228 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); 1229 return jbd2_complete_transaction(journal, commit_tid); 1230 } 1231 1232 /* 1233 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1234 * if we are fast commit ineligible. 1235 */ 1236 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1237 status = EXT4_FC_STATUS_INELIGIBLE; 1238 goto fallback; 1239 } 1240 1241 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1242 ret = ext4_fc_perform_commit(journal); 1243 if (ret < 0) { 1244 status = EXT4_FC_STATUS_FAILED; 1245 goto fallback; 1246 } 1247 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1248 ret = jbd2_fc_wait_bufs(journal, nblks); 1249 if (ret < 0) { 1250 status = EXT4_FC_STATUS_FAILED; 1251 goto fallback; 1252 } 1253 atomic_inc(&sbi->s_fc_subtid); 1254 ret = jbd2_fc_end_commit(journal); 1255 /* 1256 * weight the commit time higher than the average time so we 1257 * don't react too strongly to vast changes in the commit time 1258 */ 1259 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1260 ext4_fc_update_stats(sb, status, commit_time, nblks); 1261 return ret; 1262 1263 fallback: 1264 ret = jbd2_fc_end_commit_fallback(journal); 1265 ext4_fc_update_stats(sb, status, 0, 0); 1266 return ret; 1267 } 1268 1269 /* 1270 * Fast commit cleanup routine. This is called after every fast commit and 1271 * full commit. full is true if we are called after a full commit. 1272 */ 1273 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1274 { 1275 struct super_block *sb = journal->j_private; 1276 struct ext4_sb_info *sbi = EXT4_SB(sb); 1277 struct ext4_inode_info *iter, *iter_n; 1278 struct ext4_fc_dentry_update *fc_dentry; 1279 1280 if (full && sbi->s_fc_bh) 1281 sbi->s_fc_bh = NULL; 1282 1283 trace_ext4_fc_cleanup(journal, full, tid); 1284 jbd2_fc_release_bufs(journal); 1285 1286 spin_lock(&sbi->s_fc_lock); 1287 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1288 i_fc_list) { 1289 list_del_init(&iter->i_fc_list); 1290 ext4_clear_inode_state(&iter->vfs_inode, 1291 EXT4_STATE_FC_COMMITTING); 1292 if (iter->i_sync_tid <= tid) 1293 ext4_fc_reset_inode(&iter->vfs_inode); 1294 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1295 smp_mb(); 1296 #if (BITS_PER_LONG < 64) 1297 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1298 #else 1299 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1300 #endif 1301 } 1302 1303 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1304 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1305 struct ext4_fc_dentry_update, 1306 fcd_list); 1307 list_del_init(&fc_dentry->fcd_list); 1308 list_del_init(&fc_dentry->fcd_dilist); 1309 spin_unlock(&sbi->s_fc_lock); 1310 1311 if (fc_dentry->fcd_name.name && 1312 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1313 kfree(fc_dentry->fcd_name.name); 1314 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1315 spin_lock(&sbi->s_fc_lock); 1316 } 1317 1318 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1319 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1320 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1321 &sbi->s_fc_q[FC_Q_MAIN]); 1322 1323 if (tid >= sbi->s_fc_ineligible_tid) { 1324 sbi->s_fc_ineligible_tid = 0; 1325 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1326 } 1327 1328 if (full) 1329 sbi->s_fc_bytes = 0; 1330 spin_unlock(&sbi->s_fc_lock); 1331 trace_ext4_fc_stats(sb); 1332 } 1333 1334 /* Ext4 Replay Path Routines */ 1335 1336 /* Helper struct for dentry replay routines */ 1337 struct dentry_info_args { 1338 int parent_ino, dname_len, ino, inode_len; 1339 char *dname; 1340 }; 1341 1342 static inline void tl_to_darg(struct dentry_info_args *darg, 1343 struct ext4_fc_tl *tl, u8 *val) 1344 { 1345 struct ext4_fc_dentry_info fcd; 1346 1347 memcpy(&fcd, val, sizeof(fcd)); 1348 1349 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1350 darg->ino = le32_to_cpu(fcd.fc_ino); 1351 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1352 darg->dname_len = le16_to_cpu(tl->fc_len) - 1353 sizeof(struct ext4_fc_dentry_info); 1354 } 1355 1356 /* Unlink replay function */ 1357 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, 1358 u8 *val) 1359 { 1360 struct inode *inode, *old_parent; 1361 struct qstr entry; 1362 struct dentry_info_args darg; 1363 int ret = 0; 1364 1365 tl_to_darg(&darg, tl, val); 1366 1367 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1368 darg.parent_ino, darg.dname_len); 1369 1370 entry.name = darg.dname; 1371 entry.len = darg.dname_len; 1372 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1373 1374 if (IS_ERR(inode)) { 1375 jbd_debug(1, "Inode %d not found", darg.ino); 1376 return 0; 1377 } 1378 1379 old_parent = ext4_iget(sb, darg.parent_ino, 1380 EXT4_IGET_NORMAL); 1381 if (IS_ERR(old_parent)) { 1382 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); 1383 iput(inode); 1384 return 0; 1385 } 1386 1387 ret = __ext4_unlink(NULL, old_parent, &entry, inode); 1388 /* -ENOENT ok coz it might not exist anymore. */ 1389 if (ret == -ENOENT) 1390 ret = 0; 1391 iput(old_parent); 1392 iput(inode); 1393 return ret; 1394 } 1395 1396 static int ext4_fc_replay_link_internal(struct super_block *sb, 1397 struct dentry_info_args *darg, 1398 struct inode *inode) 1399 { 1400 struct inode *dir = NULL; 1401 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1402 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1403 int ret = 0; 1404 1405 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1406 if (IS_ERR(dir)) { 1407 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); 1408 dir = NULL; 1409 goto out; 1410 } 1411 1412 dentry_dir = d_obtain_alias(dir); 1413 if (IS_ERR(dentry_dir)) { 1414 jbd_debug(1, "Failed to obtain dentry"); 1415 dentry_dir = NULL; 1416 goto out; 1417 } 1418 1419 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1420 if (!dentry_inode) { 1421 jbd_debug(1, "Inode dentry not created."); 1422 ret = -ENOMEM; 1423 goto out; 1424 } 1425 1426 ret = __ext4_link(dir, inode, dentry_inode); 1427 /* 1428 * It's possible that link already existed since data blocks 1429 * for the dir in question got persisted before we crashed OR 1430 * we replayed this tag and crashed before the entire replay 1431 * could complete. 1432 */ 1433 if (ret && ret != -EEXIST) { 1434 jbd_debug(1, "Failed to link\n"); 1435 goto out; 1436 } 1437 1438 ret = 0; 1439 out: 1440 if (dentry_dir) { 1441 d_drop(dentry_dir); 1442 dput(dentry_dir); 1443 } else if (dir) { 1444 iput(dir); 1445 } 1446 if (dentry_inode) { 1447 d_drop(dentry_inode); 1448 dput(dentry_inode); 1449 } 1450 1451 return ret; 1452 } 1453 1454 /* Link replay function */ 1455 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, 1456 u8 *val) 1457 { 1458 struct inode *inode; 1459 struct dentry_info_args darg; 1460 int ret = 0; 1461 1462 tl_to_darg(&darg, tl, val); 1463 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1464 darg.parent_ino, darg.dname_len); 1465 1466 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1467 if (IS_ERR(inode)) { 1468 jbd_debug(1, "Inode not found."); 1469 return 0; 1470 } 1471 1472 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1473 iput(inode); 1474 return ret; 1475 } 1476 1477 /* 1478 * Record all the modified inodes during replay. We use this later to setup 1479 * block bitmaps correctly. 1480 */ 1481 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1482 { 1483 struct ext4_fc_replay_state *state; 1484 int i; 1485 1486 state = &EXT4_SB(sb)->s_fc_replay_state; 1487 for (i = 0; i < state->fc_modified_inodes_used; i++) 1488 if (state->fc_modified_inodes[i] == ino) 1489 return 0; 1490 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1491 state->fc_modified_inodes = krealloc( 1492 state->fc_modified_inodes, 1493 sizeof(int) * (state->fc_modified_inodes_size + 1494 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1495 GFP_KERNEL); 1496 if (!state->fc_modified_inodes) 1497 return -ENOMEM; 1498 state->fc_modified_inodes_size += 1499 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1500 } 1501 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1502 return 0; 1503 } 1504 1505 /* 1506 * Inode replay function 1507 */ 1508 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, 1509 u8 *val) 1510 { 1511 struct ext4_fc_inode fc_inode; 1512 struct ext4_inode *raw_inode; 1513 struct ext4_inode *raw_fc_inode; 1514 struct inode *inode = NULL; 1515 struct ext4_iloc iloc; 1516 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1517 struct ext4_extent_header *eh; 1518 1519 memcpy(&fc_inode, val, sizeof(fc_inode)); 1520 1521 ino = le32_to_cpu(fc_inode.fc_ino); 1522 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1523 1524 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1525 if (!IS_ERR(inode)) { 1526 ext4_ext_clear_bb(inode); 1527 iput(inode); 1528 } 1529 inode = NULL; 1530 1531 ret = ext4_fc_record_modified_inode(sb, ino); 1532 if (ret) 1533 goto out; 1534 1535 raw_fc_inode = (struct ext4_inode *) 1536 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1537 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1538 if (ret) 1539 goto out; 1540 1541 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); 1542 raw_inode = ext4_raw_inode(&iloc); 1543 1544 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1545 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, 1546 inode_len - offsetof(struct ext4_inode, i_generation)); 1547 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1548 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1549 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1550 memset(eh, 0, sizeof(*eh)); 1551 eh->eh_magic = EXT4_EXT_MAGIC; 1552 eh->eh_max = cpu_to_le16( 1553 (sizeof(raw_inode->i_block) - 1554 sizeof(struct ext4_extent_header)) 1555 / sizeof(struct ext4_extent)); 1556 } 1557 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1558 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1559 sizeof(raw_inode->i_block)); 1560 } 1561 1562 /* Immediately update the inode on disk. */ 1563 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1564 if (ret) 1565 goto out; 1566 ret = sync_dirty_buffer(iloc.bh); 1567 if (ret) 1568 goto out; 1569 ret = ext4_mark_inode_used(sb, ino); 1570 if (ret) 1571 goto out; 1572 1573 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1574 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1575 if (IS_ERR(inode)) { 1576 jbd_debug(1, "Inode not found."); 1577 return -EFSCORRUPTED; 1578 } 1579 1580 /* 1581 * Our allocator could have made different decisions than before 1582 * crashing. This should be fixed but until then, we calculate 1583 * the number of blocks the inode. 1584 */ 1585 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1586 ext4_ext_replay_set_iblocks(inode); 1587 1588 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1589 ext4_reset_inode_seed(inode); 1590 1591 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1592 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1593 sync_dirty_buffer(iloc.bh); 1594 brelse(iloc.bh); 1595 out: 1596 iput(inode); 1597 if (!ret) 1598 blkdev_issue_flush(sb->s_bdev); 1599 1600 return 0; 1601 } 1602 1603 /* 1604 * Dentry create replay function. 1605 * 1606 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 1607 * inode for which we are trying to create a dentry here, should already have 1608 * been replayed before we start here. 1609 */ 1610 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, 1611 u8 *val) 1612 { 1613 int ret = 0; 1614 struct inode *inode = NULL; 1615 struct inode *dir = NULL; 1616 struct dentry_info_args darg; 1617 1618 tl_to_darg(&darg, tl, val); 1619 1620 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1621 darg.parent_ino, darg.dname_len); 1622 1623 /* This takes care of update group descriptor and other metadata */ 1624 ret = ext4_mark_inode_used(sb, darg.ino); 1625 if (ret) 1626 goto out; 1627 1628 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1629 if (IS_ERR(inode)) { 1630 jbd_debug(1, "inode %d not found.", darg.ino); 1631 inode = NULL; 1632 ret = -EINVAL; 1633 goto out; 1634 } 1635 1636 if (S_ISDIR(inode->i_mode)) { 1637 /* 1638 * If we are creating a directory, we need to make sure that the 1639 * dot and dot dot dirents are setup properly. 1640 */ 1641 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1642 if (IS_ERR(dir)) { 1643 jbd_debug(1, "Dir %d not found.", darg.ino); 1644 goto out; 1645 } 1646 ret = ext4_init_new_dir(NULL, dir, inode); 1647 iput(dir); 1648 if (ret) { 1649 ret = 0; 1650 goto out; 1651 } 1652 } 1653 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1654 if (ret) 1655 goto out; 1656 set_nlink(inode, 1); 1657 ext4_mark_inode_dirty(NULL, inode); 1658 out: 1659 if (inode) 1660 iput(inode); 1661 return ret; 1662 } 1663 1664 /* 1665 * Record physical disk regions which are in use as per fast commit area, 1666 * and used by inodes during replay phase. Our simple replay phase 1667 * allocator excludes these regions from allocation. 1668 */ 1669 int ext4_fc_record_regions(struct super_block *sb, int ino, 1670 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 1671 { 1672 struct ext4_fc_replay_state *state; 1673 struct ext4_fc_alloc_region *region; 1674 1675 state = &EXT4_SB(sb)->s_fc_replay_state; 1676 /* 1677 * during replay phase, the fc_regions_valid may not same as 1678 * fc_regions_used, update it when do new additions. 1679 */ 1680 if (replay && state->fc_regions_used != state->fc_regions_valid) 1681 state->fc_regions_used = state->fc_regions_valid; 1682 if (state->fc_regions_used == state->fc_regions_size) { 1683 state->fc_regions_size += 1684 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1685 state->fc_regions = krealloc( 1686 state->fc_regions, 1687 state->fc_regions_size * 1688 sizeof(struct ext4_fc_alloc_region), 1689 GFP_KERNEL); 1690 if (!state->fc_regions) 1691 return -ENOMEM; 1692 } 1693 region = &state->fc_regions[state->fc_regions_used++]; 1694 region->ino = ino; 1695 region->lblk = lblk; 1696 region->pblk = pblk; 1697 region->len = len; 1698 1699 if (replay) 1700 state->fc_regions_valid++; 1701 1702 return 0; 1703 } 1704 1705 /* Replay add range tag */ 1706 static int ext4_fc_replay_add_range(struct super_block *sb, 1707 struct ext4_fc_tl *tl, u8 *val) 1708 { 1709 struct ext4_fc_add_range fc_add_ex; 1710 struct ext4_extent newex, *ex; 1711 struct inode *inode; 1712 ext4_lblk_t start, cur; 1713 int remaining, len; 1714 ext4_fsblk_t start_pblk; 1715 struct ext4_map_blocks map; 1716 struct ext4_ext_path *path = NULL; 1717 int ret; 1718 1719 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1720 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1721 1722 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1723 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1724 ext4_ext_get_actual_len(ex)); 1725 1726 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1727 if (IS_ERR(inode)) { 1728 jbd_debug(1, "Inode not found."); 1729 return 0; 1730 } 1731 1732 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1733 if (ret) 1734 goto out; 1735 1736 start = le32_to_cpu(ex->ee_block); 1737 start_pblk = ext4_ext_pblock(ex); 1738 len = ext4_ext_get_actual_len(ex); 1739 1740 cur = start; 1741 remaining = len; 1742 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1743 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1744 inode->i_ino); 1745 1746 while (remaining > 0) { 1747 map.m_lblk = cur; 1748 map.m_len = remaining; 1749 map.m_pblk = 0; 1750 ret = ext4_map_blocks(NULL, inode, &map, 0); 1751 1752 if (ret < 0) 1753 goto out; 1754 1755 if (ret == 0) { 1756 /* Range is not mapped */ 1757 path = ext4_find_extent(inode, cur, NULL, 0); 1758 if (IS_ERR(path)) 1759 goto out; 1760 memset(&newex, 0, sizeof(newex)); 1761 newex.ee_block = cpu_to_le32(cur); 1762 ext4_ext_store_pblock( 1763 &newex, start_pblk + cur - start); 1764 newex.ee_len = cpu_to_le16(map.m_len); 1765 if (ext4_ext_is_unwritten(ex)) 1766 ext4_ext_mark_unwritten(&newex); 1767 down_write(&EXT4_I(inode)->i_data_sem); 1768 ret = ext4_ext_insert_extent( 1769 NULL, inode, &path, &newex, 0); 1770 up_write((&EXT4_I(inode)->i_data_sem)); 1771 ext4_ext_drop_refs(path); 1772 kfree(path); 1773 if (ret) 1774 goto out; 1775 goto next; 1776 } 1777 1778 if (start_pblk + cur - start != map.m_pblk) { 1779 /* 1780 * Logical to physical mapping changed. This can happen 1781 * if this range was removed and then reallocated to 1782 * map to new physical blocks during a fast commit. 1783 */ 1784 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1785 ext4_ext_is_unwritten(ex), 1786 start_pblk + cur - start); 1787 if (ret) 1788 goto out; 1789 /* 1790 * Mark the old blocks as free since they aren't used 1791 * anymore. We maintain an array of all the modified 1792 * inodes. In case these blocks are still used at either 1793 * a different logical range in the same inode or in 1794 * some different inode, we will mark them as allocated 1795 * at the end of the FC replay using our array of 1796 * modified inodes. 1797 */ 1798 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1799 goto next; 1800 } 1801 1802 /* Range is mapped and needs a state change */ 1803 jbd_debug(1, "Converting from %ld to %d %lld", 1804 map.m_flags & EXT4_MAP_UNWRITTEN, 1805 ext4_ext_is_unwritten(ex), map.m_pblk); 1806 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1807 ext4_ext_is_unwritten(ex), map.m_pblk); 1808 if (ret) 1809 goto out; 1810 /* 1811 * We may have split the extent tree while toggling the state. 1812 * Try to shrink the extent tree now. 1813 */ 1814 ext4_ext_replay_shrink_inode(inode, start + len); 1815 next: 1816 cur += map.m_len; 1817 remaining -= map.m_len; 1818 } 1819 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1820 sb->s_blocksize_bits); 1821 out: 1822 iput(inode); 1823 return 0; 1824 } 1825 1826 /* Replay DEL_RANGE tag */ 1827 static int 1828 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, 1829 u8 *val) 1830 { 1831 struct inode *inode; 1832 struct ext4_fc_del_range lrange; 1833 struct ext4_map_blocks map; 1834 ext4_lblk_t cur, remaining; 1835 int ret; 1836 1837 memcpy(&lrange, val, sizeof(lrange)); 1838 cur = le32_to_cpu(lrange.fc_lblk); 1839 remaining = le32_to_cpu(lrange.fc_len); 1840 1841 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1842 le32_to_cpu(lrange.fc_ino), cur, remaining); 1843 1844 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1845 if (IS_ERR(inode)) { 1846 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1847 return 0; 1848 } 1849 1850 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1851 if (ret) 1852 goto out; 1853 1854 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1855 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1856 le32_to_cpu(lrange.fc_len)); 1857 while (remaining > 0) { 1858 map.m_lblk = cur; 1859 map.m_len = remaining; 1860 1861 ret = ext4_map_blocks(NULL, inode, &map, 0); 1862 if (ret < 0) 1863 goto out; 1864 if (ret > 0) { 1865 remaining -= ret; 1866 cur += ret; 1867 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1868 } else { 1869 remaining -= map.m_len; 1870 cur += map.m_len; 1871 } 1872 } 1873 1874 down_write(&EXT4_I(inode)->i_data_sem); 1875 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1876 le32_to_cpu(lrange.fc_lblk) + 1877 le32_to_cpu(lrange.fc_len) - 1); 1878 up_write(&EXT4_I(inode)->i_data_sem); 1879 if (ret) 1880 goto out; 1881 ext4_ext_replay_shrink_inode(inode, 1882 i_size_read(inode) >> sb->s_blocksize_bits); 1883 ext4_mark_inode_dirty(NULL, inode); 1884 out: 1885 iput(inode); 1886 return 0; 1887 } 1888 1889 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1890 { 1891 struct ext4_fc_replay_state *state; 1892 struct inode *inode; 1893 struct ext4_ext_path *path = NULL; 1894 struct ext4_map_blocks map; 1895 int i, ret, j; 1896 ext4_lblk_t cur, end; 1897 1898 state = &EXT4_SB(sb)->s_fc_replay_state; 1899 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1900 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1901 EXT4_IGET_NORMAL); 1902 if (IS_ERR(inode)) { 1903 jbd_debug(1, "Inode %d not found.", 1904 state->fc_modified_inodes[i]); 1905 continue; 1906 } 1907 cur = 0; 1908 end = EXT_MAX_BLOCKS; 1909 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { 1910 iput(inode); 1911 continue; 1912 } 1913 while (cur < end) { 1914 map.m_lblk = cur; 1915 map.m_len = end - cur; 1916 1917 ret = ext4_map_blocks(NULL, inode, &map, 0); 1918 if (ret < 0) 1919 break; 1920 1921 if (ret > 0) { 1922 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1923 if (!IS_ERR(path)) { 1924 for (j = 0; j < path->p_depth; j++) 1925 ext4_mb_mark_bb(inode->i_sb, 1926 path[j].p_block, 1, 1); 1927 ext4_ext_drop_refs(path); 1928 kfree(path); 1929 } 1930 cur += ret; 1931 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1932 map.m_len, 1); 1933 } else { 1934 cur = cur + (map.m_len ? map.m_len : 1); 1935 } 1936 } 1937 iput(inode); 1938 } 1939 } 1940 1941 /* 1942 * Check if block is in excluded regions for block allocation. The simple 1943 * allocator that runs during replay phase is calls this function to see 1944 * if it is okay to use a block. 1945 */ 1946 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1947 { 1948 int i; 1949 struct ext4_fc_replay_state *state; 1950 1951 state = &EXT4_SB(sb)->s_fc_replay_state; 1952 for (i = 0; i < state->fc_regions_valid; i++) { 1953 if (state->fc_regions[i].ino == 0 || 1954 state->fc_regions[i].len == 0) 1955 continue; 1956 if (in_range(blk, state->fc_regions[i].pblk, 1957 state->fc_regions[i].len)) 1958 return true; 1959 } 1960 return false; 1961 } 1962 1963 /* Cleanup function called after replay */ 1964 void ext4_fc_replay_cleanup(struct super_block *sb) 1965 { 1966 struct ext4_sb_info *sbi = EXT4_SB(sb); 1967 1968 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1969 kfree(sbi->s_fc_replay_state.fc_regions); 1970 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1971 } 1972 1973 /* 1974 * Recovery Scan phase handler 1975 * 1976 * This function is called during the scan phase and is responsible 1977 * for doing following things: 1978 * - Make sure the fast commit area has valid tags for replay 1979 * - Count number of tags that need to be replayed by the replay handler 1980 * - Verify CRC 1981 * - Create a list of excluded blocks for allocation during replay phase 1982 * 1983 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 1984 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 1985 * to indicate that scan has finished and JBD2 can now start replay phase. 1986 * It returns a negative error to indicate that there was an error. At the end 1987 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 1988 * to indicate the number of tags that need to replayed during the replay phase. 1989 */ 1990 static int ext4_fc_replay_scan(journal_t *journal, 1991 struct buffer_head *bh, int off, 1992 tid_t expected_tid) 1993 { 1994 struct super_block *sb = journal->j_private; 1995 struct ext4_sb_info *sbi = EXT4_SB(sb); 1996 struct ext4_fc_replay_state *state; 1997 int ret = JBD2_FC_REPLAY_CONTINUE; 1998 struct ext4_fc_add_range ext; 1999 struct ext4_fc_tl tl; 2000 struct ext4_fc_tail tail; 2001 __u8 *start, *end, *cur, *val; 2002 struct ext4_fc_head head; 2003 struct ext4_extent *ex; 2004 2005 state = &sbi->s_fc_replay_state; 2006 2007 start = (u8 *)bh->b_data; 2008 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 2009 2010 if (state->fc_replay_expected_off == 0) { 2011 state->fc_cur_tag = 0; 2012 state->fc_replay_num_tags = 0; 2013 state->fc_crc = 0; 2014 state->fc_regions = NULL; 2015 state->fc_regions_valid = state->fc_regions_used = 2016 state->fc_regions_size = 0; 2017 /* Check if we can stop early */ 2018 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 2019 != EXT4_FC_TAG_HEAD) 2020 return 0; 2021 } 2022 2023 if (off != state->fc_replay_expected_off) { 2024 ret = -EFSCORRUPTED; 2025 goto out_err; 2026 } 2027 2028 state->fc_replay_expected_off++; 2029 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { 2030 memcpy(&tl, cur, sizeof(tl)); 2031 val = cur + sizeof(tl); 2032 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", 2033 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); 2034 switch (le16_to_cpu(tl.fc_tag)) { 2035 case EXT4_FC_TAG_ADD_RANGE: 2036 memcpy(&ext, val, sizeof(ext)); 2037 ex = (struct ext4_extent *)&ext.fc_ex; 2038 ret = ext4_fc_record_regions(sb, 2039 le32_to_cpu(ext.fc_ino), 2040 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 2041 ext4_ext_get_actual_len(ex), 0); 2042 if (ret < 0) 2043 break; 2044 ret = JBD2_FC_REPLAY_CONTINUE; 2045 fallthrough; 2046 case EXT4_FC_TAG_DEL_RANGE: 2047 case EXT4_FC_TAG_LINK: 2048 case EXT4_FC_TAG_UNLINK: 2049 case EXT4_FC_TAG_CREAT: 2050 case EXT4_FC_TAG_INODE: 2051 case EXT4_FC_TAG_PAD: 2052 state->fc_cur_tag++; 2053 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2054 sizeof(tl) + le16_to_cpu(tl.fc_len)); 2055 break; 2056 case EXT4_FC_TAG_TAIL: 2057 state->fc_cur_tag++; 2058 memcpy(&tail, val, sizeof(tail)); 2059 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2060 sizeof(tl) + 2061 offsetof(struct ext4_fc_tail, 2062 fc_crc)); 2063 if (le32_to_cpu(tail.fc_tid) == expected_tid && 2064 le32_to_cpu(tail.fc_crc) == state->fc_crc) { 2065 state->fc_replay_num_tags = state->fc_cur_tag; 2066 state->fc_regions_valid = 2067 state->fc_regions_used; 2068 } else { 2069 ret = state->fc_replay_num_tags ? 2070 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2071 } 2072 state->fc_crc = 0; 2073 break; 2074 case EXT4_FC_TAG_HEAD: 2075 memcpy(&head, val, sizeof(head)); 2076 if (le32_to_cpu(head.fc_features) & 2077 ~EXT4_FC_SUPPORTED_FEATURES) { 2078 ret = -EOPNOTSUPP; 2079 break; 2080 } 2081 if (le32_to_cpu(head.fc_tid) != expected_tid) { 2082 ret = JBD2_FC_REPLAY_STOP; 2083 break; 2084 } 2085 state->fc_cur_tag++; 2086 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2087 sizeof(tl) + le16_to_cpu(tl.fc_len)); 2088 break; 2089 default: 2090 ret = state->fc_replay_num_tags ? 2091 JBD2_FC_REPLAY_STOP : -ECANCELED; 2092 } 2093 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2094 break; 2095 } 2096 2097 out_err: 2098 trace_ext4_fc_replay_scan(sb, ret, off); 2099 return ret; 2100 } 2101 2102 /* 2103 * Main recovery path entry point. 2104 * The meaning of return codes is similar as above. 2105 */ 2106 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2107 enum passtype pass, int off, tid_t expected_tid) 2108 { 2109 struct super_block *sb = journal->j_private; 2110 struct ext4_sb_info *sbi = EXT4_SB(sb); 2111 struct ext4_fc_tl tl; 2112 __u8 *start, *end, *cur, *val; 2113 int ret = JBD2_FC_REPLAY_CONTINUE; 2114 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2115 struct ext4_fc_tail tail; 2116 2117 if (pass == PASS_SCAN) { 2118 state->fc_current_pass = PASS_SCAN; 2119 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2120 } 2121 2122 if (state->fc_current_pass != pass) { 2123 state->fc_current_pass = pass; 2124 sbi->s_mount_state |= EXT4_FC_REPLAY; 2125 } 2126 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2127 jbd_debug(1, "Replay stops\n"); 2128 ext4_fc_set_bitmaps_and_counters(sb); 2129 return 0; 2130 } 2131 2132 #ifdef CONFIG_EXT4_DEBUG 2133 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2134 pr_warn("Dropping fc block %d because max_replay set\n", off); 2135 return JBD2_FC_REPLAY_STOP; 2136 } 2137 #endif 2138 2139 start = (u8 *)bh->b_data; 2140 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 2141 2142 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { 2143 memcpy(&tl, cur, sizeof(tl)); 2144 val = cur + sizeof(tl); 2145 2146 if (state->fc_replay_num_tags == 0) { 2147 ret = JBD2_FC_REPLAY_STOP; 2148 ext4_fc_set_bitmaps_and_counters(sb); 2149 break; 2150 } 2151 jbd_debug(3, "Replay phase, tag:%s\n", 2152 tag2str(le16_to_cpu(tl.fc_tag))); 2153 state->fc_replay_num_tags--; 2154 switch (le16_to_cpu(tl.fc_tag)) { 2155 case EXT4_FC_TAG_LINK: 2156 ret = ext4_fc_replay_link(sb, &tl, val); 2157 break; 2158 case EXT4_FC_TAG_UNLINK: 2159 ret = ext4_fc_replay_unlink(sb, &tl, val); 2160 break; 2161 case EXT4_FC_TAG_ADD_RANGE: 2162 ret = ext4_fc_replay_add_range(sb, &tl, val); 2163 break; 2164 case EXT4_FC_TAG_CREAT: 2165 ret = ext4_fc_replay_create(sb, &tl, val); 2166 break; 2167 case EXT4_FC_TAG_DEL_RANGE: 2168 ret = ext4_fc_replay_del_range(sb, &tl, val); 2169 break; 2170 case EXT4_FC_TAG_INODE: 2171 ret = ext4_fc_replay_inode(sb, &tl, val); 2172 break; 2173 case EXT4_FC_TAG_PAD: 2174 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2175 le16_to_cpu(tl.fc_len), 0); 2176 break; 2177 case EXT4_FC_TAG_TAIL: 2178 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, 2179 le16_to_cpu(tl.fc_len), 0); 2180 memcpy(&tail, val, sizeof(tail)); 2181 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); 2182 break; 2183 case EXT4_FC_TAG_HEAD: 2184 break; 2185 default: 2186 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, 2187 le16_to_cpu(tl.fc_len), 0); 2188 ret = -ECANCELED; 2189 break; 2190 } 2191 if (ret < 0) 2192 break; 2193 ret = JBD2_FC_REPLAY_CONTINUE; 2194 } 2195 return ret; 2196 } 2197 2198 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2199 { 2200 /* 2201 * We set replay callback even if fast commit disabled because we may 2202 * could still have fast commit blocks that need to be replayed even if 2203 * fast commit has now been turned off. 2204 */ 2205 journal->j_fc_replay_callback = ext4_fc_replay; 2206 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2207 return; 2208 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2209 } 2210 2211 static const char *fc_ineligible_reasons[] = { 2212 "Extended attributes changed", 2213 "Cross rename", 2214 "Journal flag changed", 2215 "Insufficient memory", 2216 "Swap boot", 2217 "Resize", 2218 "Dir renamed", 2219 "Falloc range op", 2220 "Data journalling", 2221 "FC Commit Failed" 2222 }; 2223 2224 int ext4_fc_info_show(struct seq_file *seq, void *v) 2225 { 2226 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2227 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2228 int i; 2229 2230 if (v != SEQ_START_TOKEN) 2231 return 0; 2232 2233 seq_printf(seq, 2234 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2235 stats->fc_num_commits, stats->fc_ineligible_commits, 2236 stats->fc_numblks, 2237 div_u64(stats->s_fc_avg_commit_time, 1000)); 2238 seq_puts(seq, "Ineligible reasons:\n"); 2239 for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2240 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2241 stats->fc_ineligible_reason_count[i]); 2242 2243 return 0; 2244 } 2245 2246 int __init ext4_fc_init_dentry_cache(void) 2247 { 2248 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2249 SLAB_RECLAIM_ACCOUNT); 2250 2251 if (ext4_fc_dentry_cachep == NULL) 2252 return -ENOMEM; 2253 2254 return 0; 2255 } 2256 2257 void ext4_fc_destroy_dentry_cache(void) 2258 { 2259 kmem_cache_destroy(ext4_fc_dentry_cachep); 2260 } 2261