// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories:
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following section
 *     for more details)
 * [7] Wait for [4], [5] and [6] to complete
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
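 * As an illustration of that contract (this is a sketch of a hypothetical
 * caller, not a real call site; the handle type and credit count are only
 * illustrative), an inode update elsewhere in ext4 would bracket its work
 * like this:
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 *	... update the inode and/or its blocks ...
 *	ext4_journal_stop(handle);
 *	ext4_fc_stop_update(inode);
 *
 * ext4_fc_start_update() blocks while the inode is in the COMMITTING state,
 * which is how step [1] above serializes the commit against in-flight
 * updates; ext4_fc_stop_update() wakes up a fast commit that is waiting for
 * the last such update to drain.
 *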
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *
 *   [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *          |<---  Fast Commit 1  --->|<---      Fast Commit 2      --->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when the recovery code runs, it needs to "enforce" this state on the
 * file system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *		rm A;    mv B A;    read A
 *		 (x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, a file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)              (x)                (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
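 * As a concrete sketch of what "enforcing" an outcome looks like, the replay
 * helpers later in this file simply treat "already done" as success rather
 * than as an error (see ext4_fc_replay_unlink() and
 * ext4_fc_replay_link_internal(); this is a simplified excerpt of their
 * error handling):
 *
 *	ret = __ext4_unlink(dir, &entry, inode, NULL);
 *	if (ret == -ENOENT)
 *		ret = 0;
 *
 *	ret = __ext4_link(dir, inode, dentry);
 *	if (ret == -EEXIST)
 *		ret = 0;
 *
 * Running either fragment a second time leaves the file system in the same
 * state, which is what makes re-replaying the same tag after a crash safe.
 *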
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then attempt recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}
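
/*
 * Note: the COMMITTING wait above is paired with a wake_up_bit() in
 * ext4_fc_cleanup(). A condensed sketch of the two sides (shown for the
 * 64-bit i_flags layout only; the code above also handles
 * BITS_PER_LONG < 64 via i_state_flags):
 *
 *	waiter, ext4_fc_wait_committing_inode():
 *		DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING);
 *		wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
 *		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 *		spin_unlock(&sbi->s_fc_lock);
 *		schedule();
 *
 *	waker, ext4_fc_cleanup():
 *		ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 *		smp_mb();
 *		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
 *
 * The smp_mb() on the waker side ensures the cleared bit is visible before
 * any waiter woken by wake_up_bit() re-checks the inode state.
 */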

/*
 * Inform Ext4's fast commits about the start of an inode update
 *
 * This function is called by the high-level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
	    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

	return;
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that until the recorded
 * transaction, a commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (ext4_fc_disabled(sb))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function.
If this is the first time this we are 365 * called after a full commit, we initialize fast commit fields and then call 366 * __fc_track_fn() with update = 0. If we have already been called after a full 367 * commit, we pass update = 1. Based on that, the track function can determine 368 * if it needs to track a field for the first time or if it needs to just 369 * update the previously tracked value. 370 * 371 * If enqueue is set, this function enqueues the inode in fast commit list. 372 */ 373 static int ext4_fc_track_template( 374 handle_t *handle, struct inode *inode, 375 int (*__fc_track_fn)(struct inode *, void *, bool), 376 void *args, int enqueue) 377 { 378 bool update = false; 379 struct ext4_inode_info *ei = EXT4_I(inode); 380 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 381 tid_t tid = 0; 382 int ret; 383 384 tid = handle->h_transaction->t_tid; 385 mutex_lock(&ei->i_fc_lock); 386 if (tid == ei->i_sync_tid) { 387 update = true; 388 } else { 389 ext4_fc_reset_inode(inode); 390 ei->i_sync_tid = tid; 391 } 392 ret = __fc_track_fn(inode, args, update); 393 mutex_unlock(&ei->i_fc_lock); 394 395 if (!enqueue) 396 return ret; 397 398 spin_lock(&sbi->s_fc_lock); 399 if (list_empty(&EXT4_I(inode)->i_fc_list)) 400 list_add_tail(&EXT4_I(inode)->i_fc_list, 401 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 402 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 403 &sbi->s_fc_q[FC_Q_STAGING] : 404 &sbi->s_fc_q[FC_Q_MAIN]); 405 spin_unlock(&sbi->s_fc_lock); 406 407 return ret; 408 } 409 410 struct __track_dentry_update_args { 411 struct dentry *dentry; 412 int op; 413 }; 414 415 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ 416 static int __track_dentry_update(struct inode *inode, void *arg, bool update) 417 { 418 struct ext4_fc_dentry_update *node; 419 struct ext4_inode_info *ei = EXT4_I(inode); 420 struct __track_dentry_update_args *dentry_update = 421 (struct __track_dentry_update_args *)arg; 422 struct dentry *dentry = dentry_update->dentry; 423 struct inode *dir = dentry->d_parent->d_inode; 424 struct super_block *sb = inode->i_sb; 425 struct ext4_sb_info *sbi = EXT4_SB(sb); 426 427 mutex_unlock(&ei->i_fc_lock); 428 429 if (IS_ENCRYPTED(dir)) { 430 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, 431 NULL); 432 mutex_lock(&ei->i_fc_lock); 433 return -EOPNOTSUPP; 434 } 435 436 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 437 if (!node) { 438 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); 439 mutex_lock(&ei->i_fc_lock); 440 return -ENOMEM; 441 } 442 443 node->fcd_op = dentry_update->op; 444 node->fcd_parent = dir->i_ino; 445 node->fcd_ino = inode->i_ino; 446 if (dentry->d_name.len > DNAME_INLINE_LEN) { 447 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 448 if (!node->fcd_name.name) { 449 kmem_cache_free(ext4_fc_dentry_cachep, node); 450 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); 451 mutex_lock(&ei->i_fc_lock); 452 return -ENOMEM; 453 } 454 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 455 dentry->d_name.len); 456 } else { 457 memcpy(node->fcd_iname, dentry->d_name.name, 458 dentry->d_name.len); 459 node->fcd_name.name = node->fcd_iname; 460 } 461 node->fcd_name.len = dentry->d_name.len; 462 INIT_LIST_HEAD(&node->fcd_dilist); 463 spin_lock(&sbi->s_fc_lock); 464 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 465 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 466 list_add_tail(&node->fcd_list, 467 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 468 else 469 
list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 470 471 /* 472 * This helps us keep a track of all fc_dentry updates which is part of 473 * this ext4 inode. So in case the inode is getting unlinked, before 474 * even we get a chance to fsync, we could remove all fc_dentry 475 * references while evicting the inode in ext4_fc_del(). 476 * Also with this, we don't need to loop over all the inodes in 477 * sbi->s_fc_q to get the corresponding inode in 478 * ext4_fc_commit_dentry_updates(). 479 */ 480 if (dentry_update->op == EXT4_FC_TAG_CREAT) { 481 WARN_ON(!list_empty(&ei->i_fc_dilist)); 482 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); 483 } 484 spin_unlock(&sbi->s_fc_lock); 485 mutex_lock(&ei->i_fc_lock); 486 487 return 0; 488 } 489 490 void __ext4_fc_track_unlink(handle_t *handle, 491 struct inode *inode, struct dentry *dentry) 492 { 493 struct __track_dentry_update_args args; 494 int ret; 495 496 args.dentry = dentry; 497 args.op = EXT4_FC_TAG_UNLINK; 498 499 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 500 (void *)&args, 0); 501 trace_ext4_fc_track_unlink(handle, inode, dentry, ret); 502 } 503 504 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 505 { 506 struct inode *inode = d_inode(dentry); 507 508 if (ext4_fc_disabled(inode->i_sb)) 509 return; 510 511 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 512 return; 513 514 __ext4_fc_track_unlink(handle, inode, dentry); 515 } 516 517 void __ext4_fc_track_link(handle_t *handle, 518 struct inode *inode, struct dentry *dentry) 519 { 520 struct __track_dentry_update_args args; 521 int ret; 522 523 args.dentry = dentry; 524 args.op = EXT4_FC_TAG_LINK; 525 526 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 527 (void *)&args, 0); 528 trace_ext4_fc_track_link(handle, inode, dentry, ret); 529 } 530 531 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 532 { 533 struct inode *inode = d_inode(dentry); 534 535 if (ext4_fc_disabled(inode->i_sb)) 536 return; 537 538 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 539 return; 540 541 __ext4_fc_track_link(handle, inode, dentry); 542 } 543 544 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 545 struct dentry *dentry) 546 { 547 struct __track_dentry_update_args args; 548 int ret; 549 550 args.dentry = dentry; 551 args.op = EXT4_FC_TAG_CREAT; 552 553 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 554 (void *)&args, 0); 555 trace_ext4_fc_track_create(handle, inode, dentry, ret); 556 } 557 558 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 559 { 560 struct inode *inode = d_inode(dentry); 561 562 if (ext4_fc_disabled(inode->i_sb)) 563 return; 564 565 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 566 return; 567 568 __ext4_fc_track_create(handle, inode, dentry); 569 } 570 571 /* __track_fn for inode tracking */ 572 static int __track_inode(struct inode *inode, void *arg, bool update) 573 { 574 if (update) 575 return -EEXIST; 576 577 EXT4_I(inode)->i_fc_lblk_len = 0; 578 579 return 0; 580 } 581 582 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 583 { 584 int ret; 585 586 if (S_ISDIR(inode->i_mode)) 587 return; 588 589 if (ext4_fc_disabled(inode->i_sb)) 590 return; 591 592 if (ext4_should_journal_data(inode)) { 593 ext4_fc_mark_ineligible(inode->i_sb, 594 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 595 return; 596 } 597 598 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 599 return; 600 
	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	if (ext4_has_inline_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
					handle);
		return;
	}

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
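
/*
 * A worked example of the interval arithmetic in __track_range() above:
 * assume the inode already has i_fc_lblk_start = 10, i_fc_lblk_len = 5
 * tracked (blocks 10-14) and a new update comes in for blocks 3-12. Then:
 *
 *	i_fc_lblk_start = min(10, 3)                  = 3
 *	i_fc_lblk_len   = max(10 + 5 - 1, 12) - 3 + 1 = 12
 *
 * i.e. the tracked range becomes blocks 3-14, the union of the old and new
 * ranges. Only one such range is kept per inode per transaction, so sparse
 * updates are over-approximated by a single covering range; the commit path
 * (ext4_fc_write_inode_data()) later walks that whole range with
 * ext4_map_blocks() and emits ADD_RANGE / DEL_RANGE tags for what it finds.
 */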

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail block */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length is set to the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

	return true;
}

/* Same as above, but adds a dentry tlv.
*/ 822 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 823 struct ext4_fc_dentry_update *fc_dentry) 824 { 825 struct ext4_fc_dentry_info fcd; 826 struct ext4_fc_tl tl; 827 int dlen = fc_dentry->fcd_name.len; 828 u8 *dst = ext4_fc_reserve_space(sb, 829 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); 830 831 if (!dst) 832 return false; 833 834 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 835 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 836 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 837 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 838 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 839 dst += EXT4_FC_TAG_BASE_LEN; 840 memcpy(dst, &fcd, sizeof(fcd)); 841 dst += sizeof(fcd); 842 memcpy(dst, fc_dentry->fcd_name.name, dlen); 843 844 return true; 845 } 846 847 /* 848 * Writes inode in the fast commit space under TLV with tag @tag. 849 * Returns 0 on success, error on failure. 850 */ 851 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 852 { 853 struct ext4_inode_info *ei = EXT4_I(inode); 854 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 855 int ret; 856 struct ext4_iloc iloc; 857 struct ext4_fc_inode fc_inode; 858 struct ext4_fc_tl tl; 859 u8 *dst; 860 861 ret = ext4_get_inode_loc(inode, &iloc); 862 if (ret) 863 return ret; 864 865 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 866 inode_len = EXT4_INODE_SIZE(inode->i_sb); 867 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 868 inode_len += ei->i_extra_isize; 869 870 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 871 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 872 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 873 874 ret = -ECANCELED; 875 dst = ext4_fc_reserve_space(inode->i_sb, 876 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); 877 if (!dst) 878 goto err; 879 880 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 881 dst += EXT4_FC_TAG_BASE_LEN; 882 memcpy(dst, &fc_inode, sizeof(fc_inode)); 883 dst += sizeof(fc_inode); 884 memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); 885 ret = 0; 886 err: 887 brelse(iloc.bh); 888 return ret; 889 } 890 891 /* 892 * Writes updated data ranges for the inode in question. Updates CRC. 893 * Returns 0 on success, error otherwise. 
894 */ 895 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 896 { 897 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 898 struct ext4_inode_info *ei = EXT4_I(inode); 899 struct ext4_map_blocks map; 900 struct ext4_fc_add_range fc_ext; 901 struct ext4_fc_del_range lrange; 902 struct ext4_extent *ex; 903 int ret; 904 905 mutex_lock(&ei->i_fc_lock); 906 if (ei->i_fc_lblk_len == 0) { 907 mutex_unlock(&ei->i_fc_lock); 908 return 0; 909 } 910 old_blk_size = ei->i_fc_lblk_start; 911 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 912 ei->i_fc_lblk_len = 0; 913 mutex_unlock(&ei->i_fc_lock); 914 915 cur_lblk_off = old_blk_size; 916 ext4_debug("will try writing %d to %d for inode %ld\n", 917 cur_lblk_off, new_blk_size, inode->i_ino); 918 919 while (cur_lblk_off <= new_blk_size) { 920 map.m_lblk = cur_lblk_off; 921 map.m_len = new_blk_size - cur_lblk_off + 1; 922 ret = ext4_map_blocks(NULL, inode, &map, 0); 923 if (ret < 0) 924 return -ECANCELED; 925 926 if (map.m_len == 0) { 927 cur_lblk_off++; 928 continue; 929 } 930 931 if (ret == 0) { 932 lrange.fc_ino = cpu_to_le32(inode->i_ino); 933 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 934 lrange.fc_len = cpu_to_le32(map.m_len); 935 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 936 sizeof(lrange), (u8 *)&lrange, crc)) 937 return -ENOSPC; 938 } else { 939 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 940 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 941 942 /* Limit the number of blocks in one extent */ 943 map.m_len = min(max, map.m_len); 944 945 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 946 ex = (struct ext4_extent *)&fc_ext.fc_ex; 947 ex->ee_block = cpu_to_le32(map.m_lblk); 948 ex->ee_len = cpu_to_le16(map.m_len); 949 ext4_ext_store_pblock(ex, map.m_pblk); 950 if (map.m_flags & EXT4_MAP_UNWRITTEN) 951 ext4_ext_mark_unwritten(ex); 952 else 953 ext4_ext_mark_initialized(ex); 954 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 955 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 956 return -ENOSPC; 957 } 958 959 cur_lblk_off += map.m_len; 960 } 961 962 return 0; 963 } 964 965 966 /* Submit data for all the fast commit inodes */ 967 static int ext4_fc_submit_inode_data_all(journal_t *journal) 968 { 969 struct super_block *sb = journal->j_private; 970 struct ext4_sb_info *sbi = EXT4_SB(sb); 971 struct ext4_inode_info *ei; 972 int ret = 0; 973 974 spin_lock(&sbi->s_fc_lock); 975 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 976 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 977 while (atomic_read(&ei->i_fc_updates)) { 978 DEFINE_WAIT(wait); 979 980 prepare_to_wait(&ei->i_fc_wait, &wait, 981 TASK_UNINTERRUPTIBLE); 982 if (atomic_read(&ei->i_fc_updates)) { 983 spin_unlock(&sbi->s_fc_lock); 984 schedule(); 985 spin_lock(&sbi->s_fc_lock); 986 } 987 finish_wait(&ei->i_fc_wait, &wait); 988 } 989 spin_unlock(&sbi->s_fc_lock); 990 ret = jbd2_submit_inode_data(journal, ei->jinode); 991 if (ret) 992 return ret; 993 spin_lock(&sbi->s_fc_lock); 994 } 995 spin_unlock(&sbi->s_fc_lock); 996 997 return ret; 998 } 999 1000 /* Wait for completion of data for all the fast commit inodes */ 1001 static int ext4_fc_wait_inode_data_all(journal_t *journal) 1002 { 1003 struct super_block *sb = journal->j_private; 1004 struct ext4_sb_info *sbi = EXT4_SB(sb); 1005 struct ext4_inode_info *pos, *n; 1006 int ret = 0; 1007 1008 spin_lock(&sbi->s_fc_lock); 1009 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1010 if (!ext4_test_inode_state(&pos->vfs_inode, 1011 
EXT4_STATE_FC_COMMITTING)) 1012 continue; 1013 spin_unlock(&sbi->s_fc_lock); 1014 1015 ret = jbd2_wait_inode_data(journal, pos->jinode); 1016 if (ret) 1017 return ret; 1018 spin_lock(&sbi->s_fc_lock); 1019 } 1020 spin_unlock(&sbi->s_fc_lock); 1021 1022 return 0; 1023 } 1024 1025 /* Commit all the directory entry updates */ 1026 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 1027 __acquires(&sbi->s_fc_lock) 1028 __releases(&sbi->s_fc_lock) 1029 { 1030 struct super_block *sb = journal->j_private; 1031 struct ext4_sb_info *sbi = EXT4_SB(sb); 1032 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 1033 struct inode *inode; 1034 struct ext4_inode_info *ei; 1035 int ret; 1036 1037 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 1038 return 0; 1039 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 1040 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1041 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 1042 spin_unlock(&sbi->s_fc_lock); 1043 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1044 ret = -ENOSPC; 1045 goto lock_and_exit; 1046 } 1047 spin_lock(&sbi->s_fc_lock); 1048 continue; 1049 } 1050 /* 1051 * With fcd_dilist we need not loop in sbi->s_fc_q to get the 1052 * corresponding inode pointer 1053 */ 1054 WARN_ON(list_empty(&fc_dentry->fcd_dilist)); 1055 ei = list_first_entry(&fc_dentry->fcd_dilist, 1056 struct ext4_inode_info, i_fc_dilist); 1057 inode = &ei->vfs_inode; 1058 WARN_ON(inode->i_ino != fc_dentry->fcd_ino); 1059 1060 spin_unlock(&sbi->s_fc_lock); 1061 1062 /* 1063 * We first write the inode and then the create dirent. This 1064 * allows the recovery code to create an unnamed inode first 1065 * and then link it to a directory entry. This allows us 1066 * to use namei.c routines almost as is and simplifies 1067 * the recovery code. 1068 */ 1069 ret = ext4_fc_write_inode(inode, crc); 1070 if (ret) 1071 goto lock_and_exit; 1072 1073 ret = ext4_fc_write_inode_data(inode, crc); 1074 if (ret) 1075 goto lock_and_exit; 1076 1077 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1078 ret = -ENOSPC; 1079 goto lock_and_exit; 1080 } 1081 1082 spin_lock(&sbi->s_fc_lock); 1083 } 1084 return 0; 1085 lock_and_exit: 1086 spin_lock(&sbi->s_fc_lock); 1087 return ret; 1088 } 1089 1090 static int ext4_fc_perform_commit(journal_t *journal) 1091 { 1092 struct super_block *sb = journal->j_private; 1093 struct ext4_sb_info *sbi = EXT4_SB(sb); 1094 struct ext4_inode_info *iter; 1095 struct ext4_fc_head head; 1096 struct inode *inode; 1097 struct blk_plug plug; 1098 int ret = 0; 1099 u32 crc = 0; 1100 1101 ret = ext4_fc_submit_inode_data_all(journal); 1102 if (ret) 1103 return ret; 1104 1105 ret = ext4_fc_wait_inode_data_all(journal); 1106 if (ret) 1107 return ret; 1108 1109 /* 1110 * If file system device is different from journal device, issue a cache 1111 * flush before we start writing fast commit blocks. 1112 */ 1113 if (journal->j_fs_dev != journal->j_dev) 1114 blkdev_issue_flush(journal->j_fs_dev); 1115 1116 blk_start_plug(&plug); 1117 if (sbi->s_fc_bytes == 0) { 1118 /* 1119 * Add a head tag only if this is the first fast commit 1120 * in this TID. 
1121 */ 1122 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1123 head.fc_tid = cpu_to_le32( 1124 sbi->s_journal->j_running_transaction->t_tid); 1125 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1126 (u8 *)&head, &crc)) { 1127 ret = -ENOSPC; 1128 goto out; 1129 } 1130 } 1131 1132 spin_lock(&sbi->s_fc_lock); 1133 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1134 if (ret) { 1135 spin_unlock(&sbi->s_fc_lock); 1136 goto out; 1137 } 1138 1139 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1140 inode = &iter->vfs_inode; 1141 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1142 continue; 1143 1144 spin_unlock(&sbi->s_fc_lock); 1145 ret = ext4_fc_write_inode_data(inode, &crc); 1146 if (ret) 1147 goto out; 1148 ret = ext4_fc_write_inode(inode, &crc); 1149 if (ret) 1150 goto out; 1151 spin_lock(&sbi->s_fc_lock); 1152 } 1153 spin_unlock(&sbi->s_fc_lock); 1154 1155 ret = ext4_fc_write_tail(sb, crc); 1156 1157 out: 1158 blk_finish_plug(&plug); 1159 return ret; 1160 } 1161 1162 static void ext4_fc_update_stats(struct super_block *sb, int status, 1163 u64 commit_time, int nblks, tid_t commit_tid) 1164 { 1165 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1166 1167 ext4_debug("Fast commit ended with status = %d for tid %u", 1168 status, commit_tid); 1169 if (status == EXT4_FC_STATUS_OK) { 1170 stats->fc_num_commits++; 1171 stats->fc_numblks += nblks; 1172 if (likely(stats->s_fc_avg_commit_time)) 1173 stats->s_fc_avg_commit_time = 1174 (commit_time + 1175 stats->s_fc_avg_commit_time * 3) / 4; 1176 else 1177 stats->s_fc_avg_commit_time = commit_time; 1178 } else if (status == EXT4_FC_STATUS_FAILED || 1179 status == EXT4_FC_STATUS_INELIGIBLE) { 1180 if (status == EXT4_FC_STATUS_FAILED) 1181 stats->fc_failed_commits++; 1182 stats->fc_ineligible_commits++; 1183 } else { 1184 stats->fc_skipped_commits++; 1185 } 1186 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); 1187 } 1188 1189 /* 1190 * The main commit entry point. Performs a fast commit for transaction 1191 * commit_tid if needed. If it's not possible to perform a fast commit 1192 * due to various reasons, we fall back to full commit. Returns 0 1193 * on success, error otherwise. 1194 */ 1195 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1196 { 1197 struct super_block *sb = journal->j_private; 1198 struct ext4_sb_info *sbi = EXT4_SB(sb); 1199 int nblks = 0, ret, bsize = journal->j_blocksize; 1200 int subtid = atomic_read(&sbi->s_fc_subtid); 1201 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1202 ktime_t start_time, commit_time; 1203 1204 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1205 return jbd2_complete_transaction(journal, commit_tid); 1206 1207 trace_ext4_fc_commit_start(sb, commit_tid); 1208 1209 start_time = ktime_get(); 1210 1211 restart_fc: 1212 ret = jbd2_fc_begin_commit(journal, commit_tid); 1213 if (ret == -EALREADY) { 1214 /* There was an ongoing commit, check if we need to restart */ 1215 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1216 commit_tid > journal->j_commit_sequence) 1217 goto restart_fc; 1218 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, 1219 commit_tid); 1220 return 0; 1221 } else if (ret) { 1222 /* 1223 * Commit couldn't start. Just update stats and perform a 1224 * full commit. 
1225 */ 1226 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, 1227 commit_tid); 1228 return jbd2_complete_transaction(journal, commit_tid); 1229 } 1230 1231 /* 1232 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1233 * if we are fast commit ineligible. 1234 */ 1235 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1236 status = EXT4_FC_STATUS_INELIGIBLE; 1237 goto fallback; 1238 } 1239 1240 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1241 ret = ext4_fc_perform_commit(journal); 1242 if (ret < 0) { 1243 status = EXT4_FC_STATUS_FAILED; 1244 goto fallback; 1245 } 1246 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1247 ret = jbd2_fc_wait_bufs(journal, nblks); 1248 if (ret < 0) { 1249 status = EXT4_FC_STATUS_FAILED; 1250 goto fallback; 1251 } 1252 atomic_inc(&sbi->s_fc_subtid); 1253 ret = jbd2_fc_end_commit(journal); 1254 /* 1255 * weight the commit time higher than the average time so we 1256 * don't react too strongly to vast changes in the commit time 1257 */ 1258 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1259 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); 1260 return ret; 1261 1262 fallback: 1263 ret = jbd2_fc_end_commit_fallback(journal); 1264 ext4_fc_update_stats(sb, status, 0, 0, commit_tid); 1265 return ret; 1266 } 1267 1268 /* 1269 * Fast commit cleanup routine. This is called after every fast commit and 1270 * full commit. full is true if we are called after a full commit. 1271 */ 1272 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1273 { 1274 struct super_block *sb = journal->j_private; 1275 struct ext4_sb_info *sbi = EXT4_SB(sb); 1276 struct ext4_inode_info *iter, *iter_n; 1277 struct ext4_fc_dentry_update *fc_dentry; 1278 1279 if (full && sbi->s_fc_bh) 1280 sbi->s_fc_bh = NULL; 1281 1282 trace_ext4_fc_cleanup(journal, full, tid); 1283 jbd2_fc_release_bufs(journal); 1284 1285 spin_lock(&sbi->s_fc_lock); 1286 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1287 i_fc_list) { 1288 list_del_init(&iter->i_fc_list); 1289 ext4_clear_inode_state(&iter->vfs_inode, 1290 EXT4_STATE_FC_COMMITTING); 1291 if (iter->i_sync_tid <= tid) 1292 ext4_fc_reset_inode(&iter->vfs_inode); 1293 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1294 smp_mb(); 1295 #if (BITS_PER_LONG < 64) 1296 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1297 #else 1298 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1299 #endif 1300 } 1301 1302 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1303 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1304 struct ext4_fc_dentry_update, 1305 fcd_list); 1306 list_del_init(&fc_dentry->fcd_list); 1307 list_del_init(&fc_dentry->fcd_dilist); 1308 spin_unlock(&sbi->s_fc_lock); 1309 1310 if (fc_dentry->fcd_name.name && 1311 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1312 kfree(fc_dentry->fcd_name.name); 1313 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1314 spin_lock(&sbi->s_fc_lock); 1315 } 1316 1317 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1318 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1319 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1320 &sbi->s_fc_q[FC_Q_MAIN]); 1321 1322 if (tid >= sbi->s_fc_ineligible_tid) { 1323 sbi->s_fc_ineligible_tid = 0; 1324 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1325 } 1326 1327 if (full) 1328 sbi->s_fc_bytes = 0; 1329 spin_unlock(&sbi->s_fc_lock); 1330 trace_ext4_fc_stats(sb); 1331 } 1332 1333 /* Ext4 Replay Path Routines */ 1334 1335 /* Helper struct for 
dentry replay routines */ 1336 struct dentry_info_args { 1337 int parent_ino, dname_len, ino, inode_len; 1338 char *dname; 1339 }; 1340 1341 /* Same as struct ext4_fc_tl, but uses native endianness fields */ 1342 struct ext4_fc_tl_mem { 1343 u16 fc_tag; 1344 u16 fc_len; 1345 }; 1346 1347 static inline void tl_to_darg(struct dentry_info_args *darg, 1348 struct ext4_fc_tl_mem *tl, u8 *val) 1349 { 1350 struct ext4_fc_dentry_info fcd; 1351 1352 memcpy(&fcd, val, sizeof(fcd)); 1353 1354 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1355 darg->ino = le32_to_cpu(fcd.fc_ino); 1356 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1357 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); 1358 } 1359 1360 static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) 1361 { 1362 struct ext4_fc_tl tl_disk; 1363 1364 memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); 1365 tl->fc_len = le16_to_cpu(tl_disk.fc_len); 1366 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); 1367 } 1368 1369 /* Unlink replay function */ 1370 static int ext4_fc_replay_unlink(struct super_block *sb, 1371 struct ext4_fc_tl_mem *tl, u8 *val) 1372 { 1373 struct inode *inode, *old_parent; 1374 struct qstr entry; 1375 struct dentry_info_args darg; 1376 int ret = 0; 1377 1378 tl_to_darg(&darg, tl, val); 1379 1380 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1381 darg.parent_ino, darg.dname_len); 1382 1383 entry.name = darg.dname; 1384 entry.len = darg.dname_len; 1385 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1386 1387 if (IS_ERR(inode)) { 1388 ext4_debug("Inode %d not found", darg.ino); 1389 return 0; 1390 } 1391 1392 old_parent = ext4_iget(sb, darg.parent_ino, 1393 EXT4_IGET_NORMAL); 1394 if (IS_ERR(old_parent)) { 1395 ext4_debug("Dir with inode %d not found", darg.parent_ino); 1396 iput(inode); 1397 return 0; 1398 } 1399 1400 ret = __ext4_unlink(old_parent, &entry, inode, NULL); 1401 /* -ENOENT ok coz it might not exist anymore. */ 1402 if (ret == -ENOENT) 1403 ret = 0; 1404 iput(old_parent); 1405 iput(inode); 1406 return ret; 1407 } 1408 1409 static int ext4_fc_replay_link_internal(struct super_block *sb, 1410 struct dentry_info_args *darg, 1411 struct inode *inode) 1412 { 1413 struct inode *dir = NULL; 1414 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1415 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1416 int ret = 0; 1417 1418 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1419 if (IS_ERR(dir)) { 1420 ext4_debug("Dir with inode %d not found.", darg->parent_ino); 1421 dir = NULL; 1422 goto out; 1423 } 1424 1425 dentry_dir = d_obtain_alias(dir); 1426 if (IS_ERR(dentry_dir)) { 1427 ext4_debug("Failed to obtain dentry"); 1428 dentry_dir = NULL; 1429 goto out; 1430 } 1431 1432 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1433 if (!dentry_inode) { 1434 ext4_debug("Inode dentry not created."); 1435 ret = -ENOMEM; 1436 goto out; 1437 } 1438 1439 ret = __ext4_link(dir, inode, dentry_inode); 1440 /* 1441 * It's possible that link already existed since data blocks 1442 * for the dir in question got persisted before we crashed OR 1443 * we replayed this tag and crashed before the entire replay 1444 * could complete. 
1445 */ 1446 if (ret && ret != -EEXIST) { 1447 ext4_debug("Failed to link\n"); 1448 goto out; 1449 } 1450 1451 ret = 0; 1452 out: 1453 if (dentry_dir) { 1454 d_drop(dentry_dir); 1455 dput(dentry_dir); 1456 } else if (dir) { 1457 iput(dir); 1458 } 1459 if (dentry_inode) { 1460 d_drop(dentry_inode); 1461 dput(dentry_inode); 1462 } 1463 1464 return ret; 1465 } 1466 1467 /* Link replay function */ 1468 static int ext4_fc_replay_link(struct super_block *sb, 1469 struct ext4_fc_tl_mem *tl, u8 *val) 1470 { 1471 struct inode *inode; 1472 struct dentry_info_args darg; 1473 int ret = 0; 1474 1475 tl_to_darg(&darg, tl, val); 1476 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1477 darg.parent_ino, darg.dname_len); 1478 1479 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1480 if (IS_ERR(inode)) { 1481 ext4_debug("Inode not found."); 1482 return 0; 1483 } 1484 1485 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1486 iput(inode); 1487 return ret; 1488 } 1489 1490 /* 1491 * Record all the modified inodes during replay. We use this later to setup 1492 * block bitmaps correctly. 1493 */ 1494 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1495 { 1496 struct ext4_fc_replay_state *state; 1497 int i; 1498 1499 state = &EXT4_SB(sb)->s_fc_replay_state; 1500 for (i = 0; i < state->fc_modified_inodes_used; i++) 1501 if (state->fc_modified_inodes[i] == ino) 1502 return 0; 1503 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1504 int *fc_modified_inodes; 1505 1506 fc_modified_inodes = krealloc(state->fc_modified_inodes, 1507 sizeof(int) * (state->fc_modified_inodes_size + 1508 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1509 GFP_KERNEL); 1510 if (!fc_modified_inodes) 1511 return -ENOMEM; 1512 state->fc_modified_inodes = fc_modified_inodes; 1513 state->fc_modified_inodes_size += 1514 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1515 } 1516 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1517 return 0; 1518 } 1519 1520 /* 1521 * Inode replay function 1522 */ 1523 static int ext4_fc_replay_inode(struct super_block *sb, 1524 struct ext4_fc_tl_mem *tl, u8 *val) 1525 { 1526 struct ext4_fc_inode fc_inode; 1527 struct ext4_inode *raw_inode; 1528 struct ext4_inode *raw_fc_inode; 1529 struct inode *inode = NULL; 1530 struct ext4_iloc iloc; 1531 int inode_len, ino, ret, tag = tl->fc_tag; 1532 struct ext4_extent_header *eh; 1533 size_t off_gen = offsetof(struct ext4_inode, i_generation); 1534 1535 memcpy(&fc_inode, val, sizeof(fc_inode)); 1536 1537 ino = le32_to_cpu(fc_inode.fc_ino); 1538 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1539 1540 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1541 if (!IS_ERR(inode)) { 1542 ext4_ext_clear_bb(inode); 1543 iput(inode); 1544 } 1545 inode = NULL; 1546 1547 ret = ext4_fc_record_modified_inode(sb, ino); 1548 if (ret) 1549 goto out; 1550 1551 raw_fc_inode = (struct ext4_inode *) 1552 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1553 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1554 if (ret) 1555 goto out; 1556 1557 inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); 1558 raw_inode = ext4_raw_inode(&iloc); 1559 1560 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1561 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, 1562 inode_len - off_gen); 1563 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1564 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1565 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1566 memset(eh, 0, sizeof(*eh)); 1567 eh->eh_magic = EXT4_EXT_MAGIC; 
1568 eh->eh_max = cpu_to_le16( 1569 (sizeof(raw_inode->i_block) - 1570 sizeof(struct ext4_extent_header)) 1571 / sizeof(struct ext4_extent)); 1572 } 1573 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1574 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1575 sizeof(raw_inode->i_block)); 1576 } 1577 1578 /* Immediately update the inode on disk. */ 1579 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1580 if (ret) 1581 goto out; 1582 ret = sync_dirty_buffer(iloc.bh); 1583 if (ret) 1584 goto out; 1585 ret = ext4_mark_inode_used(sb, ino); 1586 if (ret) 1587 goto out; 1588 1589 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1590 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1591 if (IS_ERR(inode)) { 1592 ext4_debug("Inode not found."); 1593 return -EFSCORRUPTED; 1594 } 1595 1596 /* 1597 * Our allocator could have made different decisions than before 1598 * crashing. This should be fixed but until then, we calculate 1599 * the number of blocks the inode. 1600 */ 1601 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1602 ext4_ext_replay_set_iblocks(inode); 1603 1604 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1605 ext4_reset_inode_seed(inode); 1606 1607 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1608 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1609 sync_dirty_buffer(iloc.bh); 1610 brelse(iloc.bh); 1611 out: 1612 iput(inode); 1613 if (!ret) 1614 blkdev_issue_flush(sb->s_bdev); 1615 1616 return 0; 1617 } 1618 1619 /* 1620 * Dentry create replay function. 1621 * 1622 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 1623 * inode for which we are trying to create a dentry here, should already have 1624 * been replayed before we start here. 1625 */ 1626 static int ext4_fc_replay_create(struct super_block *sb, 1627 struct ext4_fc_tl_mem *tl, u8 *val) 1628 { 1629 int ret = 0; 1630 struct inode *inode = NULL; 1631 struct inode *dir = NULL; 1632 struct dentry_info_args darg; 1633 1634 tl_to_darg(&darg, tl, val); 1635 1636 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1637 darg.parent_ino, darg.dname_len); 1638 1639 /* This takes care of update group descriptor and other metadata */ 1640 ret = ext4_mark_inode_used(sb, darg.ino); 1641 if (ret) 1642 goto out; 1643 1644 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1645 if (IS_ERR(inode)) { 1646 ext4_debug("inode %d not found.", darg.ino); 1647 inode = NULL; 1648 ret = -EINVAL; 1649 goto out; 1650 } 1651 1652 if (S_ISDIR(inode->i_mode)) { 1653 /* 1654 * If we are creating a directory, we need to make sure that the 1655 * dot and dot dot dirents are setup properly. 1656 */ 1657 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1658 if (IS_ERR(dir)) { 1659 ext4_debug("Dir %d not found.", darg.ino); 1660 goto out; 1661 } 1662 ret = ext4_init_new_dir(NULL, dir, inode); 1663 iput(dir); 1664 if (ret) { 1665 ret = 0; 1666 goto out; 1667 } 1668 } 1669 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1670 if (ret) 1671 goto out; 1672 set_nlink(inode, 1); 1673 ext4_mark_inode_dirty(NULL, inode); 1674 out: 1675 iput(inode); 1676 return ret; 1677 } 1678 1679 /* 1680 * Record physical disk regions which are in use as per fast commit area, 1681 * and used by inodes during replay phase. Our simple replay phase 1682 * allocator excludes these regions from allocation. 
1683 */ 1684 int ext4_fc_record_regions(struct super_block *sb, int ino, 1685 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 1686 { 1687 struct ext4_fc_replay_state *state; 1688 struct ext4_fc_alloc_region *region; 1689 1690 state = &EXT4_SB(sb)->s_fc_replay_state; 1691 /* 1692 * during replay phase, the fc_regions_valid may not same as 1693 * fc_regions_used, update it when do new additions. 1694 */ 1695 if (replay && state->fc_regions_used != state->fc_regions_valid) 1696 state->fc_regions_used = state->fc_regions_valid; 1697 if (state->fc_regions_used == state->fc_regions_size) { 1698 struct ext4_fc_alloc_region *fc_regions; 1699 1700 fc_regions = krealloc(state->fc_regions, 1701 sizeof(struct ext4_fc_alloc_region) * 1702 (state->fc_regions_size + 1703 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1704 GFP_KERNEL); 1705 if (!fc_regions) 1706 return -ENOMEM; 1707 state->fc_regions_size += 1708 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1709 state->fc_regions = fc_regions; 1710 } 1711 region = &state->fc_regions[state->fc_regions_used++]; 1712 region->ino = ino; 1713 region->lblk = lblk; 1714 region->pblk = pblk; 1715 region->len = len; 1716 1717 if (replay) 1718 state->fc_regions_valid++; 1719 1720 return 0; 1721 } 1722 1723 /* Replay add range tag */ 1724 static int ext4_fc_replay_add_range(struct super_block *sb, 1725 struct ext4_fc_tl_mem *tl, u8 *val) 1726 { 1727 struct ext4_fc_add_range fc_add_ex; 1728 struct ext4_extent newex, *ex; 1729 struct inode *inode; 1730 ext4_lblk_t start, cur; 1731 int remaining, len; 1732 ext4_fsblk_t start_pblk; 1733 struct ext4_map_blocks map; 1734 struct ext4_ext_path *path = NULL; 1735 int ret; 1736 1737 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1738 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1739 1740 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1741 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1742 ext4_ext_get_actual_len(ex)); 1743 1744 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1745 if (IS_ERR(inode)) { 1746 ext4_debug("Inode not found."); 1747 return 0; 1748 } 1749 1750 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1751 if (ret) 1752 goto out; 1753 1754 start = le32_to_cpu(ex->ee_block); 1755 start_pblk = ext4_ext_pblock(ex); 1756 len = ext4_ext_get_actual_len(ex); 1757 1758 cur = start; 1759 remaining = len; 1760 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1761 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1762 inode->i_ino); 1763 1764 while (remaining > 0) { 1765 map.m_lblk = cur; 1766 map.m_len = remaining; 1767 map.m_pblk = 0; 1768 ret = ext4_map_blocks(NULL, inode, &map, 0); 1769 1770 if (ret < 0) 1771 goto out; 1772 1773 if (ret == 0) { 1774 /* Range is not mapped */ 1775 path = ext4_find_extent(inode, cur, NULL, 0); 1776 if (IS_ERR(path)) 1777 goto out; 1778 memset(&newex, 0, sizeof(newex)); 1779 newex.ee_block = cpu_to_le32(cur); 1780 ext4_ext_store_pblock( 1781 &newex, start_pblk + cur - start); 1782 newex.ee_len = cpu_to_le16(map.m_len); 1783 if (ext4_ext_is_unwritten(ex)) 1784 ext4_ext_mark_unwritten(&newex); 1785 down_write(&EXT4_I(inode)->i_data_sem); 1786 ret = ext4_ext_insert_extent( 1787 NULL, inode, &path, &newex, 0); 1788 up_write((&EXT4_I(inode)->i_data_sem)); 1789 ext4_free_ext_path(path); 1790 if (ret) 1791 goto out; 1792 goto next; 1793 } 1794 1795 if (start_pblk + cur - start != map.m_pblk) { 1796 /* 1797 * Logical to physical mapping changed. 
This can happen 1798 * if this range was removed and then reallocated to 1799 * map to new physical blocks during a fast commit. 1800 */ 1801 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1802 ext4_ext_is_unwritten(ex), 1803 start_pblk + cur - start); 1804 if (ret) 1805 goto out; 1806 /* 1807 * Mark the old blocks as free since they aren't used 1808 * anymore. We maintain an array of all the modified 1809 * inodes. In case these blocks are still used at either 1810 * a different logical range in the same inode or in 1811 * some different inode, we will mark them as allocated 1812 * at the end of the FC replay using our array of 1813 * modified inodes. 1814 */ 1815 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1816 goto next; 1817 } 1818 1819 /* Range is mapped and needs a state change */ 1820 ext4_debug("Converting from %ld to %d %lld", 1821 map.m_flags & EXT4_MAP_UNWRITTEN, 1822 ext4_ext_is_unwritten(ex), map.m_pblk); 1823 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1824 ext4_ext_is_unwritten(ex), map.m_pblk); 1825 if (ret) 1826 goto out; 1827 /* 1828 * We may have split the extent tree while toggling the state. 1829 * Try to shrink the extent tree now. 1830 */ 1831 ext4_ext_replay_shrink_inode(inode, start + len); 1832 next: 1833 cur += map.m_len; 1834 remaining -= map.m_len; 1835 } 1836 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1837 sb->s_blocksize_bits); 1838 out: 1839 iput(inode); 1840 return 0; 1841 } 1842 1843 /* Replay DEL_RANGE tag */ 1844 static int 1845 ext4_fc_replay_del_range(struct super_block *sb, 1846 struct ext4_fc_tl_mem *tl, u8 *val) 1847 { 1848 struct inode *inode; 1849 struct ext4_fc_del_range lrange; 1850 struct ext4_map_blocks map; 1851 ext4_lblk_t cur, remaining; 1852 int ret; 1853 1854 memcpy(&lrange, val, sizeof(lrange)); 1855 cur = le32_to_cpu(lrange.fc_lblk); 1856 remaining = le32_to_cpu(lrange.fc_len); 1857 1858 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1859 le32_to_cpu(lrange.fc_ino), cur, remaining); 1860 1861 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1862 if (IS_ERR(inode)) { 1863 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1864 return 0; 1865 } 1866 1867 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1868 if (ret) 1869 goto out; 1870 1871 ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", 1872 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1873 le32_to_cpu(lrange.fc_len)); 1874 while (remaining > 0) { 1875 map.m_lblk = cur; 1876 map.m_len = remaining; 1877 1878 ret = ext4_map_blocks(NULL, inode, &map, 0); 1879 if (ret < 0) 1880 goto out; 1881 if (ret > 0) { 1882 remaining -= ret; 1883 cur += ret; 1884 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1885 } else { 1886 remaining -= map.m_len; 1887 cur += map.m_len; 1888 } 1889 } 1890 1891 down_write(&EXT4_I(inode)->i_data_sem); 1892 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1893 le32_to_cpu(lrange.fc_lblk) + 1894 le32_to_cpu(lrange.fc_len) - 1); 1895 up_write(&EXT4_I(inode)->i_data_sem); 1896 if (ret) 1897 goto out; 1898 ext4_ext_replay_shrink_inode(inode, 1899 i_size_read(inode) >> sb->s_blocksize_bits); 1900 ext4_mark_inode_dirty(NULL, inode); 1901 out: 1902 iput(inode); 1903 return 0; 1904 } 1905 1906 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1907 { 1908 struct ext4_fc_replay_state *state; 1909 struct inode *inode; 1910 struct ext4_ext_path *path = NULL; 1911 struct ext4_map_blocks map; 1912 int i, ret, j; 1913 ext4_lblk_t cur, end; 1914 1915 state 
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				   state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_free_ext_path(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * check whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
					state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/* Sanity-check the value length recorded for a given tag. */
static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
				      int tag, int len)
{
	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		return len == sizeof(struct ext4_fc_add_range);
	case EXT4_FC_TAG_DEL_RANGE:
		return len == sizeof(struct ext4_fc_del_range);
	case EXT4_FC_TAG_CREAT:
	case EXT4_FC_TAG_LINK:
	case EXT4_FC_TAG_UNLINK:
		len -= sizeof(struct ext4_fc_dentry_info);
		return len >= 1 && len <= EXT4_NAME_LEN;
	case EXT4_FC_TAG_INODE:
		len -= sizeof(struct ext4_fc_inode);
		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
			len <= sbi->s_inode_size;
	case EXT4_FC_TAG_PAD:
		return true; /* padding can have any length */
	case EXT4_FC_TAG_TAIL:
		return len >= sizeof(struct ext4_fc_tail);
	case EXT4_FC_TAG_HEAD:
		return len == sizeof(struct ext4_fc_head);
	}
	return false;
}

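/*
 * Both the scan and replay handlers below walk a fast commit block as a
 * flat sequence of TLVs, e.g.:
 *
 *   [HEAD] [ADD_RANGE] [INODE] ... [TAIL] [PAD to the end of the block]
 *
 * Each loop iteration advances the cursor by EXT4_FC_TAG_BASE_LEN plus the
 * tag's value length. The scan pass rejects a block whose length fields
 * would run past the end of the buffer or whose per-tag value sizes fail
 * ext4_fc_value_len_isvalid().
 */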
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. A negative return value indicates an error. At the end of a
 * successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set to
 * the number of tags that need to be replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl_mem tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		if (tl.fc_len > end - val ||
		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

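/*
 * Note that the running CRC above covers every TLV since the previous tail
 * (or the head), and, for the tail itself, only the bytes up to its fc_crc
 * field. A tail whose TID or CRC does not match stops the scan without
 * invalidating the tags already accepted, or fails the scan with -EFSBADCRC
 * if no valid tail has been seen yet. This is what lets several consecutive
 * fast commits, each terminated by its own tail, share the fast commit area.
 */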
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl_mem tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

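/*
 * ext4_fc_init - register the fast commit callbacks with JBD2.
 *
 * The replay callback is registered unconditionally (see the comment in the
 * body), while the cleanup callback is only needed when fast commits are
 * actually enabled on this mount.
 */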
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}