// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and instead derived during
 *				  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, a file named A would be absent when we try to read A. So, this sequence
 * of operations is not idempotent. However, as mentioned above, instead of
 * storing the procedure fast commits store the outcome of each procedure. Thus
 * the fast commit log for the above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during the
 * replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found).
 *    In order to deal with that, fast commit replay code should ensure that
 *    the "FC_REPLAY" superblock state is persisted before starting the replay,
 *    so that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during the fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commits about the start of an inode update.
 *
 * This function is called by high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
	    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

	return;
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (ext4_fc_disabled(sb))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
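
/*
 * The ext4_fc_track_* helpers below are called from the directory entry and
 * inode update paths (unlink, link, create and block allocation changes) to
 * queue the corresponding deltas for the next fast commit.
 */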

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all fc_dentry updates that are part of
	 * this ext4 inode.
	 * So in case the inode is getting unlinked, before
	 * we even get a chance to fsync, we could remove all fc_dentry
	 * references while evicting the inode in ext4_fc_del().
	 * Also with this, we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}
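
/*
 * Arguments for __track_range(); both 'start' and 'end' are inclusive logical
 * block numbers, so a single-block update is tracked as start == end.
 */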
struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if this is the tail block */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing a tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}
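
/*
 * For example, a DEL_RANGE record produced via ext4_fc_add_tlv() is laid out
 * on disk as a struct ext4_fc_tl {EXT4_FC_TAG_DEL_RANGE, sizeof(struct
 * ext4_fc_del_range)} header immediately followed by the struct
 * ext4_fc_del_range payload.
 */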

/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		goto err;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		goto err;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		goto err;
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
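
/*
 * Write out everything queued for this fast commit: flush and wait on inode
 * data, emit a HEAD tag if this is the first fast commit of the tid, then the
 * dentry updates, the changed inodes and their ranges, and finally the TAIL.
 */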
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
			status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */
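
/*
 * Replay is driven by jbd2 in two passes: a scan pass (ext4_fc_replay_scan())
 * that validates tags and CRCs and records excluded regions, and a replay
 * pass that applies each tag using the handlers below.
 */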

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		ext4_debug("Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		ext4_debug("Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		ext4_debug("Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		ext4_debug("Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
					 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks the inode occupies here.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used, so update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed.
			 * This can happen if this range was removed and then
			 * reallocated to map to new physical blocks during a
			 * fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}
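
/*
 * After all tags have been replayed, walk every inode recorded as modified and
 * mark its extent tree blocks and mapped data blocks as in-use in the block
 * bitmaps, so the on-disk allocation state matches the replayed extents.
 */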
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				   state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block falls in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
			     state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for the
 * following:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
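 *
 * Note on the walk below: each fast commit block is parsed as a flat
 * sequence of TLV entries, with 'cur' advanced by sizeof(struct ext4_fc_tl)
 * plus the entry's fc_len on every iteration. The running checksum kept in
 * state->fc_crc is only accepted once a TAIL entry is seen whose fc_tid and
 * fc_crc match the expected values; until then, the tags counted so far are
 * provisional.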
 */
static int ext4_fc_replay_scan(journal_t *journal,
			       struct buffer_head *bh, int off,
			       tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
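 *
 * During PASS_SCAN this function simply delegates to ext4_fc_replay_scan().
 * During the replay pass it walks the same TLV stream again, dispatching
 * each tag to its replay handler (ext4_fc_replay_link(),
 * ext4_fc_replay_add_range(), etc.) and stopping once the
 * fc_replay_num_tags entries counted during the scan have been consumed.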
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
			  enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		ext4_debug("Replay phase, tag:%s\n",
			   tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			   stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}