// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity; please read the
 *     following section for more details)
 * [7] Wait for [4], [5] and [6] to complete
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space.
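 *
 * (For reference, the tail is itself an ordinary TLV: a struct ext4_fc_tl
 * header carrying EXT4_FC_TAG_TAIL followed by a struct ext4_fc_tail, i.e.
 * the TID and the running CRC. This is only a layout sketch; see
 * fast_commit.h for the authoritative definitions.)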
 * Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like
 * this:
 *	[HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *	       |<---  Fast Commit 1  --->|<---    Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when the recovery code runs, it needs to "enforce" this state on the
 * file system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how
 * fast commits make it idempotent. Consider the following sequence of
 * operations:
 *
 *	rm A;    mv B A;    read A
 *	     (x)	 (y)	   (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that during replay we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, the file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)		     (x)		(y)	     (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, then
 *    after trying to do recovery again, we will find a file system where the
 *    fast commit area is invalid (because a new full commit would be found).
 *    In order to deal with that, the fast commit replay code should ensure
 *    that the "FC_REPLAY" superblock state is persisted before starting the
 *    replay, so that after the crash, the fast commit recovery code can look
 *    at that flag and perform fast commit recovery even if that area is
 *    invalidated by later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

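/*
 * A minimal sketch of the expected calling pattern (the real call sites live
 * in the higher level ext4 code; the update in the middle is only
 * illustrative): an inode update is bracketed so that a concurrent fast
 * commit can wait for it to finish:
 *
 *	ext4_fc_start_update(inode);
 *	... perform the journalled inode update ...
 *	ext4_fc_stop_update(inode);
 */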
/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that until the recorded
 * transaction, commit operations would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a
 * full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
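 *
 * For illustration (a sketch only; the actual hooks are defined further below
 * in this file), the inode tracking hook plugs its __fc_track_fn in as:
 *
 *	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);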
342 */ 343 static int ext4_fc_track_template( 344 handle_t *handle, struct inode *inode, 345 int (*__fc_track_fn)(struct inode *, void *, bool), 346 void *args, int enqueue) 347 { 348 bool update = false; 349 struct ext4_inode_info *ei = EXT4_I(inode); 350 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 351 tid_t tid = 0; 352 int ret; 353 354 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 355 (sbi->s_mount_state & EXT4_FC_REPLAY)) 356 return -EOPNOTSUPP; 357 358 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 359 return -EINVAL; 360 361 tid = handle->h_transaction->t_tid; 362 mutex_lock(&ei->i_fc_lock); 363 if (tid == ei->i_sync_tid) { 364 update = true; 365 } else { 366 ext4_fc_reset_inode(inode); 367 ei->i_sync_tid = tid; 368 } 369 ret = __fc_track_fn(inode, args, update); 370 mutex_unlock(&ei->i_fc_lock); 371 372 if (!enqueue) 373 return ret; 374 375 spin_lock(&sbi->s_fc_lock); 376 if (list_empty(&EXT4_I(inode)->i_fc_list)) 377 list_add_tail(&EXT4_I(inode)->i_fc_list, 378 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 379 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 380 &sbi->s_fc_q[FC_Q_STAGING] : 381 &sbi->s_fc_q[FC_Q_MAIN]); 382 spin_unlock(&sbi->s_fc_lock); 383 384 return ret; 385 } 386 387 struct __track_dentry_update_args { 388 struct dentry *dentry; 389 int op; 390 }; 391 392 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ 393 static int __track_dentry_update(struct inode *inode, void *arg, bool update) 394 { 395 struct ext4_fc_dentry_update *node; 396 struct ext4_inode_info *ei = EXT4_I(inode); 397 struct __track_dentry_update_args *dentry_update = 398 (struct __track_dentry_update_args *)arg; 399 struct dentry *dentry = dentry_update->dentry; 400 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 401 402 mutex_unlock(&ei->i_fc_lock); 403 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 404 if (!node) { 405 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); 406 mutex_lock(&ei->i_fc_lock); 407 return -ENOMEM; 408 } 409 410 node->fcd_op = dentry_update->op; 411 node->fcd_parent = dentry->d_parent->d_inode->i_ino; 412 node->fcd_ino = inode->i_ino; 413 if (dentry->d_name.len > DNAME_INLINE_LEN) { 414 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 415 if (!node->fcd_name.name) { 416 kmem_cache_free(ext4_fc_dentry_cachep, node); 417 ext4_fc_mark_ineligible(inode->i_sb, 418 EXT4_FC_REASON_NOMEM, NULL); 419 mutex_lock(&ei->i_fc_lock); 420 return -ENOMEM; 421 } 422 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 423 dentry->d_name.len); 424 } else { 425 memcpy(node->fcd_iname, dentry->d_name.name, 426 dentry->d_name.len); 427 node->fcd_name.name = node->fcd_iname; 428 } 429 node->fcd_name.len = dentry->d_name.len; 430 431 spin_lock(&sbi->s_fc_lock); 432 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 433 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 434 list_add_tail(&node->fcd_list, 435 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 436 else 437 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 438 spin_unlock(&sbi->s_fc_lock); 439 mutex_lock(&ei->i_fc_lock); 440 441 return 0; 442 } 443 444 void __ext4_fc_track_unlink(handle_t *handle, 445 struct inode *inode, struct dentry *dentry) 446 { 447 struct __track_dentry_update_args args; 448 int ret; 449 450 args.dentry = dentry; 451 args.op = EXT4_FC_TAG_UNLINK; 452 453 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 454 (void *)&args, 0); 455 trace_ext4_fc_track_unlink(inode, dentry, ret); 456 } 457 
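/*
 * The ext4_fc_track_* wrappers below are the hooks that the directory
 * operation paths (unlink, link, create, rename) are expected to call once
 * the namespace change has been made. For example (a sketch, not an exact
 * call site):
 *
 *	ext4_fc_track_unlink(handle, dentry);
 */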
458 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 459 { 460 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); 461 } 462 463 void __ext4_fc_track_link(handle_t *handle, 464 struct inode *inode, struct dentry *dentry) 465 { 466 struct __track_dentry_update_args args; 467 int ret; 468 469 args.dentry = dentry; 470 args.op = EXT4_FC_TAG_LINK; 471 472 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 473 (void *)&args, 0); 474 trace_ext4_fc_track_link(inode, dentry, ret); 475 } 476 477 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 478 { 479 __ext4_fc_track_link(handle, d_inode(dentry), dentry); 480 } 481 482 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 483 struct dentry *dentry) 484 { 485 struct __track_dentry_update_args args; 486 int ret; 487 488 args.dentry = dentry; 489 args.op = EXT4_FC_TAG_CREAT; 490 491 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 492 (void *)&args, 0); 493 trace_ext4_fc_track_create(inode, dentry, ret); 494 } 495 496 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 497 { 498 __ext4_fc_track_create(handle, d_inode(dentry), dentry); 499 } 500 501 /* __track_fn for inode tracking */ 502 static int __track_inode(struct inode *inode, void *arg, bool update) 503 { 504 if (update) 505 return -EEXIST; 506 507 EXT4_I(inode)->i_fc_lblk_len = 0; 508 509 return 0; 510 } 511 512 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 513 { 514 int ret; 515 516 if (S_ISDIR(inode->i_mode)) 517 return; 518 519 if (ext4_should_journal_data(inode)) { 520 ext4_fc_mark_ineligible(inode->i_sb, 521 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 522 return; 523 } 524 525 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 526 trace_ext4_fc_track_inode(inode, ret); 527 } 528 529 struct __track_range_args { 530 ext4_lblk_t start, end; 531 }; 532 533 /* __track_fn for tracking data updates */ 534 static int __track_range(struct inode *inode, void *arg, bool update) 535 { 536 struct ext4_inode_info *ei = EXT4_I(inode); 537 ext4_lblk_t oldstart; 538 struct __track_range_args *__arg = 539 (struct __track_range_args *)arg; 540 541 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 542 ext4_debug("Special inode %ld being modified\n", inode->i_ino); 543 return -ECANCELED; 544 } 545 546 oldstart = ei->i_fc_lblk_start; 547 548 if (update && ei->i_fc_lblk_len > 0) { 549 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 550 ei->i_fc_lblk_len = 551 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 552 ei->i_fc_lblk_start + 1; 553 } else { 554 ei->i_fc_lblk_start = __arg->start; 555 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 556 } 557 558 return 0; 559 } 560 561 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 562 ext4_lblk_t end) 563 { 564 struct __track_range_args args; 565 int ret; 566 567 if (S_ISDIR(inode->i_mode)) 568 return; 569 570 args.start = start; 571 args.end = end; 572 573 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 574 575 trace_ext4_fc_track_range(inode, start, end, ret); 576 } 577 578 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 579 { 580 int write_flags = REQ_SYNC; 581 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 582 583 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 584 if (test_opt(sb, BARRIER) && is_tail) 585 write_flags |= REQ_FUA | REQ_PREFLUSH; 586 lock_buffer(bh); 587 set_buffer_dirty(bh); 588 set_buffer_uptodate(bh); 589 
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
			     u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During commit, this function is used to manage fast commit block space.
 * We don't split a fast commit log onto different blocks. So this function
 * makes sure that if there's not enough space on the current block, the
 * remaining space in the current block is marked as unused by adding an
 * EXT4_FC_TAG_PAD tag. In that case, a new block is requested from jbd2 and
 * the CRC is updated to reflect the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
			    int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
696 */ 697 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 698 if (!dst) 699 return -ENOSPC; 700 701 off = sbi->s_fc_bytes % bsize; 702 703 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 704 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 705 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 706 707 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 708 dst += sizeof(tl); 709 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 710 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 711 dst += sizeof(tail.fc_tid); 712 tail.fc_crc = cpu_to_le32(crc); 713 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 714 715 ext4_fc_submit_bh(sb, true); 716 717 return 0; 718 } 719 720 /* 721 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 722 * Returns false if there's not enough space. 723 */ 724 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 725 u32 *crc) 726 { 727 struct ext4_fc_tl tl; 728 u8 *dst; 729 730 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 731 if (!dst) 732 return false; 733 734 tl.fc_tag = cpu_to_le16(tag); 735 tl.fc_len = cpu_to_le16(len); 736 737 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 738 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 739 740 return true; 741 } 742 743 /* Same as above, but adds dentry tlv. */ 744 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 745 struct ext4_fc_dentry_update *fc_dentry) 746 { 747 struct ext4_fc_dentry_info fcd; 748 struct ext4_fc_tl tl; 749 int dlen = fc_dentry->fcd_name.len; 750 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, 751 crc); 752 753 if (!dst) 754 return false; 755 756 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 757 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 758 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 759 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 760 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 761 dst += sizeof(tl); 762 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); 763 dst += sizeof(fcd); 764 ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); 765 766 return true; 767 } 768 769 /* 770 * Writes inode in the fast commit space under TLV with tag @tag. 771 * Returns 0 on success, error on failure. 
772 */ 773 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 774 { 775 struct ext4_inode_info *ei = EXT4_I(inode); 776 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 777 int ret; 778 struct ext4_iloc iloc; 779 struct ext4_fc_inode fc_inode; 780 struct ext4_fc_tl tl; 781 u8 *dst; 782 783 ret = ext4_get_inode_loc(inode, &iloc); 784 if (ret) 785 return ret; 786 787 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 788 inode_len = EXT4_INODE_SIZE(inode->i_sb); 789 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 790 inode_len += ei->i_extra_isize; 791 792 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 793 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 794 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 795 796 dst = ext4_fc_reserve_space(inode->i_sb, 797 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); 798 if (!dst) 799 return -ECANCELED; 800 801 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) 802 return -ECANCELED; 803 dst += sizeof(tl); 804 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) 805 return -ECANCELED; 806 dst += sizeof(fc_inode); 807 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), 808 inode_len, crc)) 809 return -ECANCELED; 810 811 return 0; 812 } 813 814 /* 815 * Writes updated data ranges for the inode in question. Updates CRC. 816 * Returns 0 on success, error otherwise. 817 */ 818 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 819 { 820 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 821 struct ext4_inode_info *ei = EXT4_I(inode); 822 struct ext4_map_blocks map; 823 struct ext4_fc_add_range fc_ext; 824 struct ext4_fc_del_range lrange; 825 struct ext4_extent *ex; 826 int ret; 827 828 mutex_lock(&ei->i_fc_lock); 829 if (ei->i_fc_lblk_len == 0) { 830 mutex_unlock(&ei->i_fc_lock); 831 return 0; 832 } 833 old_blk_size = ei->i_fc_lblk_start; 834 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 835 ei->i_fc_lblk_len = 0; 836 mutex_unlock(&ei->i_fc_lock); 837 838 cur_lblk_off = old_blk_size; 839 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 840 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 841 842 while (cur_lblk_off <= new_blk_size) { 843 map.m_lblk = cur_lblk_off; 844 map.m_len = new_blk_size - cur_lblk_off + 1; 845 ret = ext4_map_blocks(NULL, inode, &map, 0); 846 if (ret < 0) 847 return -ECANCELED; 848 849 if (map.m_len == 0) { 850 cur_lblk_off++; 851 continue; 852 } 853 854 if (ret == 0) { 855 lrange.fc_ino = cpu_to_le32(inode->i_ino); 856 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 857 lrange.fc_len = cpu_to_le32(map.m_len); 858 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 859 sizeof(lrange), (u8 *)&lrange, crc)) 860 return -ENOSPC; 861 } else { 862 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 
863 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 864 865 /* Limit the number of blocks in one extent */ 866 map.m_len = min(max, map.m_len); 867 868 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 869 ex = (struct ext4_extent *)&fc_ext.fc_ex; 870 ex->ee_block = cpu_to_le32(map.m_lblk); 871 ex->ee_len = cpu_to_le16(map.m_len); 872 ext4_ext_store_pblock(ex, map.m_pblk); 873 if (map.m_flags & EXT4_MAP_UNWRITTEN) 874 ext4_ext_mark_unwritten(ex); 875 else 876 ext4_ext_mark_initialized(ex); 877 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 878 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 879 return -ENOSPC; 880 } 881 882 cur_lblk_off += map.m_len; 883 } 884 885 return 0; 886 } 887 888 889 /* Submit data for all the fast commit inodes */ 890 static int ext4_fc_submit_inode_data_all(journal_t *journal) 891 { 892 struct super_block *sb = (struct super_block *)(journal->j_private); 893 struct ext4_sb_info *sbi = EXT4_SB(sb); 894 struct ext4_inode_info *ei; 895 int ret = 0; 896 897 spin_lock(&sbi->s_fc_lock); 898 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 899 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 900 while (atomic_read(&ei->i_fc_updates)) { 901 DEFINE_WAIT(wait); 902 903 prepare_to_wait(&ei->i_fc_wait, &wait, 904 TASK_UNINTERRUPTIBLE); 905 if (atomic_read(&ei->i_fc_updates)) { 906 spin_unlock(&sbi->s_fc_lock); 907 schedule(); 908 spin_lock(&sbi->s_fc_lock); 909 } 910 finish_wait(&ei->i_fc_wait, &wait); 911 } 912 spin_unlock(&sbi->s_fc_lock); 913 ret = jbd2_submit_inode_data(ei->jinode); 914 if (ret) 915 return ret; 916 spin_lock(&sbi->s_fc_lock); 917 } 918 spin_unlock(&sbi->s_fc_lock); 919 920 return ret; 921 } 922 923 /* Wait for completion of data for all the fast commit inodes */ 924 static int ext4_fc_wait_inode_data_all(journal_t *journal) 925 { 926 struct super_block *sb = (struct super_block *)(journal->j_private); 927 struct ext4_sb_info *sbi = EXT4_SB(sb); 928 struct ext4_inode_info *pos, *n; 929 int ret = 0; 930 931 spin_lock(&sbi->s_fc_lock); 932 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 933 if (!ext4_test_inode_state(&pos->vfs_inode, 934 EXT4_STATE_FC_COMMITTING)) 935 continue; 936 spin_unlock(&sbi->s_fc_lock); 937 938 ret = jbd2_wait_inode_data(journal, pos->jinode); 939 if (ret) 940 return ret; 941 spin_lock(&sbi->s_fc_lock); 942 } 943 spin_unlock(&sbi->s_fc_lock); 944 945 return 0; 946 } 947 948 /* Commit all the directory entry updates */ 949 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 950 __acquires(&sbi->s_fc_lock) 951 __releases(&sbi->s_fc_lock) 952 { 953 struct super_block *sb = (struct super_block *)(journal->j_private); 954 struct ext4_sb_info *sbi = EXT4_SB(sb); 955 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 956 struct inode *inode; 957 struct ext4_inode_info *ei, *ei_n; 958 int ret; 959 960 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 961 return 0; 962 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 963 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 964 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 965 spin_unlock(&sbi->s_fc_lock); 966 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 967 ret = -ENOSPC; 968 goto lock_and_exit; 969 } 970 spin_lock(&sbi->s_fc_lock); 971 continue; 972 } 973 974 inode = NULL; 975 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], 976 i_fc_list) { 977 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { 978 inode = &ei->vfs_inode; 979 break; 980 } 981 } 982 /* 983 * If we don't find inode in our list, then it was deleted, 984 * in 
which case, we don't need to record it's create tag. 985 */ 986 if (!inode) 987 continue; 988 spin_unlock(&sbi->s_fc_lock); 989 990 /* 991 * We first write the inode and then the create dirent. This 992 * allows the recovery code to create an unnamed inode first 993 * and then link it to a directory entry. This allows us 994 * to use namei.c routines almost as is and simplifies 995 * the recovery code. 996 */ 997 ret = ext4_fc_write_inode(inode, crc); 998 if (ret) 999 goto lock_and_exit; 1000 1001 ret = ext4_fc_write_inode_data(inode, crc); 1002 if (ret) 1003 goto lock_and_exit; 1004 1005 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { 1006 ret = -ENOSPC; 1007 goto lock_and_exit; 1008 } 1009 1010 spin_lock(&sbi->s_fc_lock); 1011 } 1012 return 0; 1013 lock_and_exit: 1014 spin_lock(&sbi->s_fc_lock); 1015 return ret; 1016 } 1017 1018 static int ext4_fc_perform_commit(journal_t *journal) 1019 { 1020 struct super_block *sb = (struct super_block *)(journal->j_private); 1021 struct ext4_sb_info *sbi = EXT4_SB(sb); 1022 struct ext4_inode_info *iter; 1023 struct ext4_fc_head head; 1024 struct inode *inode; 1025 struct blk_plug plug; 1026 int ret = 0; 1027 u32 crc = 0; 1028 1029 ret = ext4_fc_submit_inode_data_all(journal); 1030 if (ret) 1031 return ret; 1032 1033 ret = ext4_fc_wait_inode_data_all(journal); 1034 if (ret) 1035 return ret; 1036 1037 /* 1038 * If file system device is different from journal device, issue a cache 1039 * flush before we start writing fast commit blocks. 1040 */ 1041 if (journal->j_fs_dev != journal->j_dev) 1042 blkdev_issue_flush(journal->j_fs_dev); 1043 1044 blk_start_plug(&plug); 1045 if (sbi->s_fc_bytes == 0) { 1046 /* 1047 * Add a head tag only if this is the first fast commit 1048 * in this TID. 1049 */ 1050 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1051 head.fc_tid = cpu_to_le32( 1052 sbi->s_journal->j_running_transaction->t_tid); 1053 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1054 (u8 *)&head, &crc)) { 1055 ret = -ENOSPC; 1056 goto out; 1057 } 1058 } 1059 1060 spin_lock(&sbi->s_fc_lock); 1061 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1062 if (ret) { 1063 spin_unlock(&sbi->s_fc_lock); 1064 goto out; 1065 } 1066 1067 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1068 inode = &iter->vfs_inode; 1069 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1070 continue; 1071 1072 spin_unlock(&sbi->s_fc_lock); 1073 ret = ext4_fc_write_inode_data(inode, &crc); 1074 if (ret) 1075 goto out; 1076 ret = ext4_fc_write_inode(inode, &crc); 1077 if (ret) 1078 goto out; 1079 spin_lock(&sbi->s_fc_lock); 1080 } 1081 spin_unlock(&sbi->s_fc_lock); 1082 1083 ret = ext4_fc_write_tail(sb, crc); 1084 1085 out: 1086 blk_finish_plug(&plug); 1087 return ret; 1088 } 1089 1090 static void ext4_fc_update_stats(struct super_block *sb, int status, 1091 u64 commit_time, int nblks) 1092 { 1093 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1094 1095 jbd_debug(1, "Fast commit ended with status = %d", status); 1096 if (status == EXT4_FC_STATUS_OK) { 1097 stats->fc_num_commits++; 1098 stats->fc_numblks += nblks; 1099 if (likely(stats->s_fc_avg_commit_time)) 1100 stats->s_fc_avg_commit_time = 1101 (commit_time + 1102 stats->s_fc_avg_commit_time * 3) / 4; 1103 else 1104 stats->s_fc_avg_commit_time = commit_time; 1105 } else if (status == EXT4_FC_STATUS_FAILED || 1106 status == EXT4_FC_STATUS_INELIGIBLE) { 1107 if (status == EXT4_FC_STATUS_FAILED) 1108 stats->fc_failed_commits++; 1109 stats->fc_ineligible_commits++; 1110 } 
else { 1111 stats->fc_skipped_commits++; 1112 } 1113 trace_ext4_fc_commit_stop(sb, nblks, status); 1114 } 1115 1116 /* 1117 * The main commit entry point. Performs a fast commit for transaction 1118 * commit_tid if needed. If it's not possible to perform a fast commit 1119 * due to various reasons, we fall back to full commit. Returns 0 1120 * on success, error otherwise. 1121 */ 1122 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1123 { 1124 struct super_block *sb = (struct super_block *)(journal->j_private); 1125 struct ext4_sb_info *sbi = EXT4_SB(sb); 1126 int nblks = 0, ret, bsize = journal->j_blocksize; 1127 int subtid = atomic_read(&sbi->s_fc_subtid); 1128 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1129 ktime_t start_time, commit_time; 1130 1131 trace_ext4_fc_commit_start(sb); 1132 1133 start_time = ktime_get(); 1134 1135 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1136 return jbd2_complete_transaction(journal, commit_tid); 1137 1138 restart_fc: 1139 ret = jbd2_fc_begin_commit(journal, commit_tid); 1140 if (ret == -EALREADY) { 1141 /* There was an ongoing commit, check if we need to restart */ 1142 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1143 commit_tid > journal->j_commit_sequence) 1144 goto restart_fc; 1145 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); 1146 return 0; 1147 } else if (ret) { 1148 /* 1149 * Commit couldn't start. Just update stats and perform a 1150 * full commit. 1151 */ 1152 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); 1153 return jbd2_complete_transaction(journal, commit_tid); 1154 } 1155 1156 /* 1157 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1158 * if we are fast commit ineligible. 1159 */ 1160 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1161 status = EXT4_FC_STATUS_INELIGIBLE; 1162 goto fallback; 1163 } 1164 1165 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1166 ret = ext4_fc_perform_commit(journal); 1167 if (ret < 0) { 1168 status = EXT4_FC_STATUS_FAILED; 1169 goto fallback; 1170 } 1171 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1172 ret = jbd2_fc_wait_bufs(journal, nblks); 1173 if (ret < 0) { 1174 status = EXT4_FC_STATUS_FAILED; 1175 goto fallback; 1176 } 1177 atomic_inc(&sbi->s_fc_subtid); 1178 ret = jbd2_fc_end_commit(journal); 1179 /* 1180 * weight the commit time higher than the average time so we 1181 * don't react too strongly to vast changes in the commit time 1182 */ 1183 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1184 ext4_fc_update_stats(sb, status, commit_time, nblks); 1185 return ret; 1186 1187 fallback: 1188 ret = jbd2_fc_end_commit_fallback(journal); 1189 ext4_fc_update_stats(sb, status, 0, 0); 1190 return ret; 1191 } 1192 1193 /* 1194 * Fast commit cleanup routine. This is called after every fast commit and 1195 * full commit. full is true if we are called after a full commit. 
1196 */ 1197 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1198 { 1199 struct super_block *sb = journal->j_private; 1200 struct ext4_sb_info *sbi = EXT4_SB(sb); 1201 struct ext4_inode_info *iter, *iter_n; 1202 struct ext4_fc_dentry_update *fc_dentry; 1203 1204 if (full && sbi->s_fc_bh) 1205 sbi->s_fc_bh = NULL; 1206 1207 jbd2_fc_release_bufs(journal); 1208 1209 spin_lock(&sbi->s_fc_lock); 1210 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1211 i_fc_list) { 1212 list_del_init(&iter->i_fc_list); 1213 ext4_clear_inode_state(&iter->vfs_inode, 1214 EXT4_STATE_FC_COMMITTING); 1215 if (iter->i_sync_tid <= tid) 1216 ext4_fc_reset_inode(&iter->vfs_inode); 1217 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1218 smp_mb(); 1219 #if (BITS_PER_LONG < 64) 1220 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1221 #else 1222 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1223 #endif 1224 } 1225 1226 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1227 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1228 struct ext4_fc_dentry_update, 1229 fcd_list); 1230 list_del_init(&fc_dentry->fcd_list); 1231 spin_unlock(&sbi->s_fc_lock); 1232 1233 if (fc_dentry->fcd_name.name && 1234 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1235 kfree(fc_dentry->fcd_name.name); 1236 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1237 spin_lock(&sbi->s_fc_lock); 1238 } 1239 1240 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1241 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1242 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1243 &sbi->s_fc_q[FC_Q_MAIN]); 1244 1245 if (tid >= sbi->s_fc_ineligible_tid) { 1246 sbi->s_fc_ineligible_tid = 0; 1247 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1248 } 1249 1250 if (full) 1251 sbi->s_fc_bytes = 0; 1252 spin_unlock(&sbi->s_fc_lock); 1253 trace_ext4_fc_stats(sb); 1254 } 1255 1256 /* Ext4 Replay Path Routines */ 1257 1258 /* Helper struct for dentry replay routines */ 1259 struct dentry_info_args { 1260 int parent_ino, dname_len, ino, inode_len; 1261 char *dname; 1262 }; 1263 1264 static inline void tl_to_darg(struct dentry_info_args *darg, 1265 struct ext4_fc_tl *tl, u8 *val) 1266 { 1267 struct ext4_fc_dentry_info fcd; 1268 1269 memcpy(&fcd, val, sizeof(fcd)); 1270 1271 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1272 darg->ino = le32_to_cpu(fcd.fc_ino); 1273 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1274 darg->dname_len = le16_to_cpu(tl->fc_len) - 1275 sizeof(struct ext4_fc_dentry_info); 1276 } 1277 1278 /* Unlink replay function */ 1279 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, 1280 u8 *val) 1281 { 1282 struct inode *inode, *old_parent; 1283 struct qstr entry; 1284 struct dentry_info_args darg; 1285 int ret = 0; 1286 1287 tl_to_darg(&darg, tl, val); 1288 1289 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1290 darg.parent_ino, darg.dname_len); 1291 1292 entry.name = darg.dname; 1293 entry.len = darg.dname_len; 1294 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1295 1296 if (IS_ERR(inode)) { 1297 jbd_debug(1, "Inode %d not found", darg.ino); 1298 return 0; 1299 } 1300 1301 old_parent = ext4_iget(sb, darg.parent_ino, 1302 EXT4_IGET_NORMAL); 1303 if (IS_ERR(old_parent)) { 1304 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); 1305 iput(inode); 1306 return 0; 1307 } 1308 1309 ret = __ext4_unlink(NULL, old_parent, &entry, inode); 1310 /* -ENOENT ok coz it might not exist anymore. 
*/ 1311 if (ret == -ENOENT) 1312 ret = 0; 1313 iput(old_parent); 1314 iput(inode); 1315 return ret; 1316 } 1317 1318 static int ext4_fc_replay_link_internal(struct super_block *sb, 1319 struct dentry_info_args *darg, 1320 struct inode *inode) 1321 { 1322 struct inode *dir = NULL; 1323 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1324 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1325 int ret = 0; 1326 1327 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1328 if (IS_ERR(dir)) { 1329 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); 1330 dir = NULL; 1331 goto out; 1332 } 1333 1334 dentry_dir = d_obtain_alias(dir); 1335 if (IS_ERR(dentry_dir)) { 1336 jbd_debug(1, "Failed to obtain dentry"); 1337 dentry_dir = NULL; 1338 goto out; 1339 } 1340 1341 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1342 if (!dentry_inode) { 1343 jbd_debug(1, "Inode dentry not created."); 1344 ret = -ENOMEM; 1345 goto out; 1346 } 1347 1348 ret = __ext4_link(dir, inode, dentry_inode); 1349 /* 1350 * It's possible that link already existed since data blocks 1351 * for the dir in question got persisted before we crashed OR 1352 * we replayed this tag and crashed before the entire replay 1353 * could complete. 1354 */ 1355 if (ret && ret != -EEXIST) { 1356 jbd_debug(1, "Failed to link\n"); 1357 goto out; 1358 } 1359 1360 ret = 0; 1361 out: 1362 if (dentry_dir) { 1363 d_drop(dentry_dir); 1364 dput(dentry_dir); 1365 } else if (dir) { 1366 iput(dir); 1367 } 1368 if (dentry_inode) { 1369 d_drop(dentry_inode); 1370 dput(dentry_inode); 1371 } 1372 1373 return ret; 1374 } 1375 1376 /* Link replay function */ 1377 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, 1378 u8 *val) 1379 { 1380 struct inode *inode; 1381 struct dentry_info_args darg; 1382 int ret = 0; 1383 1384 tl_to_darg(&darg, tl, val); 1385 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1386 darg.parent_ino, darg.dname_len); 1387 1388 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1389 if (IS_ERR(inode)) { 1390 jbd_debug(1, "Inode not found."); 1391 return 0; 1392 } 1393 1394 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1395 iput(inode); 1396 return ret; 1397 } 1398 1399 /* 1400 * Record all the modified inodes during replay. We use this later to setup 1401 * block bitmaps correctly. 
1402 */ 1403 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1404 { 1405 struct ext4_fc_replay_state *state; 1406 int i; 1407 1408 state = &EXT4_SB(sb)->s_fc_replay_state; 1409 for (i = 0; i < state->fc_modified_inodes_used; i++) 1410 if (state->fc_modified_inodes[i] == ino) 1411 return 0; 1412 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1413 state->fc_modified_inodes = krealloc( 1414 state->fc_modified_inodes, 1415 sizeof(int) * (state->fc_modified_inodes_size + 1416 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1417 GFP_KERNEL); 1418 if (!state->fc_modified_inodes) 1419 return -ENOMEM; 1420 state->fc_modified_inodes_size += 1421 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1422 } 1423 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1424 return 0; 1425 } 1426 1427 /* 1428 * Inode replay function 1429 */ 1430 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, 1431 u8 *val) 1432 { 1433 struct ext4_fc_inode fc_inode; 1434 struct ext4_inode *raw_inode; 1435 struct ext4_inode *raw_fc_inode; 1436 struct inode *inode = NULL; 1437 struct ext4_iloc iloc; 1438 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1439 struct ext4_extent_header *eh; 1440 1441 memcpy(&fc_inode, val, sizeof(fc_inode)); 1442 1443 ino = le32_to_cpu(fc_inode.fc_ino); 1444 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1445 1446 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1447 if (!IS_ERR(inode)) { 1448 ext4_ext_clear_bb(inode); 1449 iput(inode); 1450 } 1451 inode = NULL; 1452 1453 ret = ext4_fc_record_modified_inode(sb, ino); 1454 if (ret) 1455 goto out; 1456 1457 raw_fc_inode = (struct ext4_inode *) 1458 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1459 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1460 if (ret) 1461 goto out; 1462 1463 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); 1464 raw_inode = ext4_raw_inode(&iloc); 1465 1466 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1467 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, 1468 inode_len - offsetof(struct ext4_inode, i_generation)); 1469 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1470 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1471 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1472 memset(eh, 0, sizeof(*eh)); 1473 eh->eh_magic = EXT4_EXT_MAGIC; 1474 eh->eh_max = cpu_to_le16( 1475 (sizeof(raw_inode->i_block) - 1476 sizeof(struct ext4_extent_header)) 1477 / sizeof(struct ext4_extent)); 1478 } 1479 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1480 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1481 sizeof(raw_inode->i_block)); 1482 } 1483 1484 /* Immediately update the inode on disk. */ 1485 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1486 if (ret) 1487 goto out; 1488 ret = sync_dirty_buffer(iloc.bh); 1489 if (ret) 1490 goto out; 1491 ret = ext4_mark_inode_used(sb, ino); 1492 if (ret) 1493 goto out; 1494 1495 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1496 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1497 if (IS_ERR(inode)) { 1498 jbd_debug(1, "Inode not found."); 1499 return -EFSCORRUPTED; 1500 } 1501 1502 /* 1503 * Our allocator could have made different decisions than before 1504 * crashing. This should be fixed but until then, we calculate 1505 * the number of blocks the inode. 
 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we get here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use, as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
1601 */ 1602 if (replay && state->fc_regions_used != state->fc_regions_valid) 1603 state->fc_regions_used = state->fc_regions_valid; 1604 if (state->fc_regions_used == state->fc_regions_size) { 1605 state->fc_regions_size += 1606 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1607 state->fc_regions = krealloc( 1608 state->fc_regions, 1609 state->fc_regions_size * 1610 sizeof(struct ext4_fc_alloc_region), 1611 GFP_KERNEL); 1612 if (!state->fc_regions) 1613 return -ENOMEM; 1614 } 1615 region = &state->fc_regions[state->fc_regions_used++]; 1616 region->ino = ino; 1617 region->lblk = lblk; 1618 region->pblk = pblk; 1619 region->len = len; 1620 1621 if (replay) 1622 state->fc_regions_valid++; 1623 1624 return 0; 1625 } 1626 1627 /* Replay add range tag */ 1628 static int ext4_fc_replay_add_range(struct super_block *sb, 1629 struct ext4_fc_tl *tl, u8 *val) 1630 { 1631 struct ext4_fc_add_range fc_add_ex; 1632 struct ext4_extent newex, *ex; 1633 struct inode *inode; 1634 ext4_lblk_t start, cur; 1635 int remaining, len; 1636 ext4_fsblk_t start_pblk; 1637 struct ext4_map_blocks map; 1638 struct ext4_ext_path *path = NULL; 1639 int ret; 1640 1641 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1642 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1643 1644 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1645 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1646 ext4_ext_get_actual_len(ex)); 1647 1648 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1649 if (IS_ERR(inode)) { 1650 jbd_debug(1, "Inode not found."); 1651 return 0; 1652 } 1653 1654 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1655 if (ret) 1656 goto out; 1657 1658 start = le32_to_cpu(ex->ee_block); 1659 start_pblk = ext4_ext_pblock(ex); 1660 len = ext4_ext_get_actual_len(ex); 1661 1662 cur = start; 1663 remaining = len; 1664 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1665 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1666 inode->i_ino); 1667 1668 while (remaining > 0) { 1669 map.m_lblk = cur; 1670 map.m_len = remaining; 1671 map.m_pblk = 0; 1672 ret = ext4_map_blocks(NULL, inode, &map, 0); 1673 1674 if (ret < 0) 1675 goto out; 1676 1677 if (ret == 0) { 1678 /* Range is not mapped */ 1679 path = ext4_find_extent(inode, cur, NULL, 0); 1680 if (IS_ERR(path)) 1681 goto out; 1682 memset(&newex, 0, sizeof(newex)); 1683 newex.ee_block = cpu_to_le32(cur); 1684 ext4_ext_store_pblock( 1685 &newex, start_pblk + cur - start); 1686 newex.ee_len = cpu_to_le16(map.m_len); 1687 if (ext4_ext_is_unwritten(ex)) 1688 ext4_ext_mark_unwritten(&newex); 1689 down_write(&EXT4_I(inode)->i_data_sem); 1690 ret = ext4_ext_insert_extent( 1691 NULL, inode, &path, &newex, 0); 1692 up_write((&EXT4_I(inode)->i_data_sem)); 1693 ext4_ext_drop_refs(path); 1694 kfree(path); 1695 if (ret) 1696 goto out; 1697 goto next; 1698 } 1699 1700 if (start_pblk + cur - start != map.m_pblk) { 1701 /* 1702 * Logical to physical mapping changed. This can happen 1703 * if this range was removed and then reallocated to 1704 * map to new physical blocks during a fast commit. 1705 */ 1706 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1707 ext4_ext_is_unwritten(ex), 1708 start_pblk + cur - start); 1709 if (ret) 1710 goto out; 1711 /* 1712 * Mark the old blocks as free since they aren't used 1713 * anymore. We maintain an array of all the modified 1714 * inodes. 
In case these blocks are still used at either 1715 * a different logical range in the same inode or in 1716 * some different inode, we will mark them as allocated 1717 * at the end of the FC replay using our array of 1718 * modified inodes. 1719 */ 1720 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1721 goto next; 1722 } 1723 1724 /* Range is mapped and needs a state change */ 1725 jbd_debug(1, "Converting from %ld to %d %lld", 1726 map.m_flags & EXT4_MAP_UNWRITTEN, 1727 ext4_ext_is_unwritten(ex), map.m_pblk); 1728 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1729 ext4_ext_is_unwritten(ex), map.m_pblk); 1730 if (ret) 1731 goto out; 1732 /* 1733 * We may have split the extent tree while toggling the state. 1734 * Try to shrink the extent tree now. 1735 */ 1736 ext4_ext_replay_shrink_inode(inode, start + len); 1737 next: 1738 cur += map.m_len; 1739 remaining -= map.m_len; 1740 } 1741 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1742 sb->s_blocksize_bits); 1743 out: 1744 iput(inode); 1745 return 0; 1746 } 1747 1748 /* Replay DEL_RANGE tag */ 1749 static int 1750 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, 1751 u8 *val) 1752 { 1753 struct inode *inode; 1754 struct ext4_fc_del_range lrange; 1755 struct ext4_map_blocks map; 1756 ext4_lblk_t cur, remaining; 1757 int ret; 1758 1759 memcpy(&lrange, val, sizeof(lrange)); 1760 cur = le32_to_cpu(lrange.fc_lblk); 1761 remaining = le32_to_cpu(lrange.fc_len); 1762 1763 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1764 le32_to_cpu(lrange.fc_ino), cur, remaining); 1765 1766 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1767 if (IS_ERR(inode)) { 1768 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1769 return 0; 1770 } 1771 1772 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1773 if (ret) 1774 goto out; 1775 1776 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1777 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1778 le32_to_cpu(lrange.fc_len)); 1779 while (remaining > 0) { 1780 map.m_lblk = cur; 1781 map.m_len = remaining; 1782 1783 ret = ext4_map_blocks(NULL, inode, &map, 0); 1784 if (ret < 0) 1785 goto out; 1786 if (ret > 0) { 1787 remaining -= ret; 1788 cur += ret; 1789 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1790 } else { 1791 remaining -= map.m_len; 1792 cur += map.m_len; 1793 } 1794 } 1795 1796 down_write(&EXT4_I(inode)->i_data_sem); 1797 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1798 le32_to_cpu(lrange.fc_lblk) + 1799 le32_to_cpu(lrange.fc_len) - 1); 1800 up_write(&EXT4_I(inode)->i_data_sem); 1801 if (ret) 1802 goto out; 1803 ext4_ext_replay_shrink_inode(inode, 1804 i_size_read(inode) >> sb->s_blocksize_bits); 1805 ext4_mark_inode_dirty(NULL, inode); 1806 out: 1807 iput(inode); 1808 return 0; 1809 } 1810 1811 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1812 { 1813 struct ext4_fc_replay_state *state; 1814 struct inode *inode; 1815 struct ext4_ext_path *path = NULL; 1816 struct ext4_map_blocks map; 1817 int i, ret, j; 1818 ext4_lblk_t cur, end; 1819 1820 state = &EXT4_SB(sb)->s_fc_replay_state; 1821 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1822 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1823 EXT4_IGET_NORMAL); 1824 if (IS_ERR(inode)) { 1825 jbd_debug(1, "Inode %d not found.", 1826 state->fc_modified_inodes[i]); 1827 continue; 1828 } 1829 cur = 0; 1830 end = EXT_MAX_BLOCKS; 1831 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { 1832 iput(inode); 
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if block is in excluded regions for block allocation. The simple
 * allocator that runs during the replay phase calls this function to see
 * if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error code if there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
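 *
 * Note that the running CRC maintained here covers every tag seen since the
 * previous tail tag; a tail's own fc_crc field is excluded from its checksum
 * so that the CRC recorded on disk can be compared against the computed one.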
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler above.
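 *
 * Each TLV in the block is dispatched to the per-tag replay handler below;
 * once the fc_replay_num_tags counter computed during the scan phase drops to
 * zero, replay stops and the block bitmaps and counters are rebuilt.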
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
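	 * The cleanup callback, on the other hand, is needed only when fast
	 * commits are actually in use, so it is set only if the
	 * JOURNAL_FAST_COMMIT mount option is enabled.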
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}