// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity, please read the following
 *     section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make
 *   all the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), plus one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important
 *   that one more fast commit falls back to a full commit after the stop
 *   call, so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
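 *
 *   A code path performing an operation that has no fast commit replay
 *   support would, for example, look roughly like this (illustrative sketch
 *   only; the reason code is whichever EXT4_FC_REASON_* value applies):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	... perform the update that fast commit cannot replay ...
 *	ext4_fc_stop_ineligible(sb);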
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *	rm A;    mv B A;    read A
 *	(x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	(w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step.
 * At this point, the refcount for inode 11 is not reliable, but that gets
 * fixed by the replay of the last inode 11 tag. Crashes at points (w), (x)
 * and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, a
 *    subsequent recovery attempt will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay,
 *    so that after the crash, fast commit recovery code can look at that
 *    flag and perform fast commit recovery even if that area is invalidated
 *    by later full commits.
 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher-level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}
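/*
 * Note: EXT4_STATE_FC_COMMITTING is an inode state flag. On 32-bit hosts
 * ext4 keeps inode state flags in a separate ei->i_state_flags word, while
 * on 64-bit hosts they live in the upper bits of ei->i_flags (see the
 * ext4_{set,clear,test}_inode_state() helpers in ext4.h). That is why the
 * wait/wake code below and in ext4_fc_cleanup() picks the wait bit address
 * based on BITS_PER_LONG.
 */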
/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on
 * the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
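/*
 * For example (illustrative only), an update for which replay support is
 * missing, such as an extended attribute change, would do:
 *
 *	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR);
 *
 * after which the next commit falls back to a full jbd2 commit. The
 * EXT4_FC_REASON_* value only feeds the ineligibility statistics above; it
 * does not change the fallback behavior.
 */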
/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set the EXT4_MF_FC_INELIGIBLE
 * flag here to ensure that after stopping the ineligible update, at least
 * one full commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a
 * full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode on the fast commit
 * list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
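/*
 * For example (illustrative), the unlink tracking below effectively does:
 *
 *	struct __track_dentry_update_args args = {
 *		.dentry = dentry,
 *		.op = EXT4_FC_TAG_UNLINK,
 *	};
 *
 *	ext4_fc_track_template(handle, inode, __track_dentry_update,
 *			       &args, 0);
 *
 * i.e. __track_dentry_update() queues the dentry delta while the template
 * handles TID bookkeeping and (optionally) enqueueing the inode.
 */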
struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}
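/*
 * Track that the on-disk copy of the inode itself needs to be committed
 * during the next fast commit. Directories are skipped (their changes are
 * captured as dentry updates) and inodes using data journalling mark the
 * file system fast commit ineligible.
 */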
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode,
			 ext4_lblk_t start, ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * At commit time this function is used to manage fast commit block space.
 * We don't split a fast commit log entry across blocks. So this function
 * makes sure that if there's not enough space on the current block, the
 * remaining space in the current block is marked as unused by adding an
 * EXT4_FC_TAG_PAD tag. In that case, a new block is obtained from jbd2 and
 * the CRC is updated to reflect the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing a tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length and value, and updates the CRC. Returns true if the TLV
 * was added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}
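/*
 * For reference, every log entry written by the helpers above starts with
 * the fixed TLV header (a sketch of struct ext4_fc_tl, which is defined in
 * the fast commit header):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	(one of EXT4_FC_TAG_*)
 *		__le16 fc_len;	(length of the value that follows)
 *	};
 *
 * so an entry on disk is [fc_tag][fc_len][fc_len bytes of value], with the
 * running CRC updated over both the header and the value.
 */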
/* Same as ext4_fc_add_tlv(), but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
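/*
 * On-disk layouts produced by the two writers above (sketch; all fields
 * little endian):
 *
 *	dentry tlv: [fc_tag = fcd_op][fc_len][fc_parent_ino][fc_ino][name]
 *	inode tlv:  [fc_tag = EXT4_FC_TAG_INODE][fc_len][fc_ino][raw inode]
 *
 * where the raw inode is a verbatim copy of the on-disk struct ext4_inode,
 * including the extra isize portion when present.
 */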
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was
		 * deleted, in which case we don't need to record its
		 * create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the existing average higher than the new commit time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid = %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */
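/*
 * Each replay handler below takes the same arguments: the super block, the
 * struct ext4_fc_tl of the tag being replayed and a pointer to the tag's
 * value bytes (i.e. just past the TLV header). The handlers are invoked
 * once per tag during the replay pass that follows a successful scan.
 */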
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed, either because data
	 * blocks for the dir in question were persisted before we crashed,
	 * OR because we replayed this tag and then crashed before the
	 * entire replay could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		/*
		 * Grow the array through a temporary so that the old array
		 * is not leaked if krealloc() fails.
		 */
		fc_modified_inodes = krealloc(state->fc_modified_inodes,
			sizeof(int) * (state->fc_modified_inodes_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
			GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				/ sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks in the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
/*
 * Record physical disk regions which are in use as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		/*
		 * As above, grow the array through a temporary so that the
		 * old array is not leaked if krealloc() fails.
		 */
		fc_regions = krealloc(state->fc_regions,
			(state->fc_regions_size +
			 EXT4_FC_REPLAY_REALLOC_INCREMENT) *
			sizeof(struct ext4_fc_alloc_region),
			GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions = fc_regions;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes.
			 * In case these blocks are still used at either a
			 * different logical range in the same inode or in
			 * some different inode, we will mark them as
			 * allocated at the end of the FC replay using our
			 * array of modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				  state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
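/*
 * Sketch of how a replay-time block allocator is expected to consult
 * ext4_fc_replay_check_excluded() above. This is a hypothetical caller
 * written only for illustration (the real caller lives in mballoc.c);
 * it is compiled out:
 */
#if 0
static ext4_fsblk_t pick_replay_block(struct super_block *sb,
				      ext4_fsblk_t first, ext4_fsblk_t last)
{
	ext4_fsblk_t blk;

	for (blk = first; blk <= last; blk++) {
		/* Skip blocks promised to a not-yet-replayed ADD_RANGE */
		if (ext4_fc_replay_check_excluded(sb, blk))
			continue;
		return blk;
	}
	return 0;	/* nothing usable in [first, last] */
}
#endif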
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * the following:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
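/*
 * Worked example of one scan pass over a block with hypothetical
 * contents (tids and checksum values are made up):
 *
 *	[HEAD tid=7] [CREAT ino=12] [TAIL tid=7 crc=C]
 *
 * HEAD and CREAT each bump fc_cur_tag and are folded into fc_crc in
 * full. For TAIL, only the bytes up to its fc_crc field are folded in;
 * if the accumulated checksum equals C and the tid matches expected_tid,
 * all three tags become replayable (fc_replay_num_tags = 3) and the
 * excluded regions recorded so far are committed via fc_regions_valid.
 * On a mismatch, the scan either stops at the last good tail or fails
 * with -EFSBADCRC if no valid tail was ever seen.
 */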
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as described above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
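/*
 * For reference, jbd2 drives the callback above roughly as sketched
 * below: every fast commit block is offered once per recovery pass, so
 * PASS_SCAN validates the whole area before PASS_REPLAY mutates any
 * state. This is a simplified, compiled-out fragment, not the actual
 * jbd2 recovery code; bh[], num_fc_blks and tid are placeholders.
 */
#if 0
	for (off = 0; off < num_fc_blks; off++)	/* scan pass */
		ext4_fc_replay(journal, bh[off], PASS_SCAN, off, tid);
	for (off = 0; off < num_fc_blks; off++)	/* replay pass */
		ext4_fc_replay(journal, bh[off], PASS_REPLAY, off, tid);
#endif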
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled
	 * because we could still have fast commit blocks that need to
	 * be replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}
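/*
 * Example of the output rendered by ext4_fc_info_show() above; the
 * numbers are made up and only the first ineligible reason line is
 * shown:
 *
 *	fc stats:
 *	12 commits
 *	2 ineligible
 *	40 numblks
 *	1520us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	...
 */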