// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() fall back to full commits.
 *   It is important to make one more fast commit fall back to a full commit
 *   after the stop call so that it is guaranteed that the fast commit
 *   ineligible operation contained within ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() is followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses an "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2      ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how
 * fast commits make it idempotent. Consider the following sequence of
 * operations:
 *
 *     rm A;    mv B A;   read A
 *      (x)      (y)       (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *        (w)              (x)                (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it
 * back and make it point to inode 11. We won't find B, so we'll just skip
 * that step. At this point, the refcount for inode 11 is not reliable, but
 * that gets fixed by the replay of the last inode 11 tag. Crashes at points
 * (w), (x) and (y) get handled similarly. Thus, by converting a
 * non-idempotent procedure into a series of idempotent outcomes, fast
 * commits ensure idempotence during the replay.
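 *
 * As an illustrative sketch (not kernel code, and simplified to ignore
 * block boundaries and CRC checking), replaying the TLV stream described
 * above boils down to the loop below; apply_delta() is a hypothetical
 * stand-in for the per-tag replay handlers found later in this file:
 *
 *	u8 *cur = block, *end = block + blocksize;
 *
 *	while (cur < end) {
 *		struct ext4_fc_tl *tl = (struct ext4_fc_tl *)cur;
 *
 *		apply_delta(le16_to_cpu(tl->fc_tag),	// hypothetical
 *			    cur + sizeof(*tl), le16_to_cpu(tl->fc_len));
 *		cur += sizeof(*tl) + le16_to_cpu(tl->fc_len);
 *	}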
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast
 *    commit area is invalid (because new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay,
 *    so that after the crash, fast commit recovery code can look at that
 *    flag and perform fast commit recovery even if that area is invalidated
 *    by later full commits.
 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at a much higher
 *    level than the actual updates. This can be made more fine grained by
 *    combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on
 * the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next
 * commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
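/*
 * Illustrative sketch (not a real call site) of how a high-level update
 * path is expected to bracket its work with the update markers above; the
 * journal handle setup shown is an assumption for the example:
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 *	... modify the inode, call an ext4_fc_track_* helper ...
 *	ext4_journal_stop(handle);
 *	ext4_fc_stop_update(inode);
 */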
/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a
 * full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
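/*
 * Sketch of the __fc_track_fn contract that ext4_fc_track_template()
 * expects (illustrative only; __track_dentry_update(), __track_inode() and
 * __track_range() below are the real implementations):
 *
 *	static int __track_foo(struct inode *inode, void *arg, bool update)
 *	{
 *		// Called with ei->i_fc_lock held. "update" is true when the
 *		// inode was already tracked in this transaction, so the
 *		// function should merge with the previously tracked state
 *		// rather than start from scratch.
 *		return 0;
 *	}
 */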
struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}
/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
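/*
 * Worked example for the merge logic in __track_range() above: if blocks
 * 10-19 are already tracked (i_fc_lblk_start = 10, i_fc_lblk_len = 10) and
 * a new update touches blocks 5-12, the merged range becomes
 * i_fc_lblk_start = min(10, 5) = 5 and
 * i_fc_lblk_len = max(10 + 10 - 1, 12) - 5 + 1 = 15, i.e. blocks 5-19.
 */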
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only for the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}
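/*
 * Worked example for the padding math in ext4_fc_reserve_space() above
 * (numbers are illustrative; assuming sizeof(struct ext4_fc_tl) == 4,
 * i.e. two __le16 fields): with bsize = 4096 and off = 4080, a request for
 * len = 40 fails the "bsize - off - 1 > len + sizeof(tl)" test (15 > 44 is
 * false), so the block is closed with an EXT4_FC_TAG_PAD TLV whose fc_len
 * is pad_len = 4096 - 4080 - 1 - 4 = 11, and the 40 bytes are served from
 * the start of a fresh jbd2 fast commit block.
 */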
/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
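/*
 * Schematic of the on-disk TLV produced by ext4_fc_write_inode() above:
 *
 *	+----------------------+----------------+------------------------+
 *	| struct ext4_fc_tl    | fc_ino (le32)  | raw struct ext4_inode  |
 *	| tag = INODE,         |                | (inode_len bytes)      |
 *	| len = 4 + inode_len  |                |                        |
 *	+----------------------+----------------+------------------------+
 */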
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
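/*
 * Worked example for the weighted average in ext4_fc_commit() above: with a
 * previous s_fc_avg_commit_time of 80us and a new commit_time of 40us, the
 * updated average is (40 + 80 * 3) / 4 = 70us, so one unusually fast or
 * slow commit only moves the average by a quarter of the difference.
 */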
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				/ sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks used by the inode during replay.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
/*
 * Record physical disk regions which are in use as per fast commit area. Our
 * simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk,
							NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if block is in excluded regions for block allocation. The simple
 * allocator that runs during the replay phase calls this function to see
 * if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}
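
/*
 * Illustrative sketch (not the actual allocator, which lives in mballoc.c)
 * of how a replay-time allocation loop is expected to consult
 * ext4_fc_replay_check_excluded(); first_blk and last_blk are hypothetical
 * cursor variables:
 *
 *	ext4_fsblk_t blk;
 *
 *	for (blk = first_blk; blk <= last_blk; blk++) {
 *		if (ext4_fc_replay_check_excluded(sb, blk))
 *			continue;
 *		return blk;
 *	}
 *
 * The excluded regions are the physical ranges recorded from ADD_RANGE
 * tags during the scan phase; handing such a block out before its tag is
 * replayed would let the allocator reuse storage the log is about to
 * re-attach to an inode.
 */
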
/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
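
/*
 * Worked example of a scan pass (the tag sequence is hypothetical): assume
 * the fast commit area contains
 *
 *	[HEAD tid=7] [CREAT] [LINK] [TAIL tid=7 crc=C1] [INODE] <garbage>
 *
 * and we are scanning with expected_tid == 7. The loop above folds each TLV
 * into fc_crc and counts HEAD, CREAT and LINK in fc_cur_tag. At the TAIL,
 * fc_tid matches expected_tid and the computed CRC matches C1, so
 * fc_replay_num_tags becomes 4 and the regions recorded so far become
 * valid. The trailing INODE tag bumps fc_cur_tag but is never confirmed by
 * another valid TAIL, so it does not change fc_replay_num_tags: replay will
 * apply exactly the 4 tags of the last complete, CRC-verified fast commit.
 */
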
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
			  tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					     ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
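
/*
 * Rough shape of how jbd2 drives the callback above during recovery (a
 * simplified sketch; bh[], num_fc_blocks and tid stand in for the real
 * bookkeeping in fs/jbd2/recovery.c). The scan pass validates the fast
 * commit area and counts tags:
 *
 *	for (off = 0; off < num_fc_blocks; off++)
 *		err = journal->j_fc_replay_callback(journal, bh[off],
 *						    PASS_SCAN, off, tid);
 *
 * The replay pass then applies the tags counted during the scan:
 *
 *	for (off = 0; off < num_fc_blocks; off++)
 *		err = journal->j_fc_replay_callback(journal, bh[off],
 *						    PASS_REPLAY, off, tid);
 *
 * Either loop stops early once the callback returns JBD2_FC_REPLAY_STOP or
 * a negative error; fc_replay_num_tags, set during the scan pass, bounds
 * how many tags the replay pass applies.
 */
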
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled
	 * because we could still have fast commit blocks that need to be
	 * replayed even though fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}
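
/*
 * Example of what ext4_fc_info_show() above emits (values invented for
 * illustration; the seq file is typically exposed via procfs for each
 * mount):
 *
 *	fc stats:
 *	123 commits
 *	4 ineligible
 *	512 numblks
 *	2048us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	"Cross rename":	0
 *	...
 *
 * s_fc_avg_commit_time is tracked in nanoseconds, hence the div_u64() by
 * 1000 to print microseconds.
 */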