1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 /* 16 * Ext4 Fast Commits 17 * ----------------- 18 * 19 * Ext4 fast commits implement fine grained journalling for Ext4. 20 * 21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 23 * TLV during the recovery phase. For the scenarios for which we currently 24 * don't have replay code, fast commit falls back to full commits. 25 * Fast commits record delta in one of the following three categories. 26 * 27 * (A) Directory entry updates: 28 * 29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 30 * - EXT4_FC_TAG_LINK - records directory entry link 31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 32 * 33 * (B) File specific data range updates: 34 * 35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 37 * 38 * (C) Inode metadata (mtime / ctime etc): 39 * 40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 41 * during recovery. Note that iblocks field is 42 * not replayed and instead derived during 43 * replay. 44 * Commit Operation 45 * ---------------- 46 * With fast commits, we maintain all the directory entry operations in the 47 * order in which they are issued in an in-memory queue. This queue is flushed 48 * to disk during the commit operation. We also maintain a list of inodes 49 * that need to be committed during a fast commit in another in memory queue of 50 * inodes. 
During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
 *   back to full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
 *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
 *   followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete.
Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commits tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent.
Consider following sequence of operations: 128 * 129 * rm A; mv B A; read A 130 * (x) (y) (z) 131 * 132 * (x), (y) and (z) are the points at which we can crash. If we store this 133 * sequence of operations as is then the replay is not idempotent. Let's say 134 * while in replay, we crash at (z). During the second replay, file A (which was 135 * actually created as a result of "mv B A" operation) would get deleted. Thus, 136 * file named A would be absent when we try to read A. So, this sequence of 137 * operations is not idempotent. However, as mentioned above, instead of storing 138 * the procedure fast commits store the outcome of each procedure. Thus the fast 139 * commit log for above procedure would be as follows: 140 * 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 142 * inode 11 before the replay) 143 * 144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 145 * (w) (x) (y) (z) 146 * 147 * If we crash at (z), we will have file A linked to inode 11. During the second 148 * replay, we will remove file A (inode 11). But we will create it back and make 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 152 * similarly. Thus, by converting a non-idempotent procedure into a series of 153 * idempotent outcomes, fast commits ensured idempotence during the replay. 154 * 155 * TODOs 156 * ----- 157 * 158 * 0) Fast commit replay path hardening: Fast commit replay code should use 159 * journal handles to make sure all the updates it does during the replay 160 * path are atomic. With that if we crash during fast commit replay, after 161 * trying to do recovery again, we will find a file system where fast commit 162 * area is invalid (because new full commit would be found). 
In order to deal 163 * with that, fast commit replay code should ensure that the "FC_REPLAY" 164 * superblock state is persisted before starting the replay, so that after 165 * the crash, fast commit recovery code can look at that flag and perform 166 * fast commit recovery even if that area is invalidated by later full 167 * commits. 168 * 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 170 * eligible update must be protected within ext4_fc_start_update() and 171 * ext4_fc_stop_update(). These routines are called at much higher 172 * routines. This can be made more fine grained by combining with 173 * ext4_journal_start(). 174 * 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() 176 * 177 * 3) Handle more ineligible cases. 178 */ 179 180 #include <trace/events/ext4.h> 181 static struct kmem_cache *ext4_fc_dentry_cachep; 182 183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 184 { 185 BUFFER_TRACE(bh, ""); 186 if (uptodate) { 187 ext4_debug("%s: Block %lld up-to-date", 188 __func__, bh->b_blocknr); 189 set_buffer_uptodate(bh); 190 } else { 191 ext4_debug("%s: Block %lld not up-to-date", 192 __func__, bh->b_blocknr); 193 clear_buffer_uptodate(bh); 194 } 195 196 unlock_buffer(bh); 197 } 198 199 static inline void ext4_fc_reset_inode(struct inode *inode) 200 { 201 struct ext4_inode_info *ei = EXT4_I(inode); 202 203 ei->i_fc_lblk_start = 0; 204 ei->i_fc_lblk_len = 0; 205 } 206 207 void ext4_fc_init_inode(struct inode *inode) 208 { 209 struct ext4_inode_info *ei = EXT4_I(inode); 210 211 ext4_fc_reset_inode(inode); 212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 213 INIT_LIST_HEAD(&ei->i_fc_list); 214 init_waitqueue_head(&ei->i_fc_wait); 215 atomic_set(&ei->i_fc_updates, 0); 216 } 217 218 /* This function must be called with sbi->s_fc_lock held. 
*/
/*
 * Sleep (uninterruptibly) on the bit waitqueue associated with the
 * EXT4_STATE_FC_COMMITTING bit of @inode. The state bit lives in
 * i_state_flags when BITS_PER_LONG < 64 and in i_flags otherwise, so the
 * waitqueue is looked up on the matching word.
 *
 * s_fc_lock is dropped after the task has been queued and before
 * schedule(); the function returns with the lock released, so callers must
 * re-take it and re-check their condition.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock, then sleep. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit code about the start of an inode update
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 *
 * It is a no-op when fast commit is not enabled on the mount or while fast
 * commit replay is in progress. Otherwise i_fc_updates is incremented under
 * s_fc_lock; the matching decrement is done by ext4_fc_stop_update().
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Inode is on no fast commit list: nothing to wait for. */
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* The wait helper returns with s_fc_lock dropped; retry. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
274 */ 275 void ext4_fc_stop_update(struct inode *inode) 276 { 277 struct ext4_inode_info *ei = EXT4_I(inode); 278 279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 281 return; 282 283 if (atomic_dec_and_test(&ei->i_fc_updates)) 284 wake_up_all(&ei->i_fc_wait); 285 } 286 287 /* 288 * Remove inode from fast commit list. If the inode is being committed 289 * we wait until inode commit is done. 290 */ 291 void ext4_fc_del(struct inode *inode) 292 { 293 struct ext4_inode_info *ei = EXT4_I(inode); 294 295 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 296 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 297 return; 298 299 restart: 300 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 301 if (list_empty(&ei->i_fc_list)) { 302 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 303 return; 304 } 305 306 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 307 ext4_fc_wait_committing_inode(inode); 308 goto restart; 309 } 310 list_del_init(&ei->i_fc_list); 311 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 312 } 313 314 /* 315 * Mark file system as fast commit ineligible. This means that next commit 316 * operation would result in a full jbd2 commit. 317 */ 318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason) 319 { 320 struct ext4_sb_info *sbi = EXT4_SB(sb); 321 322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 324 return; 325 326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 327 WARN_ON(reason >= EXT4_FC_REASON_MAX); 328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 329 } 330 331 /* 332 * Start a fast commit ineligible update. Any commits that happen while 333 * such an operation is in progress fall back to full commits. 
334 */ 335 void ext4_fc_start_ineligible(struct super_block *sb, int reason) 336 { 337 struct ext4_sb_info *sbi = EXT4_SB(sb); 338 339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 341 return; 342 343 WARN_ON(reason >= EXT4_FC_REASON_MAX); 344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 345 atomic_inc(&sbi->s_fc_ineligible_updates); 346 } 347 348 /* 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here 350 * to ensure that after stopping the ineligible update, at least one full 351 * commit takes place. 352 */ 353 void ext4_fc_stop_ineligible(struct super_block *sb) 354 { 355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 357 return; 358 359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); 361 } 362 363 static inline int ext4_fc_is_ineligible(struct super_block *sb) 364 { 365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || 366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); 367 } 368 369 /* 370 * Generic fast commit tracking function. If this is the first time this we are 371 * called after a full commit, we initialize fast commit fields and then call 372 * __fc_track_fn() with update = 0. If we have already been called after a full 373 * commit, we pass update = 1. Based on that, the track function can determine 374 * if it needs to track a field for the first time or if it needs to just 375 * update the previously tracked value. 376 * 377 * If enqueue is set, this function enqueues the inode in fast commit list. 
378 */ 379 static int ext4_fc_track_template( 380 handle_t *handle, struct inode *inode, 381 int (*__fc_track_fn)(struct inode *, void *, bool), 382 void *args, int enqueue) 383 { 384 bool update = false; 385 struct ext4_inode_info *ei = EXT4_I(inode); 386 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 387 tid_t tid = 0; 388 int ret; 389 390 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 391 (sbi->s_mount_state & EXT4_FC_REPLAY)) 392 return -EOPNOTSUPP; 393 394 if (ext4_fc_is_ineligible(inode->i_sb)) 395 return -EINVAL; 396 397 tid = handle->h_transaction->t_tid; 398 mutex_lock(&ei->i_fc_lock); 399 if (tid == ei->i_sync_tid) { 400 update = true; 401 } else { 402 ext4_fc_reset_inode(inode); 403 ei->i_sync_tid = tid; 404 } 405 ret = __fc_track_fn(inode, args, update); 406 mutex_unlock(&ei->i_fc_lock); 407 408 if (!enqueue) 409 return ret; 410 411 spin_lock(&sbi->s_fc_lock); 412 if (list_empty(&EXT4_I(inode)->i_fc_list)) 413 list_add_tail(&EXT4_I(inode)->i_fc_list, 414 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? 415 &sbi->s_fc_q[FC_Q_STAGING] : 416 &sbi->s_fc_q[FC_Q_MAIN]); 417 spin_unlock(&sbi->s_fc_lock); 418 419 return ret; 420 } 421 422 struct __track_dentry_update_args { 423 struct dentry *dentry; 424 int op; 425 }; 426 427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
*/ 428 static int __track_dentry_update(struct inode *inode, void *arg, bool update) 429 { 430 struct ext4_fc_dentry_update *node; 431 struct ext4_inode_info *ei = EXT4_I(inode); 432 struct __track_dentry_update_args *dentry_update = 433 (struct __track_dentry_update_args *)arg; 434 struct dentry *dentry = dentry_update->dentry; 435 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 436 437 mutex_unlock(&ei->i_fc_lock); 438 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 439 if (!node) { 440 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); 441 mutex_lock(&ei->i_fc_lock); 442 return -ENOMEM; 443 } 444 445 node->fcd_op = dentry_update->op; 446 node->fcd_parent = dentry->d_parent->d_inode->i_ino; 447 node->fcd_ino = inode->i_ino; 448 if (dentry->d_name.len > DNAME_INLINE_LEN) { 449 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 450 if (!node->fcd_name.name) { 451 kmem_cache_free(ext4_fc_dentry_cachep, node); 452 ext4_fc_mark_ineligible(inode->i_sb, 453 EXT4_FC_REASON_NOMEM); 454 mutex_lock(&ei->i_fc_lock); 455 return -ENOMEM; 456 } 457 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 458 dentry->d_name.len); 459 } else { 460 memcpy(node->fcd_iname, dentry->d_name.name, 461 dentry->d_name.len); 462 node->fcd_name.name = node->fcd_iname; 463 } 464 node->fcd_name.len = dentry->d_name.len; 465 466 spin_lock(&sbi->s_fc_lock); 467 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) 468 list_add_tail(&node->fcd_list, 469 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 470 else 471 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 472 spin_unlock(&sbi->s_fc_lock); 473 mutex_lock(&ei->i_fc_lock); 474 475 return 0; 476 } 477 478 void __ext4_fc_track_unlink(handle_t *handle, 479 struct inode *inode, struct dentry *dentry) 480 { 481 struct __track_dentry_update_args args; 482 int ret; 483 484 args.dentry = dentry; 485 args.op = EXT4_FC_TAG_UNLINK; 486 487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 488 
(void *)&args, 0); 489 trace_ext4_fc_track_unlink(inode, dentry, ret); 490 } 491 492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 493 { 494 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); 495 } 496 497 void __ext4_fc_track_link(handle_t *handle, 498 struct inode *inode, struct dentry *dentry) 499 { 500 struct __track_dentry_update_args args; 501 int ret; 502 503 args.dentry = dentry; 504 args.op = EXT4_FC_TAG_LINK; 505 506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 507 (void *)&args, 0); 508 trace_ext4_fc_track_link(inode, dentry, ret); 509 } 510 511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 512 { 513 __ext4_fc_track_link(handle, d_inode(dentry), dentry); 514 } 515 516 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 517 struct dentry *dentry) 518 { 519 struct __track_dentry_update_args args; 520 int ret; 521 522 args.dentry = dentry; 523 args.op = EXT4_FC_TAG_CREAT; 524 525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 526 (void *)&args, 0); 527 trace_ext4_fc_track_create(inode, dentry, ret); 528 } 529 530 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 531 { 532 __ext4_fc_track_create(handle, d_inode(dentry), dentry); 533 } 534 535 /* __track_fn for inode tracking */ 536 static int __track_inode(struct inode *inode, void *arg, bool update) 537 { 538 if (update) 539 return -EEXIST; 540 541 EXT4_I(inode)->i_fc_lblk_len = 0; 542 543 return 0; 544 } 545 546 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 547 { 548 int ret; 549 550 if (S_ISDIR(inode->i_mode)) 551 return; 552 553 if (ext4_should_journal_data(inode)) { 554 ext4_fc_mark_ineligible(inode->i_sb, 555 EXT4_FC_REASON_INODE_JOURNAL_DATA); 556 return; 557 } 558 559 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 560 trace_ext4_fc_track_inode(inode, ret); 561 } 562 563 struct __track_range_args { 564 ext4_lblk_t start, end; 565 }; 566 567 /* 
__track_fn for tracking data updates */ 568 static int __track_range(struct inode *inode, void *arg, bool update) 569 { 570 struct ext4_inode_info *ei = EXT4_I(inode); 571 ext4_lblk_t oldstart; 572 struct __track_range_args *__arg = 573 (struct __track_range_args *)arg; 574 575 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 576 ext4_debug("Special inode %ld being modified\n", inode->i_ino); 577 return -ECANCELED; 578 } 579 580 oldstart = ei->i_fc_lblk_start; 581 582 if (update && ei->i_fc_lblk_len > 0) { 583 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 584 ei->i_fc_lblk_len = 585 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 586 ei->i_fc_lblk_start + 1; 587 } else { 588 ei->i_fc_lblk_start = __arg->start; 589 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 590 } 591 592 return 0; 593 } 594 595 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 596 ext4_lblk_t end) 597 { 598 struct __track_range_args args; 599 int ret; 600 601 if (S_ISDIR(inode->i_mode)) 602 return; 603 604 args.start = start; 605 args.end = end; 606 607 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 608 609 trace_ext4_fc_track_range(inode, start, end, ret); 610 } 611 612 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 613 { 614 int write_flags = REQ_SYNC; 615 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 616 617 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 618 if (test_opt(sb, BARRIER) && is_tail) 619 write_flags |= REQ_FUA | REQ_PREFLUSH; 620 lock_buffer(bh); 621 set_buffer_dirty(bh); 622 set_buffer_uptodate(bh); 623 bh->b_end_io = ext4_end_buffer_io_sync; 624 submit_bh(REQ_OP_WRITE, write_flags, bh); 625 EXT4_SB(sb)->s_fc_bh = NULL; 626 } 627 628 /* Ext4 commit path routines */ 629 630 /* memzero and update CRC */ 631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, 632 u32 *crc) 633 { 634 void *ret; 635 636 ret = memset(dst, 0, len); 637 if (crc) 638 *crc = 
ext4_chksum(EXT4_SB(sb), *crc, dst, len); 639 return ret; 640 } 641 642 /* 643 * Allocate len bytes on a fast commit buffer. 644 * 645 * During the commit time this function is used to manage fast commit 646 * block space. We don't split a fast commit log onto different 647 * blocks. So this function makes sure that if there's not enough space 648 * on the current block, the remaining space in the current block is 649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 650 * new block is from jbd2 and CRC is updated to reflect the padding 651 * we added. 652 */ 653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 654 { 655 struct ext4_fc_tl *tl; 656 struct ext4_sb_info *sbi = EXT4_SB(sb); 657 struct buffer_head *bh; 658 int bsize = sbi->s_journal->j_blocksize; 659 int ret, off = sbi->s_fc_bytes % bsize; 660 int pad_len; 661 662 /* 663 * After allocating len, we should have space at least for a 0 byte 664 * padding. 665 */ 666 if (len + sizeof(struct ext4_fc_tl) > bsize) 667 return NULL; 668 669 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { 670 /* 671 * Only allocate from current buffer if we have enough space for 672 * this request AND we have space to add a zero byte padding. 
673 */ 674 if (!sbi->s_fc_bh) { 675 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 676 if (ret) 677 return NULL; 678 sbi->s_fc_bh = bh; 679 } 680 sbi->s_fc_bytes += len; 681 return sbi->s_fc_bh->b_data + off; 682 } 683 /* Need to add PAD tag */ 684 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); 685 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 686 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); 687 tl->fc_len = cpu_to_le16(pad_len); 688 if (crc) 689 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); 690 if (pad_len > 0) 691 ext4_fc_memzero(sb, tl + 1, pad_len, crc); 692 ext4_fc_submit_bh(sb, false); 693 694 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 695 if (ret) 696 return NULL; 697 sbi->s_fc_bh = bh; 698 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; 699 return sbi->s_fc_bh->b_data; 700 } 701 702 /* memcpy to fc reserved space and update CRC */ 703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, 704 int len, u32 *crc) 705 { 706 if (crc) 707 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); 708 return memcpy(dst, src, len); 709 } 710 711 /* 712 * Complete a fast commit by writing tail tag. 713 * 714 * Writing tail tag marks the end of a fast commit. In order to guarantee 715 * atomicity, after writing tail tag, even if there's space remaining 716 * in the block, next commit shouldn't use it. That's why tail tag 717 * has the length as that of the remaining space on the block. 718 */ 719 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 720 { 721 struct ext4_sb_info *sbi = EXT4_SB(sb); 722 struct ext4_fc_tl tl; 723 struct ext4_fc_tail tail; 724 int off, bsize = sbi->s_journal->j_blocksize; 725 u8 *dst; 726 727 /* 728 * ext4_fc_reserve_space takes care of allocating an extra block if 729 * there's no enough space on this block for accommodating this tail. 
730 */ 731 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 732 if (!dst) 733 return -ENOSPC; 734 735 off = sbi->s_fc_bytes % bsize; 736 737 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 738 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 739 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 740 741 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 742 dst += sizeof(tl); 743 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 744 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 745 dst += sizeof(tail.fc_tid); 746 tail.fc_crc = cpu_to_le32(crc); 747 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 748 749 ext4_fc_submit_bh(sb, true); 750 751 return 0; 752 } 753 754 /* 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 756 * Returns false if there's not enough space. 757 */ 758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 759 u32 *crc) 760 { 761 struct ext4_fc_tl tl; 762 u8 *dst; 763 764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 765 if (!dst) 766 return false; 767 768 tl.fc_tag = cpu_to_le16(tag); 769 tl.fc_len = cpu_to_le16(len); 770 771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 773 774 return true; 775 } 776 777 /* Same as above, but adds dentry tlv. 
*/ 778 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, 779 int parent_ino, int ino, int dlen, 780 const unsigned char *dname, 781 u32 *crc) 782 { 783 struct ext4_fc_dentry_info fcd; 784 struct ext4_fc_tl tl; 785 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, 786 crc); 787 788 if (!dst) 789 return false; 790 791 fcd.fc_parent_ino = cpu_to_le32(parent_ino); 792 fcd.fc_ino = cpu_to_le32(ino); 793 tl.fc_tag = cpu_to_le16(tag); 794 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 795 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 796 dst += sizeof(tl); 797 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); 798 dst += sizeof(fcd); 799 ext4_fc_memcpy(sb, dst, dname, dlen, crc); 800 dst += dlen; 801 802 return true; 803 } 804 805 /* 806 * Writes inode in the fast commit space under TLV with tag @tag. 807 * Returns 0 on success, error on failure. 808 */ 809 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 810 { 811 struct ext4_inode_info *ei = EXT4_I(inode); 812 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 813 int ret; 814 struct ext4_iloc iloc; 815 struct ext4_fc_inode fc_inode; 816 struct ext4_fc_tl tl; 817 u8 *dst; 818 819 ret = ext4_get_inode_loc(inode, &iloc); 820 if (ret) 821 return ret; 822 823 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 824 inode_len += ei->i_extra_isize; 825 826 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 827 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 828 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 829 830 dst = ext4_fc_reserve_space(inode->i_sb, 831 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); 832 if (!dst) 833 return -ECANCELED; 834 835 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) 836 return -ECANCELED; 837 dst += sizeof(tl); 838 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) 839 return -ECANCELED; 840 dst += sizeof(fc_inode); 841 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), 842 inode_len, 
crc)) 843 return -ECANCELED; 844 845 return 0; 846 } 847 848 /* 849 * Writes updated data ranges for the inode in question. Updates CRC. 850 * Returns 0 on success, error otherwise. 851 */ 852 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 853 { 854 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 855 struct ext4_inode_info *ei = EXT4_I(inode); 856 struct ext4_map_blocks map; 857 struct ext4_fc_add_range fc_ext; 858 struct ext4_fc_del_range lrange; 859 struct ext4_extent *ex; 860 int ret; 861 862 mutex_lock(&ei->i_fc_lock); 863 if (ei->i_fc_lblk_len == 0) { 864 mutex_unlock(&ei->i_fc_lock); 865 return 0; 866 } 867 old_blk_size = ei->i_fc_lblk_start; 868 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 869 ei->i_fc_lblk_len = 0; 870 mutex_unlock(&ei->i_fc_lock); 871 872 cur_lblk_off = old_blk_size; 873 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 874 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 875 876 while (cur_lblk_off <= new_blk_size) { 877 map.m_lblk = cur_lblk_off; 878 map.m_len = new_blk_size - cur_lblk_off + 1; 879 ret = ext4_map_blocks(NULL, inode, &map, 0); 880 if (ret < 0) 881 return -ECANCELED; 882 883 if (map.m_len == 0) { 884 cur_lblk_off++; 885 continue; 886 } 887 888 if (ret == 0) { 889 lrange.fc_ino = cpu_to_le32(inode->i_ino); 890 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 891 lrange.fc_len = cpu_to_le32(map.m_len); 892 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 893 sizeof(lrange), (u8 *)&lrange, crc)) 894 return -ENOSPC; 895 } else { 896 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 897 ex = (struct ext4_extent *)&fc_ext.fc_ex; 898 ex->ee_block = cpu_to_le32(map.m_lblk); 899 ex->ee_len = cpu_to_le16(map.m_len); 900 ext4_ext_store_pblock(ex, map.m_pblk); 901 if (map.m_flags & EXT4_MAP_UNWRITTEN) 902 ext4_ext_mark_unwritten(ex); 903 else 904 ext4_ext_mark_initialized(ex); 905 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 906 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 907 
return -ENOSPC; 908 } 909 910 cur_lblk_off += map.m_len; 911 } 912 913 return 0; 914 } 915 916 917 /* Submit data for all the fast commit inodes */ 918 static int ext4_fc_submit_inode_data_all(journal_t *journal) 919 { 920 struct super_block *sb = (struct super_block *)(journal->j_private); 921 struct ext4_sb_info *sbi = EXT4_SB(sb); 922 struct ext4_inode_info *ei; 923 int ret = 0; 924 925 spin_lock(&sbi->s_fc_lock); 926 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); 927 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 928 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 929 while (atomic_read(&ei->i_fc_updates)) { 930 DEFINE_WAIT(wait); 931 932 prepare_to_wait(&ei->i_fc_wait, &wait, 933 TASK_UNINTERRUPTIBLE); 934 if (atomic_read(&ei->i_fc_updates)) { 935 spin_unlock(&sbi->s_fc_lock); 936 schedule(); 937 spin_lock(&sbi->s_fc_lock); 938 } 939 finish_wait(&ei->i_fc_wait, &wait); 940 } 941 spin_unlock(&sbi->s_fc_lock); 942 ret = jbd2_submit_inode_data(ei->jinode); 943 if (ret) 944 return ret; 945 spin_lock(&sbi->s_fc_lock); 946 } 947 spin_unlock(&sbi->s_fc_lock); 948 949 return ret; 950 } 951 952 /* Wait for completion of data for all the fast commit inodes */ 953 static int ext4_fc_wait_inode_data_all(journal_t *journal) 954 { 955 struct super_block *sb = (struct super_block *)(journal->j_private); 956 struct ext4_sb_info *sbi = EXT4_SB(sb); 957 struct ext4_inode_info *pos, *n; 958 int ret = 0; 959 960 spin_lock(&sbi->s_fc_lock); 961 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 962 if (!ext4_test_inode_state(&pos->vfs_inode, 963 EXT4_STATE_FC_COMMITTING)) 964 continue; 965 spin_unlock(&sbi->s_fc_lock); 966 967 ret = jbd2_wait_inode_data(journal, pos->jinode); 968 if (ret) 969 return ret; 970 spin_lock(&sbi->s_fc_lock); 971 } 972 spin_unlock(&sbi->s_fc_lock); 973 974 return 0; 975 } 976 977 /* Commit all the directory entry updates */ 978 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 979 
__acquires(&sbi->s_fc_lock) 980 __releases(&sbi->s_fc_lock) 981 { 982 struct super_block *sb = (struct super_block *)(journal->j_private); 983 struct ext4_sb_info *sbi = EXT4_SB(sb); 984 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 985 struct inode *inode; 986 struct ext4_inode_info *ei, *ei_n; 987 int ret; 988 989 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 990 return 0; 991 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 992 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 993 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 994 spin_unlock(&sbi->s_fc_lock); 995 if (!ext4_fc_add_dentry_tlv( 996 sb, fc_dentry->fcd_op, 997 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 998 fc_dentry->fcd_name.len, 999 fc_dentry->fcd_name.name, crc)) { 1000 ret = -ENOSPC; 1001 goto lock_and_exit; 1002 } 1003 spin_lock(&sbi->s_fc_lock); 1004 continue; 1005 } 1006 1007 inode = NULL; 1008 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], 1009 i_fc_list) { 1010 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { 1011 inode = &ei->vfs_inode; 1012 break; 1013 } 1014 } 1015 /* 1016 * If we don't find inode in our list, then it was deleted, 1017 * in which case, we don't need to record it's create tag. 1018 */ 1019 if (!inode) 1020 continue; 1021 spin_unlock(&sbi->s_fc_lock); 1022 1023 /* 1024 * We first write the inode and then the create dirent. This 1025 * allows the recovery code to create an unnamed inode first 1026 * and then link it to a directory entry. This allows us 1027 * to use namei.c routines almost as is and simplifies 1028 * the recovery code. 
1029 */ 1030 ret = ext4_fc_write_inode(inode, crc); 1031 if (ret) 1032 goto lock_and_exit; 1033 1034 ret = ext4_fc_write_inode_data(inode, crc); 1035 if (ret) 1036 goto lock_and_exit; 1037 1038 if (!ext4_fc_add_dentry_tlv( 1039 sb, fc_dentry->fcd_op, 1040 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 1041 fc_dentry->fcd_name.len, 1042 fc_dentry->fcd_name.name, crc)) { 1043 ret = -ENOSPC; 1044 goto lock_and_exit; 1045 } 1046 1047 spin_lock(&sbi->s_fc_lock); 1048 } 1049 return 0; 1050 lock_and_exit: 1051 spin_lock(&sbi->s_fc_lock); 1052 return ret; 1053 } 1054 1055 static int ext4_fc_perform_commit(journal_t *journal) 1056 { 1057 struct super_block *sb = (struct super_block *)(journal->j_private); 1058 struct ext4_sb_info *sbi = EXT4_SB(sb); 1059 struct ext4_inode_info *iter; 1060 struct ext4_fc_head head; 1061 struct inode *inode; 1062 struct blk_plug plug; 1063 int ret = 0; 1064 u32 crc = 0; 1065 1066 ret = ext4_fc_submit_inode_data_all(journal); 1067 if (ret) 1068 return ret; 1069 1070 ret = ext4_fc_wait_inode_data_all(journal); 1071 if (ret) 1072 return ret; 1073 1074 /* 1075 * If file system device is different from journal device, issue a cache 1076 * flush before we start writing fast commit blocks. 1077 */ 1078 if (journal->j_fs_dev != journal->j_dev) 1079 blkdev_issue_flush(journal->j_fs_dev); 1080 1081 blk_start_plug(&plug); 1082 if (sbi->s_fc_bytes == 0) { 1083 /* 1084 * Add a head tag only if this is the first fast commit 1085 * in this TID. 
1086 */ 1087 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1088 head.fc_tid = cpu_to_le32( 1089 sbi->s_journal->j_running_transaction->t_tid); 1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1091 (u8 *)&head, &crc)) { 1092 ret = -ENOSPC; 1093 goto out; 1094 } 1095 } 1096 1097 spin_lock(&sbi->s_fc_lock); 1098 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1099 if (ret) { 1100 spin_unlock(&sbi->s_fc_lock); 1101 goto out; 1102 } 1103 1104 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1105 inode = &iter->vfs_inode; 1106 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1107 continue; 1108 1109 spin_unlock(&sbi->s_fc_lock); 1110 ret = ext4_fc_write_inode_data(inode, &crc); 1111 if (ret) 1112 goto out; 1113 ret = ext4_fc_write_inode(inode, &crc); 1114 if (ret) 1115 goto out; 1116 spin_lock(&sbi->s_fc_lock); 1117 } 1118 spin_unlock(&sbi->s_fc_lock); 1119 1120 ret = ext4_fc_write_tail(sb, crc); 1121 1122 out: 1123 blk_finish_plug(&plug); 1124 return ret; 1125 } 1126 1127 /* 1128 * The main commit entry point. Performs a fast commit for transaction 1129 * commit_tid if needed. If it's not possible to perform a fast commit 1130 * due to various reasons, we fall back to full commit. Returns 0 1131 * on success, error otherwise. 
*/
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	/* Snapshot of the sub-transaction counter, used by the restart check. */
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fast commit disabled or currently ineligible: take the full path. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Blocks already used in the fast commit area, rounded up. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	/* Number of blocks this commit added; those are the ones waited on. */
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* Statistics are updated under s_fc_lock. */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	/* Only a successful fast commit reports a block count to the trace. */
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Running average: the stored average is weighted 3/4 and the new
	 * sample 1/4, so we don't react too strongly to vast changes in the
	 * commit time. The first sample seeds the average.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* A failed fast commit that had already begun must be ended via fallback. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	/* Otherwise, when no fast commit was done, complete the transaction fully. */
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
1225 */ 1226 static void ext4_fc_cleanup(journal_t *journal, int full) 1227 { 1228 struct super_block *sb = journal->j_private; 1229 struct ext4_sb_info *sbi = EXT4_SB(sb); 1230 struct ext4_inode_info *iter, *iter_n; 1231 struct ext4_fc_dentry_update *fc_dentry; 1232 1233 if (full && sbi->s_fc_bh) 1234 sbi->s_fc_bh = NULL; 1235 1236 jbd2_fc_release_bufs(journal); 1237 1238 spin_lock(&sbi->s_fc_lock); 1239 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1240 i_fc_list) { 1241 list_del_init(&iter->i_fc_list); 1242 ext4_clear_inode_state(&iter->vfs_inode, 1243 EXT4_STATE_FC_COMMITTING); 1244 ext4_fc_reset_inode(&iter->vfs_inode); 1245 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1246 smp_mb(); 1247 #if (BITS_PER_LONG < 64) 1248 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1249 #else 1250 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1251 #endif 1252 } 1253 1254 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1255 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1256 struct ext4_fc_dentry_update, 1257 fcd_list); 1258 list_del_init(&fc_dentry->fcd_list); 1259 spin_unlock(&sbi->s_fc_lock); 1260 1261 if (fc_dentry->fcd_name.name && 1262 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1263 kfree(fc_dentry->fcd_name.name); 1264 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1265 spin_lock(&sbi->s_fc_lock); 1266 } 1267 1268 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1269 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1270 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1271 &sbi->s_fc_q[FC_Q_MAIN]); 1272 1273 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 1274 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1275 1276 if (full) 1277 sbi->s_fc_bytes = 0; 1278 spin_unlock(&sbi->s_fc_lock); 1279 trace_ext4_fc_stats(sb); 1280 } 1281 1282 /* Ext4 Replay Path Routines */ 1283 1284 /* Helper struct for dentry replay routines */ 1285 struct dentry_info_args { 1286 int parent_ino, dname_len, ino, inode_len; 1287 char 
*dname; 1288 }; 1289 1290 static inline void tl_to_darg(struct dentry_info_args *darg, 1291 struct ext4_fc_tl *tl, u8 *val) 1292 { 1293 struct ext4_fc_dentry_info fcd; 1294 1295 memcpy(&fcd, val, sizeof(fcd)); 1296 1297 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1298 darg->ino = le32_to_cpu(fcd.fc_ino); 1299 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1300 darg->dname_len = le16_to_cpu(tl->fc_len) - 1301 sizeof(struct ext4_fc_dentry_info); 1302 } 1303 1304 /* Unlink replay function */ 1305 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, 1306 u8 *val) 1307 { 1308 struct inode *inode, *old_parent; 1309 struct qstr entry; 1310 struct dentry_info_args darg; 1311 int ret = 0; 1312 1313 tl_to_darg(&darg, tl, val); 1314 1315 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1316 darg.parent_ino, darg.dname_len); 1317 1318 entry.name = darg.dname; 1319 entry.len = darg.dname_len; 1320 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1321 1322 if (IS_ERR(inode)) { 1323 jbd_debug(1, "Inode %d not found", darg.ino); 1324 return 0; 1325 } 1326 1327 old_parent = ext4_iget(sb, darg.parent_ino, 1328 EXT4_IGET_NORMAL); 1329 if (IS_ERR(old_parent)) { 1330 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); 1331 iput(inode); 1332 return 0; 1333 } 1334 1335 ret = __ext4_unlink(NULL, old_parent, &entry, inode); 1336 /* -ENOENT ok coz it might not exist anymore. 
*/ 1337 if (ret == -ENOENT) 1338 ret = 0; 1339 iput(old_parent); 1340 iput(inode); 1341 return ret; 1342 } 1343 1344 static int ext4_fc_replay_link_internal(struct super_block *sb, 1345 struct dentry_info_args *darg, 1346 struct inode *inode) 1347 { 1348 struct inode *dir = NULL; 1349 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1350 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1351 int ret = 0; 1352 1353 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1354 if (IS_ERR(dir)) { 1355 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); 1356 dir = NULL; 1357 goto out; 1358 } 1359 1360 dentry_dir = d_obtain_alias(dir); 1361 if (IS_ERR(dentry_dir)) { 1362 jbd_debug(1, "Failed to obtain dentry"); 1363 dentry_dir = NULL; 1364 goto out; 1365 } 1366 1367 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1368 if (!dentry_inode) { 1369 jbd_debug(1, "Inode dentry not created."); 1370 ret = -ENOMEM; 1371 goto out; 1372 } 1373 1374 ret = __ext4_link(dir, inode, dentry_inode); 1375 /* 1376 * It's possible that link already existed since data blocks 1377 * for the dir in question got persisted before we crashed OR 1378 * we replayed this tag and crashed before the entire replay 1379 * could complete. 
1380 */ 1381 if (ret && ret != -EEXIST) { 1382 jbd_debug(1, "Failed to link\n"); 1383 goto out; 1384 } 1385 1386 ret = 0; 1387 out: 1388 if (dentry_dir) { 1389 d_drop(dentry_dir); 1390 dput(dentry_dir); 1391 } else if (dir) { 1392 iput(dir); 1393 } 1394 if (dentry_inode) { 1395 d_drop(dentry_inode); 1396 dput(dentry_inode); 1397 } 1398 1399 return ret; 1400 } 1401 1402 /* Link replay function */ 1403 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, 1404 u8 *val) 1405 { 1406 struct inode *inode; 1407 struct dentry_info_args darg; 1408 int ret = 0; 1409 1410 tl_to_darg(&darg, tl, val); 1411 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1412 darg.parent_ino, darg.dname_len); 1413 1414 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1415 if (IS_ERR(inode)) { 1416 jbd_debug(1, "Inode not found."); 1417 return 0; 1418 } 1419 1420 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1421 iput(inode); 1422 return ret; 1423 } 1424 1425 /* 1426 * Record all the modified inodes during replay. We use this later to setup 1427 * block bitmaps correctly. 
1428 */ 1429 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1430 { 1431 struct ext4_fc_replay_state *state; 1432 int i; 1433 1434 state = &EXT4_SB(sb)->s_fc_replay_state; 1435 for (i = 0; i < state->fc_modified_inodes_used; i++) 1436 if (state->fc_modified_inodes[i] == ino) 1437 return 0; 1438 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1439 state->fc_modified_inodes_size += 1440 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1441 state->fc_modified_inodes = krealloc( 1442 state->fc_modified_inodes, sizeof(int) * 1443 state->fc_modified_inodes_size, 1444 GFP_KERNEL); 1445 if (!state->fc_modified_inodes) 1446 return -ENOMEM; 1447 } 1448 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1449 return 0; 1450 } 1451 1452 /* 1453 * Inode replay function 1454 */ 1455 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, 1456 u8 *val) 1457 { 1458 struct ext4_fc_inode fc_inode; 1459 struct ext4_inode *raw_inode; 1460 struct ext4_inode *raw_fc_inode; 1461 struct inode *inode = NULL; 1462 struct ext4_iloc iloc; 1463 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1464 struct ext4_extent_header *eh; 1465 1466 memcpy(&fc_inode, val, sizeof(fc_inode)); 1467 1468 ino = le32_to_cpu(fc_inode.fc_ino); 1469 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1470 1471 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1472 if (!IS_ERR(inode)) { 1473 ext4_ext_clear_bb(inode); 1474 iput(inode); 1475 } 1476 inode = NULL; 1477 1478 ext4_fc_record_modified_inode(sb, ino); 1479 1480 raw_fc_inode = (struct ext4_inode *) 1481 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1482 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1483 if (ret) 1484 goto out; 1485 1486 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); 1487 raw_inode = ext4_raw_inode(&iloc); 1488 1489 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1490 memcpy(&raw_inode->i_generation, 
&raw_fc_inode->i_generation, 1491 inode_len - offsetof(struct ext4_inode, i_generation)); 1492 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1493 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1494 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1495 memset(eh, 0, sizeof(*eh)); 1496 eh->eh_magic = EXT4_EXT_MAGIC; 1497 eh->eh_max = cpu_to_le16( 1498 (sizeof(raw_inode->i_block) - 1499 sizeof(struct ext4_extent_header)) 1500 / sizeof(struct ext4_extent)); 1501 } 1502 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1503 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1504 sizeof(raw_inode->i_block)); 1505 } 1506 1507 /* Immediately update the inode on disk. */ 1508 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1509 if (ret) 1510 goto out; 1511 ret = sync_dirty_buffer(iloc.bh); 1512 if (ret) 1513 goto out; 1514 ret = ext4_mark_inode_used(sb, ino); 1515 if (ret) 1516 goto out; 1517 1518 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1519 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1520 if (IS_ERR(inode)) { 1521 jbd_debug(1, "Inode not found."); 1522 return -EFSCORRUPTED; 1523 } 1524 1525 /* 1526 * Our allocator could have made different decisions than before 1527 * crashing. This should be fixed but until then, we calculate 1528 * the number of blocks the inode. 1529 */ 1530 ext4_ext_replay_set_iblocks(inode); 1531 1532 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1533 ext4_reset_inode_seed(inode); 1534 1535 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1536 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1537 sync_dirty_buffer(iloc.bh); 1538 brelse(iloc.bh); 1539 out: 1540 iput(inode); 1541 if (!ret) 1542 blkdev_issue_flush(sb->s_bdev); 1543 1544 return 0; 1545 } 1546 1547 /* 1548 * Dentry create replay function. 1549 * 1550 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. 
Which means, the 1551 * inode for which we are trying to create a dentry here, should already have 1552 * been replayed before we start here. 1553 */ 1554 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, 1555 u8 *val) 1556 { 1557 int ret = 0; 1558 struct inode *inode = NULL; 1559 struct inode *dir = NULL; 1560 struct dentry_info_args darg; 1561 1562 tl_to_darg(&darg, tl, val); 1563 1564 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1565 darg.parent_ino, darg.dname_len); 1566 1567 /* This takes care of update group descriptor and other metadata */ 1568 ret = ext4_mark_inode_used(sb, darg.ino); 1569 if (ret) 1570 goto out; 1571 1572 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1573 if (IS_ERR(inode)) { 1574 jbd_debug(1, "inode %d not found.", darg.ino); 1575 inode = NULL; 1576 ret = -EINVAL; 1577 goto out; 1578 } 1579 1580 if (S_ISDIR(inode->i_mode)) { 1581 /* 1582 * If we are creating a directory, we need to make sure that the 1583 * dot and dot dot dirents are setup properly. 1584 */ 1585 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1586 if (IS_ERR(dir)) { 1587 jbd_debug(1, "Dir %d not found.", darg.ino); 1588 goto out; 1589 } 1590 ret = ext4_init_new_dir(NULL, dir, inode); 1591 iput(dir); 1592 if (ret) { 1593 ret = 0; 1594 goto out; 1595 } 1596 } 1597 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1598 if (ret) 1599 goto out; 1600 set_nlink(inode, 1); 1601 ext4_mark_inode_dirty(NULL, inode); 1602 out: 1603 if (inode) 1604 iput(inode); 1605 return ret; 1606 } 1607 1608 /* 1609 * Record physical disk regions which are in use as per fast commit area. Our 1610 * simple replay phase allocator excludes these regions from allocation. 
1611 */ 1612 static int ext4_fc_record_regions(struct super_block *sb, int ino, 1613 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1614 { 1615 struct ext4_fc_replay_state *state; 1616 struct ext4_fc_alloc_region *region; 1617 1618 state = &EXT4_SB(sb)->s_fc_replay_state; 1619 if (state->fc_regions_used == state->fc_regions_size) { 1620 state->fc_regions_size += 1621 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1622 state->fc_regions = krealloc( 1623 state->fc_regions, 1624 state->fc_regions_size * 1625 sizeof(struct ext4_fc_alloc_region), 1626 GFP_KERNEL); 1627 if (!state->fc_regions) 1628 return -ENOMEM; 1629 } 1630 region = &state->fc_regions[state->fc_regions_used++]; 1631 region->ino = ino; 1632 region->lblk = lblk; 1633 region->pblk = pblk; 1634 region->len = len; 1635 1636 return 0; 1637 } 1638 1639 /* Replay add range tag */ 1640 static int ext4_fc_replay_add_range(struct super_block *sb, 1641 struct ext4_fc_tl *tl, u8 *val) 1642 { 1643 struct ext4_fc_add_range fc_add_ex; 1644 struct ext4_extent newex, *ex; 1645 struct inode *inode; 1646 ext4_lblk_t start, cur; 1647 int remaining, len; 1648 ext4_fsblk_t start_pblk; 1649 struct ext4_map_blocks map; 1650 struct ext4_ext_path *path = NULL; 1651 int ret; 1652 1653 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1654 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1655 1656 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1657 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1658 ext4_ext_get_actual_len(ex)); 1659 1660 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1661 if (IS_ERR(inode)) { 1662 jbd_debug(1, "Inode not found."); 1663 return 0; 1664 } 1665 1666 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1667 1668 start = le32_to_cpu(ex->ee_block); 1669 start_pblk = ext4_ext_pblock(ex); 1670 len = ext4_ext_get_actual_len(ex); 1671 1672 cur = start; 1673 remaining = len; 1674 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1675 start, start_pblk, len, 
ext4_ext_is_unwritten(ex), 1676 inode->i_ino); 1677 1678 while (remaining > 0) { 1679 map.m_lblk = cur; 1680 map.m_len = remaining; 1681 map.m_pblk = 0; 1682 ret = ext4_map_blocks(NULL, inode, &map, 0); 1683 1684 if (ret < 0) { 1685 iput(inode); 1686 return 0; 1687 } 1688 1689 if (ret == 0) { 1690 /* Range is not mapped */ 1691 path = ext4_find_extent(inode, cur, NULL, 0); 1692 if (IS_ERR(path)) { 1693 iput(inode); 1694 return 0; 1695 } 1696 memset(&newex, 0, sizeof(newex)); 1697 newex.ee_block = cpu_to_le32(cur); 1698 ext4_ext_store_pblock( 1699 &newex, start_pblk + cur - start); 1700 newex.ee_len = cpu_to_le16(map.m_len); 1701 if (ext4_ext_is_unwritten(ex)) 1702 ext4_ext_mark_unwritten(&newex); 1703 down_write(&EXT4_I(inode)->i_data_sem); 1704 ret = ext4_ext_insert_extent( 1705 NULL, inode, &path, &newex, 0); 1706 up_write((&EXT4_I(inode)->i_data_sem)); 1707 ext4_ext_drop_refs(path); 1708 kfree(path); 1709 if (ret) { 1710 iput(inode); 1711 return 0; 1712 } 1713 goto next; 1714 } 1715 1716 if (start_pblk + cur - start != map.m_pblk) { 1717 /* 1718 * Logical to physical mapping changed. This can happen 1719 * if this range was removed and then reallocated to 1720 * map to new physical blocks during a fast commit. 1721 */ 1722 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1723 ext4_ext_is_unwritten(ex), 1724 start_pblk + cur - start); 1725 if (ret) { 1726 iput(inode); 1727 return 0; 1728 } 1729 /* 1730 * Mark the old blocks as free since they aren't used 1731 * anymore. We maintain an array of all the modified 1732 * inodes. In case these blocks are still used at either 1733 * a different logical range in the same inode or in 1734 * some different inode, we will mark them as allocated 1735 * at the end of the FC replay using our array of 1736 * modified inodes. 
1737 */ 1738 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1739 goto next; 1740 } 1741 1742 /* Range is mapped and needs a state change */ 1743 jbd_debug(1, "Converting from %ld to %d %lld", 1744 map.m_flags & EXT4_MAP_UNWRITTEN, 1745 ext4_ext_is_unwritten(ex), map.m_pblk); 1746 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1747 ext4_ext_is_unwritten(ex), map.m_pblk); 1748 if (ret) { 1749 iput(inode); 1750 return 0; 1751 } 1752 /* 1753 * We may have split the extent tree while toggling the state. 1754 * Try to shrink the extent tree now. 1755 */ 1756 ext4_ext_replay_shrink_inode(inode, start + len); 1757 next: 1758 cur += map.m_len; 1759 remaining -= map.m_len; 1760 } 1761 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1762 sb->s_blocksize_bits); 1763 iput(inode); 1764 return 0; 1765 } 1766 1767 /* Replay DEL_RANGE tag */ 1768 static int 1769 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, 1770 u8 *val) 1771 { 1772 struct inode *inode; 1773 struct ext4_fc_del_range lrange; 1774 struct ext4_map_blocks map; 1775 ext4_lblk_t cur, remaining; 1776 int ret; 1777 1778 memcpy(&lrange, val, sizeof(lrange)); 1779 cur = le32_to_cpu(lrange.fc_lblk); 1780 remaining = le32_to_cpu(lrange.fc_len); 1781 1782 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1783 le32_to_cpu(lrange.fc_ino), cur, remaining); 1784 1785 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1786 if (IS_ERR(inode)) { 1787 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1788 return 0; 1789 } 1790 1791 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1792 1793 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1794 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1795 le32_to_cpu(lrange.fc_len)); 1796 while (remaining > 0) { 1797 map.m_lblk = cur; 1798 map.m_len = remaining; 1799 1800 ret = ext4_map_blocks(NULL, inode, &map, 0); 1801 if (ret < 0) { 1802 iput(inode); 1803 return 0; 1804 } 1805 if (ret > 0) { 1806 
remaining -= ret; 1807 cur += ret; 1808 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1809 } else { 1810 remaining -= map.m_len; 1811 cur += map.m_len; 1812 } 1813 } 1814 1815 ret = ext4_punch_hole(inode, 1816 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits, 1817 le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits); 1818 if (ret) 1819 jbd_debug(1, "ext4_punch_hole returned %d", ret); 1820 ext4_ext_replay_shrink_inode(inode, 1821 i_size_read(inode) >> sb->s_blocksize_bits); 1822 ext4_mark_inode_dirty(NULL, inode); 1823 iput(inode); 1824 1825 return 0; 1826 } 1827 1828 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1829 { 1830 struct ext4_fc_replay_state *state; 1831 struct inode *inode; 1832 struct ext4_ext_path *path = NULL; 1833 struct ext4_map_blocks map; 1834 int i, ret, j; 1835 ext4_lblk_t cur, end; 1836 1837 state = &EXT4_SB(sb)->s_fc_replay_state; 1838 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1839 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1840 EXT4_IGET_NORMAL); 1841 if (IS_ERR(inode)) { 1842 jbd_debug(1, "Inode %d not found.", 1843 state->fc_modified_inodes[i]); 1844 continue; 1845 } 1846 cur = 0; 1847 end = EXT_MAX_BLOCKS; 1848 while (cur < end) { 1849 map.m_lblk = cur; 1850 map.m_len = end - cur; 1851 1852 ret = ext4_map_blocks(NULL, inode, &map, 0); 1853 if (ret < 0) 1854 break; 1855 1856 if (ret > 0) { 1857 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1858 if (!IS_ERR(path)) { 1859 for (j = 0; j < path->p_depth; j++) 1860 ext4_mb_mark_bb(inode->i_sb, 1861 path[j].p_block, 1, 1); 1862 ext4_ext_drop_refs(path); 1863 kfree(path); 1864 } 1865 cur += ret; 1866 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1867 map.m_len, 1); 1868 } else { 1869 cur = cur + (map.m_len ? map.m_len : 1); 1870 } 1871 } 1872 iput(inode); 1873 } 1874 } 1875 1876 /* 1877 * Check if block is in excluded regions for block allocation. 
The simple 1878 * allocator that runs during replay phase is calls this function to see 1879 * if it is okay to use a block. 1880 */ 1881 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1882 { 1883 int i; 1884 struct ext4_fc_replay_state *state; 1885 1886 state = &EXT4_SB(sb)->s_fc_replay_state; 1887 for (i = 0; i < state->fc_regions_valid; i++) { 1888 if (state->fc_regions[i].ino == 0 || 1889 state->fc_regions[i].len == 0) 1890 continue; 1891 if (blk >= state->fc_regions[i].pblk && 1892 blk < state->fc_regions[i].pblk + state->fc_regions[i].len) 1893 return true; 1894 } 1895 return false; 1896 } 1897 1898 /* Cleanup function called after replay */ 1899 void ext4_fc_replay_cleanup(struct super_block *sb) 1900 { 1901 struct ext4_sb_info *sbi = EXT4_SB(sb); 1902 1903 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1904 kfree(sbi->s_fc_replay_state.fc_regions); 1905 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1906 } 1907 1908 /* 1909 * Recovery Scan phase handler 1910 * 1911 * This function is called during the scan phase and is responsible 1912 * for doing following things: 1913 * - Make sure the fast commit area has valid tags for replay 1914 * - Count number of tags that need to be replayed by the replay handler 1915 * - Verify CRC 1916 * - Create a list of excluded blocks for allocation during replay phase 1917 * 1918 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 1919 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 1920 * to indicate that scan has finished and JBD2 can now start replay phase. 1921 * It returns a negative error to indicate that there was an error. At the end 1922 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 1923 * to indicate the number of tags that need to replayed during the replay phase. 
1924 */ 1925 static int ext4_fc_replay_scan(journal_t *journal, 1926 struct buffer_head *bh, int off, 1927 tid_t expected_tid) 1928 { 1929 struct super_block *sb = journal->j_private; 1930 struct ext4_sb_info *sbi = EXT4_SB(sb); 1931 struct ext4_fc_replay_state *state; 1932 int ret = JBD2_FC_REPLAY_CONTINUE; 1933 struct ext4_fc_add_range ext; 1934 struct ext4_fc_tl tl; 1935 struct ext4_fc_tail tail; 1936 __u8 *start, *end, *cur, *val; 1937 struct ext4_fc_head head; 1938 struct ext4_extent *ex; 1939 1940 state = &sbi->s_fc_replay_state; 1941 1942 start = (u8 *)bh->b_data; 1943 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 1944 1945 if (state->fc_replay_expected_off == 0) { 1946 state->fc_cur_tag = 0; 1947 state->fc_replay_num_tags = 0; 1948 state->fc_crc = 0; 1949 state->fc_regions = NULL; 1950 state->fc_regions_valid = state->fc_regions_used = 1951 state->fc_regions_size = 0; 1952 /* Check if we can stop early */ 1953 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 1954 != EXT4_FC_TAG_HEAD) 1955 return 0; 1956 } 1957 1958 if (off != state->fc_replay_expected_off) { 1959 ret = -EFSCORRUPTED; 1960 goto out_err; 1961 } 1962 1963 state->fc_replay_expected_off++; 1964 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { 1965 memcpy(&tl, cur, sizeof(tl)); 1966 val = cur + sizeof(tl); 1967 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", 1968 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); 1969 switch (le16_to_cpu(tl.fc_tag)) { 1970 case EXT4_FC_TAG_ADD_RANGE: 1971 memcpy(&ext, val, sizeof(ext)); 1972 ex = (struct ext4_extent *)&ext.fc_ex; 1973 ret = ext4_fc_record_regions(sb, 1974 le32_to_cpu(ext.fc_ino), 1975 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 1976 ext4_ext_get_actual_len(ex)); 1977 if (ret < 0) 1978 break; 1979 ret = JBD2_FC_REPLAY_CONTINUE; 1980 fallthrough; 1981 case EXT4_FC_TAG_DEL_RANGE: 1982 case EXT4_FC_TAG_LINK: 1983 case EXT4_FC_TAG_UNLINK: 1984 case EXT4_FC_TAG_CREAT: 1985 case EXT4_FC_TAG_INODE: 1986 case 
EXT4_FC_TAG_PAD: 1987 state->fc_cur_tag++; 1988 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 1989 sizeof(tl) + le16_to_cpu(tl.fc_len)); 1990 break; 1991 case EXT4_FC_TAG_TAIL: 1992 state->fc_cur_tag++; 1993 memcpy(&tail, val, sizeof(tail)); 1994 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 1995 sizeof(tl) + 1996 offsetof(struct ext4_fc_tail, 1997 fc_crc)); 1998 if (le32_to_cpu(tail.fc_tid) == expected_tid && 1999 le32_to_cpu(tail.fc_crc) == state->fc_crc) { 2000 state->fc_replay_num_tags = state->fc_cur_tag; 2001 state->fc_regions_valid = 2002 state->fc_regions_used; 2003 } else { 2004 ret = state->fc_replay_num_tags ? 2005 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2006 } 2007 state->fc_crc = 0; 2008 break; 2009 case EXT4_FC_TAG_HEAD: 2010 memcpy(&head, val, sizeof(head)); 2011 if (le32_to_cpu(head.fc_features) & 2012 ~EXT4_FC_SUPPORTED_FEATURES) { 2013 ret = -EOPNOTSUPP; 2014 break; 2015 } 2016 if (le32_to_cpu(head.fc_tid) != expected_tid) { 2017 ret = JBD2_FC_REPLAY_STOP; 2018 break; 2019 } 2020 state->fc_cur_tag++; 2021 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, 2022 sizeof(tl) + le16_to_cpu(tl.fc_len)); 2023 break; 2024 default: 2025 ret = state->fc_replay_num_tags ? 2026 JBD2_FC_REPLAY_STOP : -ECANCELED; 2027 } 2028 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2029 break; 2030 } 2031 2032 out_err: 2033 trace_ext4_fc_replay_scan(sb, ret, off); 2034 return ret; 2035 } 2036 2037 /* 2038 * Main recovery path entry point. 2039 * The meaning of return codes is similar as above. 
2040 */ 2041 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2042 enum passtype pass, int off, tid_t expected_tid) 2043 { 2044 struct super_block *sb = journal->j_private; 2045 struct ext4_sb_info *sbi = EXT4_SB(sb); 2046 struct ext4_fc_tl tl; 2047 __u8 *start, *end, *cur, *val; 2048 int ret = JBD2_FC_REPLAY_CONTINUE; 2049 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2050 struct ext4_fc_tail tail; 2051 2052 if (pass == PASS_SCAN) { 2053 state->fc_current_pass = PASS_SCAN; 2054 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2055 } 2056 2057 if (state->fc_current_pass != pass) { 2058 state->fc_current_pass = pass; 2059 sbi->s_mount_state |= EXT4_FC_REPLAY; 2060 } 2061 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2062 jbd_debug(1, "Replay stops\n"); 2063 ext4_fc_set_bitmaps_and_counters(sb); 2064 return 0; 2065 } 2066 2067 #ifdef CONFIG_EXT4_DEBUG 2068 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2069 pr_warn("Dropping fc block %d because max_replay set\n", off); 2070 return JBD2_FC_REPLAY_STOP; 2071 } 2072 #endif 2073 2074 start = (u8 *)bh->b_data; 2075 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 2076 2077 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { 2078 memcpy(&tl, cur, sizeof(tl)); 2079 val = cur + sizeof(tl); 2080 2081 if (state->fc_replay_num_tags == 0) { 2082 ret = JBD2_FC_REPLAY_STOP; 2083 ext4_fc_set_bitmaps_and_counters(sb); 2084 break; 2085 } 2086 jbd_debug(3, "Replay phase, tag:%s\n", 2087 tag2str(le16_to_cpu(tl.fc_tag))); 2088 state->fc_replay_num_tags--; 2089 switch (le16_to_cpu(tl.fc_tag)) { 2090 case EXT4_FC_TAG_LINK: 2091 ret = ext4_fc_replay_link(sb, &tl, val); 2092 break; 2093 case EXT4_FC_TAG_UNLINK: 2094 ret = ext4_fc_replay_unlink(sb, &tl, val); 2095 break; 2096 case EXT4_FC_TAG_ADD_RANGE: 2097 ret = ext4_fc_replay_add_range(sb, &tl, val); 2098 break; 2099 case EXT4_FC_TAG_CREAT: 2100 ret = ext4_fc_replay_create(sb, &tl, 
val); 2101 break; 2102 case EXT4_FC_TAG_DEL_RANGE: 2103 ret = ext4_fc_replay_del_range(sb, &tl, val); 2104 break; 2105 case EXT4_FC_TAG_INODE: 2106 ret = ext4_fc_replay_inode(sb, &tl, val); 2107 break; 2108 case EXT4_FC_TAG_PAD: 2109 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2110 le16_to_cpu(tl.fc_len), 0); 2111 break; 2112 case EXT4_FC_TAG_TAIL: 2113 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, 2114 le16_to_cpu(tl.fc_len), 0); 2115 memcpy(&tail, val, sizeof(tail)); 2116 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); 2117 break; 2118 case EXT4_FC_TAG_HEAD: 2119 break; 2120 default: 2121 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, 2122 le16_to_cpu(tl.fc_len), 0); 2123 ret = -ECANCELED; 2124 break; 2125 } 2126 if (ret < 0) 2127 break; 2128 ret = JBD2_FC_REPLAY_CONTINUE; 2129 } 2130 return ret; 2131 } 2132 2133 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2134 { 2135 /* 2136 * We set replay callback even if fast commit disabled because we may 2137 * could still have fast commit blocks that need to be replayed even if 2138 * fast commit has now been turned off. 
2139 */ 2140 journal->j_fc_replay_callback = ext4_fc_replay; 2141 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2142 return; 2143 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2144 } 2145 2146 static const char *fc_ineligible_reasons[] = { 2147 "Extended attributes changed", 2148 "Cross rename", 2149 "Journal flag changed", 2150 "Insufficient memory", 2151 "Swap boot", 2152 "Resize", 2153 "Dir renamed", 2154 "Falloc range op", 2155 "Data journalling", 2156 "FC Commit Failed" 2157 }; 2158 2159 int ext4_fc_info_show(struct seq_file *seq, void *v) 2160 { 2161 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2162 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2163 int i; 2164 2165 if (v != SEQ_START_TOKEN) 2166 return 0; 2167 2168 seq_printf(seq, 2169 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2170 stats->fc_num_commits, stats->fc_ineligible_commits, 2171 stats->fc_numblks, 2172 div_u64(sbi->s_fc_avg_commit_time, 1000)); 2173 seq_puts(seq, "Ineligible reasons:\n"); 2174 for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2175 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2176 stats->fc_ineligible_reason_count[i]); 2177 2178 return 0; 2179 } 2180 2181 int __init ext4_fc_init_dentry_cache(void) 2182 { 2183 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2184 SLAB_RECLAIM_ACCOUNT); 2185 2186 if (ext4_fc_dentry_cachep == NULL) 2187 return -ENOMEM; 2188 2189 return 0; 2190 } 2191