// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
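 *
 * For orientation, here is a sketch of that TLV header; see
 * fs/ext4/fast_commit.h for the authoritative definition:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	(one of the EXT4_FC_TAG_* values below)
 *		__le16 fc_len;	(length of the value that follows)
 *	};
 *
 * so every record in the fast commit area is laid out on disk as
 * [fc_tag][fc_len][value bytes].
 *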
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2      --->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how
 * fast commits make it idempotent. Consider the following sequence of
 * operations:
 *
 *     rm A;    mv B A;   read A
 *      (x)      (y)       (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *        (w)               (x)               (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then try recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on the
 * inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
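
/*
 * Illustrative pairing of the call above with ext4_fc_stop_update() below,
 * as described in the header comment (a sketch only; the real callers are
 * the high-level VFS entry points into ext4):
 *
 *	ext4_fc_start_update(inode);	(blocks while the inode is being
 *					 committed by a fast commit)
 *	... perform the inode update ...
 *	ext4_fc_stop_update(inode);	(wakes up a waiting fast commit,
 *					 if any)
 */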

/*
 * Stop an inode update and wake up waiting fast commits, if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize the fast commit fields and then
 * call __fc_track_fn() with update = 0. If we have already been called after
 * a full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit
 * list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
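
/*
 * A minimal sketch of the __fc_track_fn shape that
 * ext4_fc_track_template() drives (the names here are hypothetical;
 * __track_inode() below is the simplest real instance):
 *
 *	static int __track_foo(struct inode *inode, void *arg, bool update)
 *	{
 *		return update ? refresh_tracked_foo(inode, arg)
 *			      : init_tracked_foo(inode, arg);
 *	}
 *
 *	ret = ext4_fc_track_template(handle, inode, __track_foo, &args, 1);
 */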

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
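
/*
 * Worked example for the merge above (illustrative numbers): if blocks
 * 10-14 are already tracked (start 10, len 5) and a new range 2-12
 * arrives, the tracked window becomes start = min(10, 2) = 2 and
 * len = max(14, 12) - 2 + 1 = 13, i.e. blocks 2-14.
 */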
615 */ 616 if (len + sizeof(struct ext4_fc_tl) > bsize) 617 return NULL; 618 619 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { 620 /* 621 * Only allocate from current buffer if we have enough space for 622 * this request AND we have space to add a zero byte padding. 623 */ 624 if (!sbi->s_fc_bh) { 625 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 626 if (ret) 627 return NULL; 628 sbi->s_fc_bh = bh; 629 } 630 sbi->s_fc_bytes += len; 631 return sbi->s_fc_bh->b_data + off; 632 } 633 /* Need to add PAD tag */ 634 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); 635 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 636 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); 637 tl->fc_len = cpu_to_le16(pad_len); 638 if (crc) 639 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); 640 if (pad_len > 0) 641 ext4_fc_memzero(sb, tl + 1, pad_len, crc); 642 ext4_fc_submit_bh(sb, false); 643 644 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 645 if (ret) 646 return NULL; 647 sbi->s_fc_bh = bh; 648 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; 649 return sbi->s_fc_bh->b_data; 650 } 651 652 /* memcpy to fc reserved space and update CRC */ 653 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, 654 int len, u32 *crc) 655 { 656 if (crc) 657 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); 658 return memcpy(dst, src, len); 659 } 660 661 /* 662 * Complete a fast commit by writing tail tag. 663 * 664 * Writing tail tag marks the end of a fast commit. In order to guarantee 665 * atomicity, after writing tail tag, even if there's space remaining 666 * in the block, next commit shouldn't use it. That's why tail tag 667 * has the length as that of the remaining space on the block. 668 */ 669 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 670 { 671 struct ext4_sb_info *sbi = EXT4_SB(sb); 672 struct ext4_fc_tl tl; 673 struct ext4_fc_tail tail; 674 int off, bsize = sbi->s_journal->j_blocksize; 675 u8 *dst; 676 677 /* 678 * ext4_fc_reserve_space takes care of allocating an extra block if 679 * there's no enough space on this block for accommodating this tail. 680 */ 681 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 682 if (!dst) 683 return -ENOSPC; 684 685 off = sbi->s_fc_bytes % bsize; 686 687 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 688 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 689 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 690 691 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 692 dst += sizeof(tl); 693 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 694 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 695 dst += sizeof(tail.fc_tid); 696 tail.fc_crc = cpu_to_le32(crc); 697 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 698 699 ext4_fc_submit_bh(sb, true); 700 701 return 0; 702 } 703 704 /* 705 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 706 * Returns false if there's not enough space. 707 */ 708 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 709 u32 *crc) 710 { 711 struct ext4_fc_tl tl; 712 u8 *dst; 713 714 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 715 if (!dst) 716 return false; 717 718 tl.fc_tag = cpu_to_le16(tag); 719 tl.fc_len = cpu_to_le16(len); 720 721 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 722 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 723 724 return true; 725 } 726 727 /* Same as above, but adds dentry tlv. 

/*
 * Adds tag, length, value and updates CRC. Returns true if the tlv was
 * added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}
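
/*
 * Resulting dentry record layout, as a sketch:
 *
 *	[fc_tag = fcd_op][fc_len = sizeof(fcd) + dlen]
 *	[fc_parent_ino][fc_ino][name bytes, not NUL-terminated]
 *
 * which is what tl_to_darg() in the replay path below parses back out.
 */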
801 */ 802 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 803 { 804 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 805 struct ext4_inode_info *ei = EXT4_I(inode); 806 struct ext4_map_blocks map; 807 struct ext4_fc_add_range fc_ext; 808 struct ext4_fc_del_range lrange; 809 struct ext4_extent *ex; 810 int ret; 811 812 mutex_lock(&ei->i_fc_lock); 813 if (ei->i_fc_lblk_len == 0) { 814 mutex_unlock(&ei->i_fc_lock); 815 return 0; 816 } 817 old_blk_size = ei->i_fc_lblk_start; 818 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 819 ei->i_fc_lblk_len = 0; 820 mutex_unlock(&ei->i_fc_lock); 821 822 cur_lblk_off = old_blk_size; 823 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 824 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 825 826 while (cur_lblk_off <= new_blk_size) { 827 map.m_lblk = cur_lblk_off; 828 map.m_len = new_blk_size - cur_lblk_off + 1; 829 ret = ext4_map_blocks(NULL, inode, &map, 0); 830 if (ret < 0) 831 return -ECANCELED; 832 833 if (map.m_len == 0) { 834 cur_lblk_off++; 835 continue; 836 } 837 838 if (ret == 0) { 839 lrange.fc_ino = cpu_to_le32(inode->i_ino); 840 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 841 lrange.fc_len = cpu_to_le32(map.m_len); 842 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 843 sizeof(lrange), (u8 *)&lrange, crc)) 844 return -ENOSPC; 845 } else { 846 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 847 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 848 849 /* Limit the number of blocks in one extent */ 850 map.m_len = min(max, map.m_len); 851 852 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 853 ex = (struct ext4_extent *)&fc_ext.fc_ex; 854 ex->ee_block = cpu_to_le32(map.m_lblk); 855 ex->ee_len = cpu_to_le16(map.m_len); 856 ext4_ext_store_pblock(ex, map.m_pblk); 857 if (map.m_flags & EXT4_MAP_UNWRITTEN) 858 ext4_ext_mark_unwritten(ex); 859 else 860 ext4_ext_mark_initialized(ex); 861 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 862 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 863 return -ENOSPC; 864 } 865 866 cur_lblk_off += map.m_len; 867 } 868 869 return 0; 870 } 871 872 873 /* Submit data for all the fast commit inodes */ 874 static int ext4_fc_submit_inode_data_all(journal_t *journal) 875 { 876 struct super_block *sb = (struct super_block *)(journal->j_private); 877 struct ext4_sb_info *sbi = EXT4_SB(sb); 878 struct ext4_inode_info *ei; 879 int ret = 0; 880 881 spin_lock(&sbi->s_fc_lock); 882 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); 883 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 884 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 885 while (atomic_read(&ei->i_fc_updates)) { 886 DEFINE_WAIT(wait); 887 888 prepare_to_wait(&ei->i_fc_wait, &wait, 889 TASK_UNINTERRUPTIBLE); 890 if (atomic_read(&ei->i_fc_updates)) { 891 spin_unlock(&sbi->s_fc_lock); 892 schedule(); 893 spin_lock(&sbi->s_fc_lock); 894 } 895 finish_wait(&ei->i_fc_wait, &wait); 896 } 897 spin_unlock(&sbi->s_fc_lock); 898 ret = jbd2_submit_inode_data(ei->jinode); 899 if (ret) 900 return ret; 901 spin_lock(&sbi->s_fc_lock); 902 } 903 spin_unlock(&sbi->s_fc_lock); 904 905 return ret; 906 } 907 908 /* Wait for completion of data for all the fast commit inodes */ 909 static int ext4_fc_wait_inode_data_all(journal_t *journal) 910 { 911 struct super_block *sb = (struct super_block *)(journal->j_private); 912 struct ext4_sb_info *sbi = EXT4_SB(sb); 913 struct ext4_inode_info *pos, *n; 914 int ret = 0; 915 916 spin_lock(&sbi->s_fc_lock); 917 list_for_each_entry_safe(pos, n, 

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
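		 * (The head carries the supported-features mask and the
		 * running transaction's TID, per struct ext4_fc_head; the
		 * scan phase during recovery checks both before replaying.)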
1034 */ 1035 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1036 head.fc_tid = cpu_to_le32( 1037 sbi->s_journal->j_running_transaction->t_tid); 1038 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1039 (u8 *)&head, &crc)) { 1040 ret = -ENOSPC; 1041 goto out; 1042 } 1043 } 1044 1045 spin_lock(&sbi->s_fc_lock); 1046 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1047 if (ret) { 1048 spin_unlock(&sbi->s_fc_lock); 1049 goto out; 1050 } 1051 1052 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1053 inode = &iter->vfs_inode; 1054 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1055 continue; 1056 1057 spin_unlock(&sbi->s_fc_lock); 1058 ret = ext4_fc_write_inode_data(inode, &crc); 1059 if (ret) 1060 goto out; 1061 ret = ext4_fc_write_inode(inode, &crc); 1062 if (ret) 1063 goto out; 1064 spin_lock(&sbi->s_fc_lock); 1065 } 1066 spin_unlock(&sbi->s_fc_lock); 1067 1068 ret = ext4_fc_write_tail(sb, crc); 1069 1070 out: 1071 blk_finish_plug(&plug); 1072 return ret; 1073 } 1074 1075 static void ext4_fc_update_stats(struct super_block *sb, int status, 1076 u64 commit_time, int nblks) 1077 { 1078 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1079 1080 jbd_debug(1, "Fast commit ended with status = %d", status); 1081 if (status == EXT4_FC_STATUS_OK) { 1082 stats->fc_num_commits++; 1083 stats->fc_numblks += nblks; 1084 if (likely(stats->s_fc_avg_commit_time)) 1085 stats->s_fc_avg_commit_time = 1086 (commit_time + 1087 stats->s_fc_avg_commit_time * 3) / 4; 1088 else 1089 stats->s_fc_avg_commit_time = commit_time; 1090 } else if (status == EXT4_FC_STATUS_FAILED || 1091 status == EXT4_FC_STATUS_INELIGIBLE) { 1092 if (status == EXT4_FC_STATUS_FAILED) 1093 stats->fc_failed_commits++; 1094 stats->fc_ineligible_commits++; 1095 } else { 1096 stats->fc_skipped_commits++; 1097 } 1098 trace_ext4_fc_commit_stop(sb, nblks, status); 1099 } 1100 1101 /* 1102 * The main commit entry point. Performs a fast commit for transaction 1103 * commit_tid if needed. If it's not possible to perform a fast commit 1104 * due to various reasons, we fall back to full commit. Returns 0 1105 * on success, error otherwise. 1106 */ 1107 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1108 { 1109 struct super_block *sb = (struct super_block *)(journal->j_private); 1110 struct ext4_sb_info *sbi = EXT4_SB(sb); 1111 int nblks = 0, ret, bsize = journal->j_blocksize; 1112 int subtid = atomic_read(&sbi->s_fc_subtid); 1113 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1114 ktime_t start_time, commit_time; 1115 1116 trace_ext4_fc_commit_start(sb); 1117 1118 start_time = ktime_get(); 1119 1120 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1121 return jbd2_complete_transaction(journal, commit_tid); 1122 1123 restart_fc: 1124 ret = jbd2_fc_begin_commit(journal, commit_tid); 1125 if (ret == -EALREADY) { 1126 /* There was an ongoing commit, check if we need to restart */ 1127 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1128 commit_tid > journal->j_commit_sequence) 1129 goto restart_fc; 1130 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0); 1131 return 0; 1132 } else if (ret) { 1133 /* 1134 * Commit couldn't start. Just update stats and perform a 1135 * full commit. 1136 */ 1137 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0); 1138 return jbd2_complete_transaction(journal, commit_tid); 1139 } 1140 1141 /* 1142 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1143 * if we are fast commit ineligible. 
1144 */ 1145 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1146 status = EXT4_FC_STATUS_INELIGIBLE; 1147 goto fallback; 1148 } 1149 1150 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1151 ret = ext4_fc_perform_commit(journal); 1152 if (ret < 0) { 1153 status = EXT4_FC_STATUS_FAILED; 1154 goto fallback; 1155 } 1156 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1157 ret = jbd2_fc_wait_bufs(journal, nblks); 1158 if (ret < 0) { 1159 status = EXT4_FC_STATUS_FAILED; 1160 goto fallback; 1161 } 1162 atomic_inc(&sbi->s_fc_subtid); 1163 ret = jbd2_fc_end_commit(journal); 1164 /* 1165 * weight the commit time higher than the average time so we 1166 * don't react too strongly to vast changes in the commit time 1167 */ 1168 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1169 ext4_fc_update_stats(sb, status, commit_time, nblks); 1170 return ret; 1171 1172 fallback: 1173 ret = jbd2_fc_end_commit_fallback(journal); 1174 ext4_fc_update_stats(sb, status, 0, 0); 1175 return ret; 1176 } 1177 1178 /* 1179 * Fast commit cleanup routine. This is called after every fast commit and 1180 * full commit. full is true if we are called after a full commit. 1181 */ 1182 static void ext4_fc_cleanup(journal_t *journal, int full) 1183 { 1184 struct super_block *sb = journal->j_private; 1185 struct ext4_sb_info *sbi = EXT4_SB(sb); 1186 struct ext4_inode_info *iter, *iter_n; 1187 struct ext4_fc_dentry_update *fc_dentry; 1188 1189 if (full && sbi->s_fc_bh) 1190 sbi->s_fc_bh = NULL; 1191 1192 jbd2_fc_release_bufs(journal); 1193 1194 spin_lock(&sbi->s_fc_lock); 1195 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], 1196 i_fc_list) { 1197 list_del_init(&iter->i_fc_list); 1198 ext4_clear_inode_state(&iter->vfs_inode, 1199 EXT4_STATE_FC_COMMITTING); 1200 ext4_fc_reset_inode(&iter->vfs_inode); 1201 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1202 smp_mb(); 1203 #if (BITS_PER_LONG < 64) 1204 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1205 #else 1206 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1207 #endif 1208 } 1209 1210 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1211 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1212 struct ext4_fc_dentry_update, 1213 fcd_list); 1214 list_del_init(&fc_dentry->fcd_list); 1215 spin_unlock(&sbi->s_fc_lock); 1216 1217 if (fc_dentry->fcd_name.name && 1218 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1219 kfree(fc_dentry->fcd_name.name); 1220 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1221 spin_lock(&sbi->s_fc_lock); 1222 } 1223 1224 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1225 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1226 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1227 &sbi->s_fc_q[FC_Q_MAIN]); 1228 1229 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 1230 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1231 1232 if (full) 1233 sbi->s_fc_bytes = 0; 1234 spin_unlock(&sbi->s_fc_lock); 1235 trace_ext4_fc_stats(sb); 1236 } 1237 1238 /* Ext4 Replay Path Routines */ 1239 1240 /* Helper struct for dentry replay routines */ 1241 struct dentry_info_args { 1242 int parent_ino, dname_len, ino, inode_len; 1243 char *dname; 1244 }; 1245 1246 static inline void tl_to_darg(struct dentry_info_args *darg, 1247 struct ext4_fc_tl *tl, u8 *val) 1248 { 1249 struct ext4_fc_dentry_info fcd; 1250 1251 memcpy(&fcd, val, sizeof(fcd)); 1252 1253 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1254 darg->ino = le32_to_cpu(fcd.fc_ino); 1255 darg->dname = val + offsetof(struct 

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}
1384 */ 1385 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1386 { 1387 struct ext4_fc_replay_state *state; 1388 int i; 1389 1390 state = &EXT4_SB(sb)->s_fc_replay_state; 1391 for (i = 0; i < state->fc_modified_inodes_used; i++) 1392 if (state->fc_modified_inodes[i] == ino) 1393 return 0; 1394 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1395 state->fc_modified_inodes_size += 1396 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1397 state->fc_modified_inodes = krealloc( 1398 state->fc_modified_inodes, sizeof(int) * 1399 state->fc_modified_inodes_size, 1400 GFP_KERNEL); 1401 if (!state->fc_modified_inodes) 1402 return -ENOMEM; 1403 } 1404 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1405 return 0; 1406 } 1407 1408 /* 1409 * Inode replay function 1410 */ 1411 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, 1412 u8 *val) 1413 { 1414 struct ext4_fc_inode fc_inode; 1415 struct ext4_inode *raw_inode; 1416 struct ext4_inode *raw_fc_inode; 1417 struct inode *inode = NULL; 1418 struct ext4_iloc iloc; 1419 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1420 struct ext4_extent_header *eh; 1421 1422 memcpy(&fc_inode, val, sizeof(fc_inode)); 1423 1424 ino = le32_to_cpu(fc_inode.fc_ino); 1425 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1426 1427 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1428 if (!IS_ERR(inode)) { 1429 ext4_ext_clear_bb(inode); 1430 iput(inode); 1431 } 1432 inode = NULL; 1433 1434 ext4_fc_record_modified_inode(sb, ino); 1435 1436 raw_fc_inode = (struct ext4_inode *) 1437 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1438 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1439 if (ret) 1440 goto out; 1441 1442 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); 1443 raw_inode = ext4_raw_inode(&iloc); 1444 1445 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1446 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, 1447 inode_len - offsetof(struct ext4_inode, i_generation)); 1448 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1449 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1450 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1451 memset(eh, 0, sizeof(*eh)); 1452 eh->eh_magic = EXT4_EXT_MAGIC; 1453 eh->eh_max = cpu_to_le16( 1454 (sizeof(raw_inode->i_block) - 1455 sizeof(struct ext4_extent_header)) 1456 / sizeof(struct ext4_extent)); 1457 } 1458 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1459 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1460 sizeof(raw_inode->i_block)); 1461 } 1462 1463 /* Immediately update the inode on disk. */ 1464 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1465 if (ret) 1466 goto out; 1467 ret = sync_dirty_buffer(iloc.bh); 1468 if (ret) 1469 goto out; 1470 ret = ext4_mark_inode_used(sb, ino); 1471 if (ret) 1472 goto out; 1473 1474 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1475 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1476 if (IS_ERR(inode)) { 1477 jbd_debug(1, "Inode not found."); 1478 return -EFSCORRUPTED; 1479 } 1480 1481 /* 1482 * Our allocator could have made different decisions than before 1483 * crashing. This should be fixed but until then, we calculate 1484 * the number of blocks the inode. 
1485 */ 1486 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1487 ext4_ext_replay_set_iblocks(inode); 1488 1489 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1490 ext4_reset_inode_seed(inode); 1491 1492 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1493 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1494 sync_dirty_buffer(iloc.bh); 1495 brelse(iloc.bh); 1496 out: 1497 iput(inode); 1498 if (!ret) 1499 blkdev_issue_flush(sb->s_bdev); 1500 1501 return 0; 1502 } 1503 1504 /* 1505 * Dentry create replay function. 1506 * 1507 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 1508 * inode for which we are trying to create a dentry here, should already have 1509 * been replayed before we start here. 1510 */ 1511 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, 1512 u8 *val) 1513 { 1514 int ret = 0; 1515 struct inode *inode = NULL; 1516 struct inode *dir = NULL; 1517 struct dentry_info_args darg; 1518 1519 tl_to_darg(&darg, tl, val); 1520 1521 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1522 darg.parent_ino, darg.dname_len); 1523 1524 /* This takes care of update group descriptor and other metadata */ 1525 ret = ext4_mark_inode_used(sb, darg.ino); 1526 if (ret) 1527 goto out; 1528 1529 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1530 if (IS_ERR(inode)) { 1531 jbd_debug(1, "inode %d not found.", darg.ino); 1532 inode = NULL; 1533 ret = -EINVAL; 1534 goto out; 1535 } 1536 1537 if (S_ISDIR(inode->i_mode)) { 1538 /* 1539 * If we are creating a directory, we need to make sure that the 1540 * dot and dot dot dirents are setup properly. 1541 */ 1542 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1543 if (IS_ERR(dir)) { 1544 jbd_debug(1, "Dir %d not found.", darg.ino); 1545 goto out; 1546 } 1547 ret = ext4_init_new_dir(NULL, dir, inode); 1548 iput(dir); 1549 if (ret) { 1550 ret = 0; 1551 goto out; 1552 } 1553 } 1554 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1555 if (ret) 1556 goto out; 1557 set_nlink(inode, 1); 1558 ext4_mark_inode_dirty(NULL, inode); 1559 out: 1560 if (inode) 1561 iput(inode); 1562 return ret; 1563 } 1564 1565 /* 1566 * Record physical disk regions which are in use as per fast commit area. Our 1567 * simple replay phase allocator excludes these regions from allocation. 
1568 */ 1569 static int ext4_fc_record_regions(struct super_block *sb, int ino, 1570 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1571 { 1572 struct ext4_fc_replay_state *state; 1573 struct ext4_fc_alloc_region *region; 1574 1575 state = &EXT4_SB(sb)->s_fc_replay_state; 1576 if (state->fc_regions_used == state->fc_regions_size) { 1577 state->fc_regions_size += 1578 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1579 state->fc_regions = krealloc( 1580 state->fc_regions, 1581 state->fc_regions_size * 1582 sizeof(struct ext4_fc_alloc_region), 1583 GFP_KERNEL); 1584 if (!state->fc_regions) 1585 return -ENOMEM; 1586 } 1587 region = &state->fc_regions[state->fc_regions_used++]; 1588 region->ino = ino; 1589 region->lblk = lblk; 1590 region->pblk = pblk; 1591 region->len = len; 1592 1593 return 0; 1594 } 1595 1596 /* Replay add range tag */ 1597 static int ext4_fc_replay_add_range(struct super_block *sb, 1598 struct ext4_fc_tl *tl, u8 *val) 1599 { 1600 struct ext4_fc_add_range fc_add_ex; 1601 struct ext4_extent newex, *ex; 1602 struct inode *inode; 1603 ext4_lblk_t start, cur; 1604 int remaining, len; 1605 ext4_fsblk_t start_pblk; 1606 struct ext4_map_blocks map; 1607 struct ext4_ext_path *path = NULL; 1608 int ret; 1609 1610 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1611 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1612 1613 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1614 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1615 ext4_ext_get_actual_len(ex)); 1616 1617 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1618 if (IS_ERR(inode)) { 1619 jbd_debug(1, "Inode not found."); 1620 return 0; 1621 } 1622 1623 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1624 1625 start = le32_to_cpu(ex->ee_block); 1626 start_pblk = ext4_ext_pblock(ex); 1627 len = ext4_ext_get_actual_len(ex); 1628 1629 cur = start; 1630 remaining = len; 1631 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1632 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1633 inode->i_ino); 1634 1635 while (remaining > 0) { 1636 map.m_lblk = cur; 1637 map.m_len = remaining; 1638 map.m_pblk = 0; 1639 ret = ext4_map_blocks(NULL, inode, &map, 0); 1640 1641 if (ret < 0) { 1642 iput(inode); 1643 return 0; 1644 } 1645 1646 if (ret == 0) { 1647 /* Range is not mapped */ 1648 path = ext4_find_extent(inode, cur, NULL, 0); 1649 if (IS_ERR(path)) { 1650 iput(inode); 1651 return 0; 1652 } 1653 memset(&newex, 0, sizeof(newex)); 1654 newex.ee_block = cpu_to_le32(cur); 1655 ext4_ext_store_pblock( 1656 &newex, start_pblk + cur - start); 1657 newex.ee_len = cpu_to_le16(map.m_len); 1658 if (ext4_ext_is_unwritten(ex)) 1659 ext4_ext_mark_unwritten(&newex); 1660 down_write(&EXT4_I(inode)->i_data_sem); 1661 ret = ext4_ext_insert_extent( 1662 NULL, inode, &path, &newex, 0); 1663 up_write((&EXT4_I(inode)->i_data_sem)); 1664 ext4_ext_drop_refs(path); 1665 kfree(path); 1666 if (ret) { 1667 iput(inode); 1668 return 0; 1669 } 1670 goto next; 1671 } 1672 1673 if (start_pblk + cur - start != map.m_pblk) { 1674 /* 1675 * Logical to physical mapping changed. This can happen 1676 * if this range was removed and then reallocated to 1677 * map to new physical blocks during a fast commit. 1678 */ 1679 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1680 ext4_ext_is_unwritten(ex), 1681 start_pblk + cur - start); 1682 if (ret) { 1683 iput(inode); 1684 return 0; 1685 } 1686 /* 1687 * Mark the old blocks as free since they aren't used 1688 * anymore. We maintain an array of all the modified 1689 * inodes. 
			 * inodes. In case these blocks are still used at
			 * either a different logical range in the same inode
			 * or in some different inode, we will mark them as
			 * allocated at the end of the FC replay using our
			 * array of modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
			  map.m_flags & EXT4_MAP_UNWRITTEN,
			  ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
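
/*
 * For orientation, the three cases handled per mapped chunk above are:
 * an unmapped range is re-inserted with the logged physical blocks; a
 * range now mapped to different physical blocks is repointed at the
 * logged ones and the stale blocks are freed; and a range mapped to the
 * same physical blocks only has its written/unwritten state synchronized
 * with the logged extent.
 */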

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
		  inode->i_ino, le32_to_cpu(lrange.fc_lblk),
		  le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				    le32_to_cpu(lrange.fc_lblk) +
				    le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret) {
		iput(inode);
		return 0;
	}
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				  state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk,
							NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end;
	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */

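/*
 * Note: JBD2 drives this callback over the fast commit area twice, roughly:
 *
 *	for each fc block: ->j_fc_replay_callback(journal, bh, PASS_SCAN, ...);
 *	for each fc block: ->j_fc_replay_callback(journal, bh, PASS_REPLAY, ...);
 *
 * The scan pass validates tags and CRCs and counts the replayable tags; the
 * replay pass then applies at most fc_replay_num_tags tags and stops.
 */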
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end;
	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled, because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

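/*
 * Report fast commit stats via seq_file (e.g. through ext4's procfs
 * interface). For illustration, with the formats used below, the output
 * reads roughly like this (the values here are made up):
 *
 *	fc stats:
 *	120 commits
 *	3 ineligible
 *	240 numblks
 *	512us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	...
 */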
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}