// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at a much higher level
 *    than where the updates actually happen. This can be made more fine
 *    grained by combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;
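/*
 * Illustrative sketch of the TLV log layout described above (a hypothetical
 * reader, not a helper used by this file): a fast commit block is a packed
 * sequence of struct ext4_fc_tl headers, each followed by fc_len bytes of
 * value, so it can be walked with nothing more than the header. handle_tag()
 * below is a made-up callback; the real replay path does the equivalent walk
 * with fc_for_each_tl().
 *
 *	u8 *cur = block, *end = block + blocksize - 1;
 *
 *	while (cur + sizeof(struct ext4_fc_tl) < end) {
 *		struct ext4_fc_tl *tl = (struct ext4_fc_tl *)cur;
 *
 *		handle_tag(le16_to_cpu(tl->fc_tag), (u8 *)(tl + 1),
 *			   le16_to_cpu(tl->fc_len));
 *		cur = (u8 *)(tl + 1) + le16_to_cpu(tl->fc_len);
 *	}
 */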
static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}
/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
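/*
 * Minimal usage sketch for ext4_fc_track_template() (a hypothetical tracker,
 * not a caller that exists in this file): the "update" argument tells a
 * __fc_track_fn() whether an earlier call already tracked state in this
 * transaction; ei->i_fc_lock is held around the callback.
 *
 *	static int __track_foo(struct inode *inode, void *arg, bool update)
 *	{
 *		if (update)
 *			return 0;	// already tracked in this tid
 *		// record first-time state here
 *		return 0;
 *	}
 *
 *	ret = ext4_fc_track_template(handle, inode, __track_foo, NULL, 1);
 *
 * The real instances are __track_dentry_update(), __track_inode() and
 * __track_range() below.
 */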
struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
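/*
 * Worked example of the merge above: if [10, 19] was tracked earlier in this
 * transaction (i_fc_lblk_start = 10, i_fc_lblk_len = 10) and an update now
 * touches [5, 12], the tracked interval becomes min(10, 5) = 5 through
 * max(19, 12) = 19, i.e. i_fc_lblk_start = 5 and i_fc_lblk_len = 15. The
 * tracked range is therefore always one interval covering every block
 * modified in the current tid.
 */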
void ext4_fc_track_range(handle_t *handle, struct inode *inode,
			 ext4_lblk_t start, ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
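/*
 * Example of the padding math above, assuming a 4096 byte journal block and
 * the 4 byte struct ext4_fc_tl header (two __le16 fields): with off = 4080
 * and len = 24, only 4096 - 4080 - 1 = 15 bytes remain, which cannot hold
 * the request plus a zero byte pad. So a PAD tlv with
 * fc_len = 4096 - 4080 - 1 - 4 = 11 is written at offset 4080, the block is
 * submitted, and the 24 bytes are served from offset 0 of a fresh block,
 * leaving s_fc_bytes at the next block boundary plus 24.
 */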
607 */ 608 if (!sbi->s_fc_bh) { 609 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 610 if (ret) 611 return NULL; 612 sbi->s_fc_bh = bh; 613 } 614 sbi->s_fc_bytes += len; 615 return sbi->s_fc_bh->b_data + off; 616 } 617 /* Need to add PAD tag */ 618 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); 619 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 620 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); 621 tl->fc_len = cpu_to_le16(pad_len); 622 if (crc) 623 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); 624 if (pad_len > 0) 625 ext4_fc_memzero(sb, tl + 1, pad_len, crc); 626 ext4_fc_submit_bh(sb); 627 628 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 629 if (ret) 630 return NULL; 631 sbi->s_fc_bh = bh; 632 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; 633 return sbi->s_fc_bh->b_data; 634 } 635 636 /* memcpy to fc reserved space and update CRC */ 637 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, 638 int len, u32 *crc) 639 { 640 if (crc) 641 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); 642 return memcpy(dst, src, len); 643 } 644 645 /* 646 * Complete a fast commit by writing tail tag. 647 * 648 * Writing tail tag marks the end of a fast commit. In order to guarantee 649 * atomicity, after writing tail tag, even if there's space remaining 650 * in the block, next commit shouldn't use it. That's why tail tag 651 * has the length as that of the remaining space on the block. 652 */ 653 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 654 { 655 struct ext4_sb_info *sbi = EXT4_SB(sb); 656 struct ext4_fc_tl tl; 657 struct ext4_fc_tail tail; 658 int off, bsize = sbi->s_journal->j_blocksize; 659 u8 *dst; 660 661 /* 662 * ext4_fc_reserve_space takes care of allocating an extra block if 663 * there's no enough space on this block for accommodating this tail. 664 */ 665 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 666 if (!dst) 667 return -ENOSPC; 668 669 off = sbi->s_fc_bytes % bsize; 670 671 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 672 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 673 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 674 675 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 676 dst += sizeof(tl); 677 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 678 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 679 dst += sizeof(tail.fc_tid); 680 tail.fc_crc = cpu_to_le32(crc); 681 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 682 683 ext4_fc_submit_bh(sb); 684 685 return 0; 686 } 687 688 /* 689 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 690 * Returns false if there's not enough space. 691 */ 692 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 693 u32 *crc) 694 { 695 struct ext4_fc_tl tl; 696 u8 *dst; 697 698 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 699 if (!dst) 700 return false; 701 702 tl.fc_tag = cpu_to_le16(tag); 703 tl.fc_len = cpu_to_le16(len); 704 705 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 706 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 707 708 return true; 709 } 710 711 /* Same as above, but adds dentry tlv. 
/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
785 */ 786 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 787 { 788 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 789 struct ext4_inode_info *ei = EXT4_I(inode); 790 struct ext4_map_blocks map; 791 struct ext4_fc_add_range fc_ext; 792 struct ext4_fc_del_range lrange; 793 struct ext4_extent *ex; 794 int ret; 795 796 mutex_lock(&ei->i_fc_lock); 797 if (ei->i_fc_lblk_len == 0) { 798 mutex_unlock(&ei->i_fc_lock); 799 return 0; 800 } 801 old_blk_size = ei->i_fc_lblk_start; 802 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 803 ei->i_fc_lblk_len = 0; 804 mutex_unlock(&ei->i_fc_lock); 805 806 cur_lblk_off = old_blk_size; 807 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 808 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 809 810 while (cur_lblk_off <= new_blk_size) { 811 map.m_lblk = cur_lblk_off; 812 map.m_len = new_blk_size - cur_lblk_off + 1; 813 ret = ext4_map_blocks(NULL, inode, &map, 0); 814 if (ret < 0) 815 return -ECANCELED; 816 817 if (map.m_len == 0) { 818 cur_lblk_off++; 819 continue; 820 } 821 822 if (ret == 0) { 823 lrange.fc_ino = cpu_to_le32(inode->i_ino); 824 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 825 lrange.fc_len = cpu_to_le32(map.m_len); 826 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 827 sizeof(lrange), (u8 *)&lrange, crc)) 828 return -ENOSPC; 829 } else { 830 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 831 ex = (struct ext4_extent *)&fc_ext.fc_ex; 832 ex->ee_block = cpu_to_le32(map.m_lblk); 833 ex->ee_len = cpu_to_le16(map.m_len); 834 ext4_ext_store_pblock(ex, map.m_pblk); 835 if (map.m_flags & EXT4_MAP_UNWRITTEN) 836 ext4_ext_mark_unwritten(ex); 837 else 838 ext4_ext_mark_initialized(ex); 839 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 840 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 841 return -ENOSPC; 842 } 843 844 cur_lblk_off += map.m_len; 845 } 846 847 return 0; 848 } 849 850 851 /* Submit data for all the fast commit inodes */ 852 static int ext4_fc_submit_inode_data_all(journal_t *journal) 853 { 854 struct super_block *sb = (struct super_block *)(journal->j_private); 855 struct ext4_sb_info *sbi = EXT4_SB(sb); 856 struct ext4_inode_info *ei; 857 struct list_head *pos; 858 int ret = 0; 859 860 spin_lock(&sbi->s_fc_lock); 861 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); 862 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 863 ei = list_entry(pos, struct ext4_inode_info, i_fc_list); 864 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 865 while (atomic_read(&ei->i_fc_updates)) { 866 DEFINE_WAIT(wait); 867 868 prepare_to_wait(&ei->i_fc_wait, &wait, 869 TASK_UNINTERRUPTIBLE); 870 if (atomic_read(&ei->i_fc_updates)) { 871 spin_unlock(&sbi->s_fc_lock); 872 schedule(); 873 spin_lock(&sbi->s_fc_lock); 874 } 875 finish_wait(&ei->i_fc_wait, &wait); 876 } 877 spin_unlock(&sbi->s_fc_lock); 878 ret = jbd2_submit_inode_data(ei->jinode); 879 if (ret) 880 return ret; 881 spin_lock(&sbi->s_fc_lock); 882 } 883 spin_unlock(&sbi->s_fc_lock); 884 885 return ret; 886 } 887 888 /* Wait for completion of data for all the fast commit inodes */ 889 static int ext4_fc_wait_inode_data_all(journal_t *journal) 890 { 891 struct super_block *sb = (struct super_block *)(journal->j_private); 892 struct ext4_sb_info *sbi = EXT4_SB(sb); 893 struct ext4_inode_info *pos, *n; 894 int ret = 0; 895 896 spin_lock(&sbi->s_fc_lock); 897 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 898 if (!ext4_test_inode_state(&pos->vfs_inode, 899 EXT4_STATE_FC_COMMITTING)) 900 continue; 901 
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
1025 */ 1026 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1027 head.fc_tid = cpu_to_le32( 1028 sbi->s_journal->j_running_transaction->t_tid); 1029 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1030 (u8 *)&head, &crc)) 1031 goto out; 1032 } 1033 1034 spin_lock(&sbi->s_fc_lock); 1035 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1036 if (ret) { 1037 spin_unlock(&sbi->s_fc_lock); 1038 goto out; 1039 } 1040 1041 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 1042 iter = list_entry(pos, struct ext4_inode_info, i_fc_list); 1043 inode = &iter->vfs_inode; 1044 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1045 continue; 1046 1047 spin_unlock(&sbi->s_fc_lock); 1048 ret = ext4_fc_write_inode_data(inode, &crc); 1049 if (ret) 1050 goto out; 1051 ret = ext4_fc_write_inode(inode, &crc); 1052 if (ret) 1053 goto out; 1054 spin_lock(&sbi->s_fc_lock); 1055 } 1056 spin_unlock(&sbi->s_fc_lock); 1057 1058 ret = ext4_fc_write_tail(sb, crc); 1059 1060 out: 1061 blk_finish_plug(&plug); 1062 return ret; 1063 } 1064 1065 /* 1066 * The main commit entry point. Performs a fast commit for transaction 1067 * commit_tid if needed. If it's not possible to perform a fast commit 1068 * due to various reasons, we fall back to full commit. Returns 0 1069 * on success, error otherwise. 1070 */ 1071 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1072 { 1073 struct super_block *sb = (struct super_block *)(journal->j_private); 1074 struct ext4_sb_info *sbi = EXT4_SB(sb); 1075 int nblks = 0, ret, bsize = journal->j_blocksize; 1076 int subtid = atomic_read(&sbi->s_fc_subtid); 1077 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; 1078 ktime_t start_time, commit_time; 1079 1080 trace_ext4_fc_commit_start(sb); 1081 1082 start_time = ktime_get(); 1083 1084 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 1085 (ext4_fc_is_ineligible(sb))) { 1086 reason = EXT4_FC_REASON_INELIGIBLE; 1087 goto out; 1088 } 1089 1090 restart_fc: 1091 ret = jbd2_fc_begin_commit(journal, commit_tid); 1092 if (ret == -EALREADY) { 1093 /* There was an ongoing commit, check if we need to restart */ 1094 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1095 commit_tid > journal->j_commit_sequence) 1096 goto restart_fc; 1097 reason = EXT4_FC_REASON_ALREADY_COMMITTED; 1098 goto out; 1099 } else if (ret) { 1100 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1101 reason = EXT4_FC_REASON_FC_START_FAILED; 1102 goto out; 1103 } 1104 1105 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1106 ret = ext4_fc_perform_commit(journal); 1107 if (ret < 0) { 1108 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1109 reason = EXT4_FC_REASON_FC_FAILED; 1110 goto out; 1111 } 1112 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1113 ret = jbd2_fc_wait_bufs(journal, nblks); 1114 if (ret < 0) { 1115 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1116 reason = EXT4_FC_REASON_FC_FAILED; 1117 goto out; 1118 } 1119 atomic_inc(&sbi->s_fc_subtid); 1120 jbd2_fc_end_commit(journal); 1121 out: 1122 /* Has any ineligible update happened since we started? 
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Get length of a particular tlv */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	return le16_to_cpu(tl->fc_len);
}
/* Get a pointer to "value" of a tlv */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
	return (u8 *)tl + sizeof(*tl);
}

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the dirent might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode occupies.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
 */
1556 */ 1557 static int ext4_fc_record_regions(struct super_block *sb, int ino, 1558 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1559 { 1560 struct ext4_fc_replay_state *state; 1561 struct ext4_fc_alloc_region *region; 1562 1563 state = &EXT4_SB(sb)->s_fc_replay_state; 1564 if (state->fc_regions_used == state->fc_regions_size) { 1565 state->fc_regions_size += 1566 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1567 state->fc_regions = krealloc( 1568 state->fc_regions, 1569 state->fc_regions_size * 1570 sizeof(struct ext4_fc_alloc_region), 1571 GFP_KERNEL); 1572 if (!state->fc_regions) 1573 return -ENOMEM; 1574 } 1575 region = &state->fc_regions[state->fc_regions_used++]; 1576 region->ino = ino; 1577 region->lblk = lblk; 1578 region->pblk = pblk; 1579 region->len = len; 1580 1581 return 0; 1582 } 1583 1584 /* Replay add range tag */ 1585 static int ext4_fc_replay_add_range(struct super_block *sb, 1586 struct ext4_fc_tl *tl) 1587 { 1588 struct ext4_fc_add_range *fc_add_ex; 1589 struct ext4_extent newex, *ex; 1590 struct inode *inode; 1591 ext4_lblk_t start, cur; 1592 int remaining, len; 1593 ext4_fsblk_t start_pblk; 1594 struct ext4_map_blocks map; 1595 struct ext4_ext_path *path = NULL; 1596 int ret; 1597 1598 fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); 1599 ex = (struct ext4_extent *)&fc_add_ex->fc_ex; 1600 1601 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1602 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block), 1603 ext4_ext_get_actual_len(ex)); 1604 1605 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), 1606 EXT4_IGET_NORMAL); 1607 if (IS_ERR_OR_NULL(inode)) { 1608 jbd_debug(1, "Inode not found."); 1609 return 0; 1610 } 1611 1612 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1613 1614 start = le32_to_cpu(ex->ee_block); 1615 start_pblk = ext4_ext_pblock(ex); 1616 len = ext4_ext_get_actual_len(ex); 1617 1618 cur = start; 1619 remaining = len; 1620 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1621 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1622 inode->i_ino); 1623 1624 while (remaining > 0) { 1625 map.m_lblk = cur; 1626 map.m_len = remaining; 1627 map.m_pblk = 0; 1628 ret = ext4_map_blocks(NULL, inode, &map, 0); 1629 1630 if (ret < 0) { 1631 iput(inode); 1632 return 0; 1633 } 1634 1635 if (ret == 0) { 1636 /* Range is not mapped */ 1637 path = ext4_find_extent(inode, cur, NULL, 0); 1638 if (IS_ERR(path)) { 1639 iput(inode); 1640 return 0; 1641 } 1642 memset(&newex, 0, sizeof(newex)); 1643 newex.ee_block = cpu_to_le32(cur); 1644 ext4_ext_store_pblock( 1645 &newex, start_pblk + cur - start); 1646 newex.ee_len = cpu_to_le16(map.m_len); 1647 if (ext4_ext_is_unwritten(ex)) 1648 ext4_ext_mark_unwritten(&newex); 1649 down_write(&EXT4_I(inode)->i_data_sem); 1650 ret = ext4_ext_insert_extent( 1651 NULL, inode, &path, &newex, 0); 1652 up_write((&EXT4_I(inode)->i_data_sem)); 1653 ext4_ext_drop_refs(path); 1654 kfree(path); 1655 if (ret) { 1656 iput(inode); 1657 return 0; 1658 } 1659 goto next; 1660 } 1661 1662 if (start_pblk + cur - start != map.m_pblk) { 1663 /* 1664 * Logical to physical mapping changed. This can happen 1665 * if this range was removed and then reallocated to 1666 * map to new physical blocks during a fast commit. 1667 */ 1668 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1669 ext4_ext_is_unwritten(ex), 1670 start_pblk + cur - start); 1671 if (ret) { 1672 iput(inode); 1673 return 0; 1674 } 1675 /* 1676 * Mark the old blocks as free since they aren't used 1677 * anymore. 
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}
ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1800 { 1801 struct ext4_fc_replay_state *state; 1802 struct inode *inode; 1803 struct ext4_ext_path *path = NULL; 1804 struct ext4_map_blocks map; 1805 int i, ret, j; 1806 ext4_lblk_t cur, end; 1807 1808 state = &EXT4_SB(sb)->s_fc_replay_state; 1809 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1810 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1811 EXT4_IGET_NORMAL); 1812 if (IS_ERR_OR_NULL(inode)) { 1813 jbd_debug(1, "Inode %d not found.", 1814 state->fc_modified_inodes[i]); 1815 continue; 1816 } 1817 cur = 0; 1818 end = EXT_MAX_BLOCKS; 1819 while (cur < end) { 1820 map.m_lblk = cur; 1821 map.m_len = end - cur; 1822 1823 ret = ext4_map_blocks(NULL, inode, &map, 0); 1824 if (ret < 0) 1825 break; 1826 1827 if (ret > 0) { 1828 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1829 if (!IS_ERR_OR_NULL(path)) { 1830 for (j = 0; j < path->p_depth; j++) 1831 ext4_mb_mark_bb(inode->i_sb, 1832 path[j].p_block, 1, 1); 1833 ext4_ext_drop_refs(path); 1834 kfree(path); 1835 } 1836 cur += ret; 1837 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1838 map.m_len, 1); 1839 } else { 1840 cur = cur + (map.m_len ? map.m_len : 1); 1841 } 1842 } 1843 iput(inode); 1844 } 1845 } 1846 1847 /* 1848 * Check if block is in excluded regions for block allocation. The simple 1849 * allocator that runs during replay phase is calls this function to see 1850 * if it is okay to use a block. 1851 */ 1852 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1853 { 1854 int i; 1855 struct ext4_fc_replay_state *state; 1856 1857 state = &EXT4_SB(sb)->s_fc_replay_state; 1858 for (i = 0; i < state->fc_regions_valid; i++) { 1859 if (state->fc_regions[i].ino == 0 || 1860 state->fc_regions[i].len == 0) 1861 continue; 1862 if (blk >= state->fc_regions[i].pblk && 1863 blk < state->fc_regions[i].pblk + state->fc_regions[i].len) 1864 return true; 1865 } 1866 return false; 1867 } 1868 1869 /* Cleanup function called after replay */ 1870 void ext4_fc_replay_cleanup(struct super_block *sb) 1871 { 1872 struct ext4_sb_info *sbi = EXT4_SB(sb); 1873 1874 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1875 kfree(sbi->s_fc_replay_state.fc_regions); 1876 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1877 } 1878 1879 /* 1880 * Recovery Scan phase handler 1881 * 1882 * This function is called during the scan phase and is responsible 1883 * for doing following things: 1884 * - Make sure the fast commit area has valid tags for replay 1885 * - Count number of tags that need to be replayed by the replay handler 1886 * - Verify CRC 1887 * - Create a list of excluded blocks for allocation during replay phase 1888 * 1889 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 1890 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 1891 * to indicate that scan has finished and JBD2 can now start replay phase. 1892 * It returns a negative error to indicate that there was an error. At the end 1893 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 1894 * to indicate the number of tags that need to replayed during the replay phase. 
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

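/*
 * Illustrative sketch (hypothetical driver loop, kept as a comment):
 * JBD2 recovery invokes the j_fc_replay_callback registered by
 * ext4_fc_init() below (i.e. ext4_fc_replay()) once per fast commit
 * block, first for PASS_SCAN and later for PASS_REPLAY, and stops as
 * soon as the handler returns anything other than
 * JBD2_FC_REPLAY_CONTINUE. "nr_fc_blocks" is a made-up bound:
 *
 *	for (off = 0; off < nr_fc_blocks; off++) {
 *		ret = journal->j_fc_replay_callback(journal, bh, pass,
 *						    off, expected_tid);
 *		if (ret != JBD2_FC_REPLAY_CONTINUE)
 *			break;
 *	}
 */
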
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for
 * ext4_fc_replay_scan() above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
			  tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					     ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

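/*
 * Illustrative sketch (hypothetical tag, kept as a comment): teaching
 * replay about a new tag means adding a case to the switch in
 * ext4_fc_replay() above, e.g. for a made-up EXT4_FC_TAG_FOO handled
 * by a made-up ext4_fc_replay_foo():
 *
 *	case EXT4_FC_TAG_FOO:
 *		ret = ext4_fc_replay_foo(sb, tl);
 *		break;
 *
 * The scan side needs the same case as well: ext4_fc_replay_scan()
 * counts and checksums every tag it knows about and treats an unknown
 * tag as the end of the valid area (JBD2_FC_REPLAY_STOP or -ECANCELED).
 */
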
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled
	 * because we could still have fast commit blocks that need to be
	 * replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			   stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}
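
/*
 * For reference, ext4_fc_info_show() above produces output of the
 * following shape (all numbers here are made up for illustration):
 *
 *	fc stats:
 *	12 commits
 *	2 ineligible
 *	40 numblks
 *	150us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	"Cross rename":	0
 *	...
 */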