1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 /* 16 * Ext4 Fast Commits 17 * ----------------- 18 * 19 * Ext4 fast commits implement fine grained journalling for Ext4. 20 * 21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 23 * TLV during the recovery phase. For the scenarios for which we currently 24 * don't have replay code, fast commit falls back to full commits. 25 * Fast commits record delta in one of the following three categories. 26 * 27 * (A) Directory entry updates: 28 * 29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 30 * - EXT4_FC_TAG_LINK - records directory entry link 31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 32 * 33 * (B) File specific data range updates: 34 * 35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 37 * 38 * (C) Inode metadata (mtime / ctime etc): 39 * 40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 41 * during recovery. Note that iblocks field is 42 * not replayed and instead derived during 43 * replay. 44 * Commit Operation 45 * ---------------- 46 * With fast commits, we maintain all the directory entry operations in the 47 * order in which they are issued in an in-memory queue. This queue is flushed 48 * to disk during the commit operation. We also maintain a list of inodes 49 * that need to be committed during a fast commit in another in memory queue of 50 * inodes. 
During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
 *   back to full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
 *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
 *   followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete.
Tail 88 * tag contains CRC of the contents and TID of the transaction after which 89 * this fast commit should be applied. Recovery code replays fast commit 90 * logs only if there's at least 1 valid tail present. For every fast commit 91 * operation, there is 1 tail. This means, we may end up with multiple tails 92 * in the fast commit space. Here's an example: 93 * 94 * - Create a new file A and remove existing file B 95 * - fsync() 96 * - Append contents to file A 97 * - Truncate file A 98 * - fsync() 99 * 100 * The fast commit space at the end of above operations would look like this: 101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] 102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| 103 * 104 * Replay code should thus check for all the valid tails in the FC area. 105 * 106 * TODOs 107 * ----- 108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 109 * eligible update must be protected within ext4_fc_start_update() and 110 * ext4_fc_stop_update(). These routines are called at much higher 111 * routines. This can be made more fine grained by combining with 112 * ext4_journal_start(). 113 * 114 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() 115 * 116 * 3) Handle more ineligible cases. 
 */

#include <trace/events/ext4.h>

/* Slab cache for in-memory directory entry update records. */
static struct kmem_cache *ext4_fc_dentry_cachep;

/*
 * end_io handler for fast commit buffers: propagate the write result into
 * the buffer's uptodate state and release the buffer lock that was taken
 * before submission in ext4_fc_submit_bh().
 */
static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

/* Forget the logical block range currently tracked on this inode. */
static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

/* Initialize all fast commit related fields of an in-core inode. */
void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
	ei->i_fc_committed_subtid = 0;
}

/*
 * Inform Ext4's fast commit machinery about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to track if fast commits are disabled or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		wait_queue_head_t *wq;
		/*
		 * EXT4_STATE_FC_COMMITTING lives in i_state_flags on 32-bit
		 * systems and in i_flags otherwise, so the wait bit must be
		 * taken on whichever word actually holds it.
		 */
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_COMMITTING);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		schedule();
		finish_wait(wq, &wait.wq_entry);
		/* Re-check the commit state under the lock. */
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	/* The last updater wakes up a fast commit waiting on this inode. */
	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		wait_queue_head_t *wq;
		/* Same bit-waitqueue scheme as in ext4_fc_start_update(). */
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_COMMITTING);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		schedule();
		finish_wait(wq, &wait.wq_entry);
		goto restart;
	}
	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/* True if the next commit must fall back to a full jbd2 commit. */
static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	tid_t running_txn_tid;
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	/* TID of the running transaction; 0 if no journal is set up. */
	running_txn_tid = sbi->s_journal ?
		sbi->s_journal->j_commit_sequence + 1 : 0;

	mutex_lock(&ei->i_fc_lock);
	if (running_txn_tid == ei->i_sync_tid) {
		/* Already tracked in this transaction - just update. */
		update = true;
	} else {
		/* First tracking call in this transaction - start fresh. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = running_txn_tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		/*
		 * Updates that arrive while a fast commit is in progress go
		 * on the staging queue instead of the main queue.
		 */
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Arguments for __track_dentry_update(): the dentry and the fc tag (op). */
struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop i_fc_lock around the allocation; retaken before returning. */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		/* Name does not fit the inline buffer - allocate a copy. */
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_MEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

/* Record a directory entry unlink for the next fast commit. */
void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

/* Record a directory entry link for the next fast commit. */
void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

/* Record an inode + directory entry creation for the next fast commit. */
void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	/* Inode is tracked at most once per transaction. */
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

/* Mark a non-directory inode for inclusion in the next fast commit. */
void ext4_fc_track_inode(struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

/* Inclusive logical block range [start, end] to be tracked. */
struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/* Grow the tracked range to also cover the new range. */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

/* Track the inclusive block range [start, end] of @inode for fast commit. */
void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

/* Submit the current fast commit buffer for write and forget it. */
static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	/* The buffer is unlocked by the end_io handler on completion. */
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 * new block is from jbd2 and CRC is updated to reflect the padding
 * we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded-out block and start on a fresh one from jbd2. */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account the skipped tail of the old block plus this allocation. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's no enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/* The tail claims all remaining space in the block. */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	/* The CRC field itself is not folded into the checksum (crc = NULL). */
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
679 */ 680 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 681 u32 *crc) 682 { 683 struct ext4_fc_tl tl; 684 u8 *dst; 685 686 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 687 if (!dst) 688 return false; 689 690 tl.fc_tag = cpu_to_le16(tag); 691 tl.fc_len = cpu_to_le16(len); 692 693 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 694 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 695 696 return true; 697 } 698 699 /* Same as above, but adds dentry tlv. */ 700 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, 701 int parent_ino, int ino, int dlen, 702 const unsigned char *dname, 703 u32 *crc) 704 { 705 struct ext4_fc_dentry_info fcd; 706 struct ext4_fc_tl tl; 707 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, 708 crc); 709 710 if (!dst) 711 return false; 712 713 fcd.fc_parent_ino = cpu_to_le32(parent_ino); 714 fcd.fc_ino = cpu_to_le32(ino); 715 tl.fc_tag = cpu_to_le16(tag); 716 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 717 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 718 dst += sizeof(tl); 719 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); 720 dst += sizeof(fcd); 721 ext4_fc_memcpy(sb, dst, dname, dlen, crc); 722 dst += dlen; 723 724 return true; 725 } 726 727 /* 728 * Writes inode in the fast commit space under TLV with tag @tag. 729 * Returns 0 on success, error on failure. 
730 */ 731 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 732 { 733 struct ext4_inode_info *ei = EXT4_I(inode); 734 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 735 int ret; 736 struct ext4_iloc iloc; 737 struct ext4_fc_inode fc_inode; 738 struct ext4_fc_tl tl; 739 u8 *dst; 740 741 ret = ext4_get_inode_loc(inode, &iloc); 742 if (ret) 743 return ret; 744 745 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 746 inode_len += ei->i_extra_isize; 747 748 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 749 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 750 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 751 752 dst = ext4_fc_reserve_space(inode->i_sb, 753 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); 754 if (!dst) 755 return -ECANCELED; 756 757 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) 758 return -ECANCELED; 759 dst += sizeof(tl); 760 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) 761 return -ECANCELED; 762 dst += sizeof(fc_inode); 763 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), 764 inode_len, crc)) 765 return -ECANCELED; 766 767 return 0; 768 } 769 770 /* 771 * Writes updated data ranges for the inode in question. Updates CRC. 772 * Returns 0 on success, error otherwise. 
773 */ 774 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 775 { 776 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 777 struct ext4_inode_info *ei = EXT4_I(inode); 778 struct ext4_map_blocks map; 779 struct ext4_fc_add_range fc_ext; 780 struct ext4_fc_del_range lrange; 781 struct ext4_extent *ex; 782 int ret; 783 784 mutex_lock(&ei->i_fc_lock); 785 if (ei->i_fc_lblk_len == 0) { 786 mutex_unlock(&ei->i_fc_lock); 787 return 0; 788 } 789 old_blk_size = ei->i_fc_lblk_start; 790 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 791 ei->i_fc_lblk_len = 0; 792 mutex_unlock(&ei->i_fc_lock); 793 794 cur_lblk_off = old_blk_size; 795 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 796 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 797 798 while (cur_lblk_off <= new_blk_size) { 799 map.m_lblk = cur_lblk_off; 800 map.m_len = new_blk_size - cur_lblk_off + 1; 801 ret = ext4_map_blocks(NULL, inode, &map, 0); 802 if (ret < 0) 803 return -ECANCELED; 804 805 if (map.m_len == 0) { 806 cur_lblk_off++; 807 continue; 808 } 809 810 if (ret == 0) { 811 lrange.fc_ino = cpu_to_le32(inode->i_ino); 812 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 813 lrange.fc_len = cpu_to_le32(map.m_len); 814 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 815 sizeof(lrange), (u8 *)&lrange, crc)) 816 return -ENOSPC; 817 } else { 818 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 819 ex = (struct ext4_extent *)&fc_ext.fc_ex; 820 ex->ee_block = cpu_to_le32(map.m_lblk); 821 ex->ee_len = cpu_to_le16(map.m_len); 822 ext4_ext_store_pblock(ex, map.m_pblk); 823 if (map.m_flags & EXT4_MAP_UNWRITTEN) 824 ext4_ext_mark_unwritten(ex); 825 else 826 ext4_ext_mark_initialized(ex); 827 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 828 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 829 return -ENOSPC; 830 } 831 832 cur_lblk_off += map.m_len; 833 } 834 835 return 0; 836 } 837 838 839 /* Submit data for all the fast commit inodes */ 840 static int 
ext4_fc_submit_inode_data_all(journal_t *journal) 841 { 842 struct super_block *sb = (struct super_block *)(journal->j_private); 843 struct ext4_sb_info *sbi = EXT4_SB(sb); 844 struct ext4_inode_info *ei; 845 struct list_head *pos; 846 int ret = 0; 847 848 spin_lock(&sbi->s_fc_lock); 849 sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING; 850 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 851 ei = list_entry(pos, struct ext4_inode_info, i_fc_list); 852 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 853 while (atomic_read(&ei->i_fc_updates)) { 854 DEFINE_WAIT(wait); 855 856 prepare_to_wait(&ei->i_fc_wait, &wait, 857 TASK_UNINTERRUPTIBLE); 858 if (atomic_read(&ei->i_fc_updates)) { 859 spin_unlock(&sbi->s_fc_lock); 860 schedule(); 861 spin_lock(&sbi->s_fc_lock); 862 } 863 finish_wait(&ei->i_fc_wait, &wait); 864 } 865 spin_unlock(&sbi->s_fc_lock); 866 ret = jbd2_submit_inode_data(ei->jinode); 867 if (ret) 868 return ret; 869 spin_lock(&sbi->s_fc_lock); 870 } 871 spin_unlock(&sbi->s_fc_lock); 872 873 return ret; 874 } 875 876 /* Wait for completion of data for all the fast commit inodes */ 877 static int ext4_fc_wait_inode_data_all(journal_t *journal) 878 { 879 struct super_block *sb = (struct super_block *)(journal->j_private); 880 struct ext4_sb_info *sbi = EXT4_SB(sb); 881 struct ext4_inode_info *pos, *n; 882 int ret = 0; 883 884 spin_lock(&sbi->s_fc_lock); 885 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 886 if (!ext4_test_inode_state(&pos->vfs_inode, 887 EXT4_STATE_FC_COMMITTING)) 888 continue; 889 spin_unlock(&sbi->s_fc_lock); 890 891 ret = jbd2_wait_inode_data(journal, pos->jinode); 892 if (ret) 893 return ret; 894 spin_lock(&sbi->s_fc_lock); 895 } 896 spin_unlock(&sbi->s_fc_lock); 897 898 return 0; 899 } 900 901 /* Commit all the directory entry updates */ 902 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 903 { 904 struct super_block *sb = (struct super_block *)(journal->j_private); 905 struct 
ext4_sb_info *sbi = EXT4_SB(sb); 906 struct ext4_fc_dentry_update *fc_dentry; 907 struct inode *inode; 908 struct list_head *pos, *n, *fcd_pos, *fcd_n; 909 struct ext4_inode_info *ei; 910 int ret; 911 912 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 913 return 0; 914 list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { 915 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, 916 fcd_list); 917 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 918 spin_unlock(&sbi->s_fc_lock); 919 if (!ext4_fc_add_dentry_tlv( 920 sb, fc_dentry->fcd_op, 921 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 922 fc_dentry->fcd_name.len, 923 fc_dentry->fcd_name.name, crc)) { 924 ret = -ENOSPC; 925 goto lock_and_exit; 926 } 927 spin_lock(&sbi->s_fc_lock); 928 continue; 929 } 930 931 inode = NULL; 932 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { 933 ei = list_entry(pos, struct ext4_inode_info, i_fc_list); 934 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { 935 inode = &ei->vfs_inode; 936 break; 937 } 938 } 939 /* 940 * If we don't find inode in our list, then it was deleted, 941 * in which case, we don't need to record it's create tag. 942 */ 943 if (!inode) 944 continue; 945 spin_unlock(&sbi->s_fc_lock); 946 947 /* 948 * We first write the inode and then the create dirent. This 949 * allows the recovery code to create an unnamed inode first 950 * and then link it to a directory entry. This allows us 951 * to use namei.c routines almost as is and simplifies 952 * the recovery code. 
953 */ 954 ret = ext4_fc_write_inode(inode, crc); 955 if (ret) 956 goto lock_and_exit; 957 958 ret = ext4_fc_write_inode_data(inode, crc); 959 if (ret) 960 goto lock_and_exit; 961 962 if (!ext4_fc_add_dentry_tlv( 963 sb, fc_dentry->fcd_op, 964 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 965 fc_dentry->fcd_name.len, 966 fc_dentry->fcd_name.name, crc)) { 967 ret = -ENOSPC; 968 goto lock_and_exit; 969 } 970 971 spin_lock(&sbi->s_fc_lock); 972 } 973 return 0; 974 lock_and_exit: 975 spin_lock(&sbi->s_fc_lock); 976 return ret; 977 } 978 979 static int ext4_fc_perform_commit(journal_t *journal) 980 { 981 struct super_block *sb = (struct super_block *)(journal->j_private); 982 struct ext4_sb_info *sbi = EXT4_SB(sb); 983 struct ext4_inode_info *iter; 984 struct ext4_fc_head head; 985 struct list_head *pos; 986 struct inode *inode; 987 struct blk_plug plug; 988 int ret = 0; 989 u32 crc = 0; 990 991 ret = ext4_fc_submit_inode_data_all(journal); 992 if (ret) 993 return ret; 994 995 ret = ext4_fc_wait_inode_data_all(journal); 996 if (ret) 997 return ret; 998 999 blk_start_plug(&plug); 1000 if (sbi->s_fc_bytes == 0) { 1001 /* 1002 * Add a head tag only if this is the first fast commit 1003 * in this TID. 
1004 */ 1005 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1006 head.fc_tid = cpu_to_le32( 1007 sbi->s_journal->j_running_transaction->t_tid); 1008 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1009 (u8 *)&head, &crc)) 1010 goto out; 1011 } 1012 1013 spin_lock(&sbi->s_fc_lock); 1014 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1015 if (ret) { 1016 spin_unlock(&sbi->s_fc_lock); 1017 goto out; 1018 } 1019 1020 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 1021 iter = list_entry(pos, struct ext4_inode_info, i_fc_list); 1022 inode = &iter->vfs_inode; 1023 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1024 continue; 1025 1026 spin_unlock(&sbi->s_fc_lock); 1027 ret = ext4_fc_write_inode_data(inode, &crc); 1028 if (ret) 1029 goto out; 1030 ret = ext4_fc_write_inode(inode, &crc); 1031 if (ret) 1032 goto out; 1033 spin_lock(&sbi->s_fc_lock); 1034 EXT4_I(inode)->i_fc_committed_subtid = 1035 atomic_read(&sbi->s_fc_subtid); 1036 } 1037 spin_unlock(&sbi->s_fc_lock); 1038 1039 ret = ext4_fc_write_tail(sb, crc); 1040 1041 out: 1042 blk_finish_plug(&plug); 1043 return ret; 1044 } 1045 1046 /* 1047 * The main commit entry point. Performs a fast commit for transaction 1048 * commit_tid if needed. If it's not possible to perform a fast commit 1049 * due to various reasons, we fall back to full commit. Returns 0 1050 * on success, error otherwise. 
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fast commit disabled or marked ineligible: take the full path. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Measure how many FC blocks this commit adds (round up to blocks). */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	/* Step [7]: wait for the fast commit blocks to hit the disk. */
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* On failure, fall back to (or wait for) a full journal commit. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal, commit_tid);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
1144 */ 1145 static void ext4_fc_cleanup(journal_t *journal, int full) 1146 { 1147 struct super_block *sb = journal->j_private; 1148 struct ext4_sb_info *sbi = EXT4_SB(sb); 1149 struct ext4_inode_info *iter; 1150 struct ext4_fc_dentry_update *fc_dentry; 1151 struct list_head *pos, *n; 1152 1153 if (full && sbi->s_fc_bh) 1154 sbi->s_fc_bh = NULL; 1155 1156 jbd2_fc_release_bufs(journal); 1157 1158 spin_lock(&sbi->s_fc_lock); 1159 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { 1160 iter = list_entry(pos, struct ext4_inode_info, i_fc_list); 1161 list_del_init(&iter->i_fc_list); 1162 ext4_clear_inode_state(&iter->vfs_inode, 1163 EXT4_STATE_FC_COMMITTING); 1164 ext4_fc_reset_inode(&iter->vfs_inode); 1165 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1166 smp_mb(); 1167 #if (BITS_PER_LONG < 64) 1168 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1169 #else 1170 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1171 #endif 1172 } 1173 1174 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1175 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1176 struct ext4_fc_dentry_update, 1177 fcd_list); 1178 list_del_init(&fc_dentry->fcd_list); 1179 spin_unlock(&sbi->s_fc_lock); 1180 1181 if (fc_dentry->fcd_name.name && 1182 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1183 kfree(fc_dentry->fcd_name.name); 1184 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1185 spin_lock(&sbi->s_fc_lock); 1186 } 1187 1188 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1189 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1190 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1191 &sbi->s_fc_q[FC_Q_STAGING]); 1192 1193 sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; 1194 sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; 1195 1196 if (full) 1197 sbi->s_fc_bytes = 0; 1198 spin_unlock(&sbi->s_fc_lock); 1199 trace_ext4_fc_stats(sb); 1200 } 1201 1202 /* Ext4 Replay Path Routines */ 1203 1204 /* Get length of a particular tlv */ 1205 static inline int ext4_fc_tag_len(struct 
ext4_fc_tl *tl) 1206 { 1207 return le16_to_cpu(tl->fc_len); 1208 } 1209 1210 /* Get a pointer to "value" of a tlv */ 1211 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 1212 { 1213 return (u8 *)tl + sizeof(*tl); 1214 } 1215 1216 /* Helper struct for dentry replay routines */ 1217 struct dentry_info_args { 1218 int parent_ino, dname_len, ino, inode_len; 1219 char *dname; 1220 }; 1221 1222 static inline void tl_to_darg(struct dentry_info_args *darg, 1223 struct ext4_fc_tl *tl) 1224 { 1225 struct ext4_fc_dentry_info *fcd; 1226 1227 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); 1228 1229 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); 1230 darg->ino = le32_to_cpu(fcd->fc_ino); 1231 darg->dname = fcd->fc_dname; 1232 darg->dname_len = ext4_fc_tag_len(tl) - 1233 sizeof(struct ext4_fc_dentry_info); 1234 } 1235 1236 /* Unlink replay function */ 1237 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) 1238 { 1239 struct inode *inode, *old_parent; 1240 struct qstr entry; 1241 struct dentry_info_args darg; 1242 int ret = 0; 1243 1244 tl_to_darg(&darg, tl); 1245 1246 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1247 darg.parent_ino, darg.dname_len); 1248 1249 entry.name = darg.dname; 1250 entry.len = darg.dname_len; 1251 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1252 1253 if (IS_ERR_OR_NULL(inode)) { 1254 jbd_debug(1, "Inode %d not found", darg.ino); 1255 return 0; 1256 } 1257 1258 old_parent = ext4_iget(sb, darg.parent_ino, 1259 EXT4_IGET_NORMAL); 1260 if (IS_ERR_OR_NULL(old_parent)) { 1261 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); 1262 iput(inode); 1263 return 0; 1264 } 1265 1266 ret = __ext4_unlink(old_parent, &entry, inode); 1267 /* -ENOENT ok coz it might not exist anymore. 
*/ 1268 if (ret == -ENOENT) 1269 ret = 0; 1270 iput(old_parent); 1271 iput(inode); 1272 return ret; 1273 } 1274 1275 static int ext4_fc_replay_link_internal(struct super_block *sb, 1276 struct dentry_info_args *darg, 1277 struct inode *inode) 1278 { 1279 struct inode *dir = NULL; 1280 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1281 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1282 int ret = 0; 1283 1284 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1285 if (IS_ERR(dir)) { 1286 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); 1287 dir = NULL; 1288 goto out; 1289 } 1290 1291 dentry_dir = d_obtain_alias(dir); 1292 if (IS_ERR(dentry_dir)) { 1293 jbd_debug(1, "Failed to obtain dentry"); 1294 dentry_dir = NULL; 1295 goto out; 1296 } 1297 1298 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1299 if (!dentry_inode) { 1300 jbd_debug(1, "Inode dentry not created."); 1301 ret = -ENOMEM; 1302 goto out; 1303 } 1304 1305 ret = __ext4_link(dir, inode, dentry_inode); 1306 /* 1307 * It's possible that link already existed since data blocks 1308 * for the dir in question got persisted before we crashed OR 1309 * we replayed this tag and crashed before the entire replay 1310 * could complete. 
1311 */ 1312 if (ret && ret != -EEXIST) { 1313 jbd_debug(1, "Failed to link\n"); 1314 goto out; 1315 } 1316 1317 ret = 0; 1318 out: 1319 if (dentry_dir) { 1320 d_drop(dentry_dir); 1321 dput(dentry_dir); 1322 } else if (dir) { 1323 iput(dir); 1324 } 1325 if (dentry_inode) { 1326 d_drop(dentry_inode); 1327 dput(dentry_inode); 1328 } 1329 1330 return ret; 1331 } 1332 1333 /* Link replay function */ 1334 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) 1335 { 1336 struct inode *inode; 1337 struct dentry_info_args darg; 1338 int ret = 0; 1339 1340 tl_to_darg(&darg, tl); 1341 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1342 darg.parent_ino, darg.dname_len); 1343 1344 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1345 if (IS_ERR_OR_NULL(inode)) { 1346 jbd_debug(1, "Inode not found."); 1347 return 0; 1348 } 1349 1350 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1351 iput(inode); 1352 return ret; 1353 } 1354 1355 /* 1356 * Record all the modified inodes during replay. We use this later to setup 1357 * block bitmaps correctly. 
1358 */ 1359 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1360 { 1361 struct ext4_fc_replay_state *state; 1362 int i; 1363 1364 state = &EXT4_SB(sb)->s_fc_replay_state; 1365 for (i = 0; i < state->fc_modified_inodes_used; i++) 1366 if (state->fc_modified_inodes[i] == ino) 1367 return 0; 1368 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1369 state->fc_modified_inodes_size += 1370 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1371 state->fc_modified_inodes = krealloc( 1372 state->fc_modified_inodes, sizeof(int) * 1373 state->fc_modified_inodes_size, 1374 GFP_KERNEL); 1375 if (!state->fc_modified_inodes) 1376 return -ENOMEM; 1377 } 1378 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1379 return 0; 1380 } 1381 1382 /* 1383 * Inode replay function 1384 */ 1385 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) 1386 { 1387 struct ext4_fc_inode *fc_inode; 1388 struct ext4_inode *raw_inode; 1389 struct ext4_inode *raw_fc_inode; 1390 struct inode *inode = NULL; 1391 struct ext4_iloc iloc; 1392 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1393 struct ext4_extent_header *eh; 1394 1395 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); 1396 1397 ino = le32_to_cpu(fc_inode->fc_ino); 1398 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1399 1400 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1401 if (!IS_ERR_OR_NULL(inode)) { 1402 ext4_ext_clear_bb(inode); 1403 iput(inode); 1404 } 1405 1406 ext4_fc_record_modified_inode(sb, ino); 1407 1408 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; 1409 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1410 if (ret) 1411 goto out; 1412 1413 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); 1414 raw_inode = ext4_raw_inode(&iloc); 1415 1416 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1417 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, 1418 inode_len - offsetof(struct ext4_inode, 
i_generation)); 1419 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1420 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1421 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1422 memset(eh, 0, sizeof(*eh)); 1423 eh->eh_magic = EXT4_EXT_MAGIC; 1424 eh->eh_max = cpu_to_le16( 1425 (sizeof(raw_inode->i_block) - 1426 sizeof(struct ext4_extent_header)) 1427 / sizeof(struct ext4_extent)); 1428 } 1429 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1430 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1431 sizeof(raw_inode->i_block)); 1432 } 1433 1434 /* Immediately update the inode on disk. */ 1435 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1436 if (ret) 1437 goto out; 1438 ret = sync_dirty_buffer(iloc.bh); 1439 if (ret) 1440 goto out; 1441 ret = ext4_mark_inode_used(sb, ino); 1442 if (ret) 1443 goto out; 1444 1445 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1446 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1447 if (IS_ERR_OR_NULL(inode)) { 1448 jbd_debug(1, "Inode not found."); 1449 return -EFSCORRUPTED; 1450 } 1451 1452 /* 1453 * Our allocator could have made different decisions than before 1454 * crashing. This should be fixed but until then, we calculate 1455 * the number of blocks the inode. 1456 */ 1457 ext4_ext_replay_set_iblocks(inode); 1458 1459 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1460 ext4_reset_inode_seed(inode); 1461 1462 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1463 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1464 sync_dirty_buffer(iloc.bh); 1465 brelse(iloc.bh); 1466 out: 1467 iput(inode); 1468 if (!ret) 1469 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); 1470 1471 return 0; 1472 } 1473 1474 /* 1475 * Dentry create replay function. 1476 * 1477 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. 
Which means, the 1478 * inode for which we are trying to create a dentry here, should already have 1479 * been replayed before we start here. 1480 */ 1481 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) 1482 { 1483 int ret = 0; 1484 struct inode *inode = NULL; 1485 struct inode *dir = NULL; 1486 struct dentry_info_args darg; 1487 1488 tl_to_darg(&darg, tl); 1489 1490 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1491 darg.parent_ino, darg.dname_len); 1492 1493 /* This takes care of update group descriptor and other metadata */ 1494 ret = ext4_mark_inode_used(sb, darg.ino); 1495 if (ret) 1496 goto out; 1497 1498 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1499 if (IS_ERR_OR_NULL(inode)) { 1500 jbd_debug(1, "inode %d not found.", darg.ino); 1501 inode = NULL; 1502 ret = -EINVAL; 1503 goto out; 1504 } 1505 1506 if (S_ISDIR(inode->i_mode)) { 1507 /* 1508 * If we are creating a directory, we need to make sure that the 1509 * dot and dot dot dirents are setup properly. 1510 */ 1511 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1512 if (IS_ERR_OR_NULL(dir)) { 1513 jbd_debug(1, "Dir %d not found.", darg.ino); 1514 goto out; 1515 } 1516 ret = ext4_init_new_dir(NULL, dir, inode); 1517 iput(dir); 1518 if (ret) { 1519 ret = 0; 1520 goto out; 1521 } 1522 } 1523 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1524 if (ret) 1525 goto out; 1526 set_nlink(inode, 1); 1527 ext4_mark_inode_dirty(NULL, inode); 1528 out: 1529 if (inode) 1530 iput(inode); 1531 return ret; 1532 } 1533 1534 /* 1535 * Record physical disk regions which are in use as per fast commit area. Our 1536 * simple replay phase allocator excludes these regions from allocation. 
1537 */ 1538 static int ext4_fc_record_regions(struct super_block *sb, int ino, 1539 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1540 { 1541 struct ext4_fc_replay_state *state; 1542 struct ext4_fc_alloc_region *region; 1543 1544 state = &EXT4_SB(sb)->s_fc_replay_state; 1545 if (state->fc_regions_used == state->fc_regions_size) { 1546 state->fc_regions_size += 1547 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1548 state->fc_regions = krealloc( 1549 state->fc_regions, 1550 state->fc_regions_size * 1551 sizeof(struct ext4_fc_alloc_region), 1552 GFP_KERNEL); 1553 if (!state->fc_regions) 1554 return -ENOMEM; 1555 } 1556 region = &state->fc_regions[state->fc_regions_used++]; 1557 region->ino = ino; 1558 region->lblk = lblk; 1559 region->pblk = pblk; 1560 region->len = len; 1561 1562 return 0; 1563 } 1564 1565 /* Replay add range tag */ 1566 static int ext4_fc_replay_add_range(struct super_block *sb, 1567 struct ext4_fc_tl *tl) 1568 { 1569 struct ext4_fc_add_range *fc_add_ex; 1570 struct ext4_extent newex, *ex; 1571 struct inode *inode; 1572 ext4_lblk_t start, cur; 1573 int remaining, len; 1574 ext4_fsblk_t start_pblk; 1575 struct ext4_map_blocks map; 1576 struct ext4_ext_path *path = NULL; 1577 int ret; 1578 1579 fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); 1580 ex = (struct ext4_extent *)&fc_add_ex->fc_ex; 1581 1582 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1583 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block), 1584 ext4_ext_get_actual_len(ex)); 1585 1586 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), 1587 EXT4_IGET_NORMAL); 1588 if (IS_ERR_OR_NULL(inode)) { 1589 jbd_debug(1, "Inode not found."); 1590 return 0; 1591 } 1592 1593 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1594 1595 start = le32_to_cpu(ex->ee_block); 1596 start_pblk = ext4_ext_pblock(ex); 1597 len = ext4_ext_get_actual_len(ex); 1598 1599 cur = start; 1600 remaining = len; 1601 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1602 
start, start_pblk, len, ext4_ext_is_unwritten(ex), 1603 inode->i_ino); 1604 1605 while (remaining > 0) { 1606 map.m_lblk = cur; 1607 map.m_len = remaining; 1608 map.m_pblk = 0; 1609 ret = ext4_map_blocks(NULL, inode, &map, 0); 1610 1611 if (ret < 0) { 1612 iput(inode); 1613 return 0; 1614 } 1615 1616 if (ret == 0) { 1617 /* Range is not mapped */ 1618 path = ext4_find_extent(inode, cur, NULL, 0); 1619 if (IS_ERR(path)) { 1620 iput(inode); 1621 return 0; 1622 } 1623 memset(&newex, 0, sizeof(newex)); 1624 newex.ee_block = cpu_to_le32(cur); 1625 ext4_ext_store_pblock( 1626 &newex, start_pblk + cur - start); 1627 newex.ee_len = cpu_to_le16(map.m_len); 1628 if (ext4_ext_is_unwritten(ex)) 1629 ext4_ext_mark_unwritten(&newex); 1630 down_write(&EXT4_I(inode)->i_data_sem); 1631 ret = ext4_ext_insert_extent( 1632 NULL, inode, &path, &newex, 0); 1633 up_write((&EXT4_I(inode)->i_data_sem)); 1634 ext4_ext_drop_refs(path); 1635 kfree(path); 1636 if (ret) { 1637 iput(inode); 1638 return 0; 1639 } 1640 goto next; 1641 } 1642 1643 if (start_pblk + cur - start != map.m_pblk) { 1644 /* 1645 * Logical to physical mapping changed. This can happen 1646 * if this range was removed and then reallocated to 1647 * map to new physical blocks during a fast commit. 1648 */ 1649 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1650 ext4_ext_is_unwritten(ex), 1651 start_pblk + cur - start); 1652 if (ret) { 1653 iput(inode); 1654 return 0; 1655 } 1656 /* 1657 * Mark the old blocks as free since they aren't used 1658 * anymore. We maintain an array of all the modified 1659 * inodes. In case these blocks are still used at either 1660 * a different logical range in the same inode or in 1661 * some different inode, we will mark them as allocated 1662 * at the end of the FC replay using our array of 1663 * modified inodes. 
1664 */ 1665 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1666 goto next; 1667 } 1668 1669 /* Range is mapped and needs a state change */ 1670 jbd_debug(1, "Converting from %d to %d %lld", 1671 map.m_flags & EXT4_MAP_UNWRITTEN, 1672 ext4_ext_is_unwritten(ex), map.m_pblk); 1673 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1674 ext4_ext_is_unwritten(ex), map.m_pblk); 1675 if (ret) { 1676 iput(inode); 1677 return 0; 1678 } 1679 /* 1680 * We may have split the extent tree while toggling the state. 1681 * Try to shrink the extent tree now. 1682 */ 1683 ext4_ext_replay_shrink_inode(inode, start + len); 1684 next: 1685 cur += map.m_len; 1686 remaining -= map.m_len; 1687 } 1688 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1689 sb->s_blocksize_bits); 1690 iput(inode); 1691 return 0; 1692 } 1693 1694 /* Replay DEL_RANGE tag */ 1695 static int 1696 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) 1697 { 1698 struct inode *inode; 1699 struct ext4_fc_del_range *lrange; 1700 struct ext4_map_blocks map; 1701 ext4_lblk_t cur, remaining; 1702 int ret; 1703 1704 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); 1705 cur = le32_to_cpu(lrange->fc_lblk); 1706 remaining = le32_to_cpu(lrange->fc_len); 1707 1708 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1709 le32_to_cpu(lrange->fc_ino), cur, remaining); 1710 1711 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); 1712 if (IS_ERR_OR_NULL(inode)) { 1713 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); 1714 return 0; 1715 } 1716 1717 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1718 1719 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1720 inode->i_ino, le32_to_cpu(lrange->fc_lblk), 1721 le32_to_cpu(lrange->fc_len)); 1722 while (remaining > 0) { 1723 map.m_lblk = cur; 1724 map.m_len = remaining; 1725 1726 ret = ext4_map_blocks(NULL, inode, &map, 0); 1727 if (ret < 0) { 1728 iput(inode); 1729 return 0; 1730 } 1731 if 
(ret > 0) { 1732 remaining -= ret; 1733 cur += ret; 1734 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1735 } else { 1736 remaining -= map.m_len; 1737 cur += map.m_len; 1738 } 1739 } 1740 1741 ret = ext4_punch_hole(inode, 1742 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, 1743 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); 1744 if (ret) 1745 jbd_debug(1, "ext4_punch_hole returned %d", ret); 1746 ext4_ext_replay_shrink_inode(inode, 1747 i_size_read(inode) >> sb->s_blocksize_bits); 1748 ext4_mark_inode_dirty(NULL, inode); 1749 iput(inode); 1750 1751 return 0; 1752 } 1753 1754 static inline const char *tag2str(u16 tag) 1755 { 1756 switch (tag) { 1757 case EXT4_FC_TAG_LINK: 1758 return "TAG_ADD_ENTRY"; 1759 case EXT4_FC_TAG_UNLINK: 1760 return "TAG_DEL_ENTRY"; 1761 case EXT4_FC_TAG_ADD_RANGE: 1762 return "TAG_ADD_RANGE"; 1763 case EXT4_FC_TAG_CREAT: 1764 return "TAG_CREAT_DENTRY"; 1765 case EXT4_FC_TAG_DEL_RANGE: 1766 return "TAG_DEL_RANGE"; 1767 case EXT4_FC_TAG_INODE: 1768 return "TAG_INODE"; 1769 case EXT4_FC_TAG_PAD: 1770 return "TAG_PAD"; 1771 case EXT4_FC_TAG_TAIL: 1772 return "TAG_TAIL"; 1773 case EXT4_FC_TAG_HEAD: 1774 return "TAG_HEAD"; 1775 default: 1776 return "TAG_ERROR"; 1777 } 1778 } 1779 1780 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1781 { 1782 struct ext4_fc_replay_state *state; 1783 struct inode *inode; 1784 struct ext4_ext_path *path = NULL; 1785 struct ext4_map_blocks map; 1786 int i, ret, j; 1787 ext4_lblk_t cur, end; 1788 1789 state = &EXT4_SB(sb)->s_fc_replay_state; 1790 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1791 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1792 EXT4_IGET_NORMAL); 1793 if (IS_ERR_OR_NULL(inode)) { 1794 jbd_debug(1, "Inode %d not found.", 1795 state->fc_modified_inodes[i]); 1796 continue; 1797 } 1798 cur = 0; 1799 end = EXT_MAX_BLOCKS; 1800 while (cur < end) { 1801 map.m_lblk = cur; 1802 map.m_len = end - cur; 1803 1804 ret = ext4_map_blocks(NULL, inode, 
&map, 0); 1805 if (ret < 0) 1806 break; 1807 1808 if (ret > 0) { 1809 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1810 if (!IS_ERR_OR_NULL(path)) { 1811 for (j = 0; j < path->p_depth; j++) 1812 ext4_mb_mark_bb(inode->i_sb, 1813 path[j].p_block, 1, 1); 1814 ext4_ext_drop_refs(path); 1815 kfree(path); 1816 } 1817 cur += ret; 1818 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1819 map.m_len, 1); 1820 } else { 1821 cur = cur + (map.m_len ? map.m_len : 1); 1822 } 1823 } 1824 iput(inode); 1825 } 1826 } 1827 1828 /* 1829 * Check if block is in excluded regions for block allocation. The simple 1830 * allocator that runs during replay phase is calls this function to see 1831 * if it is okay to use a block. 1832 */ 1833 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1834 { 1835 int i; 1836 struct ext4_fc_replay_state *state; 1837 1838 state = &EXT4_SB(sb)->s_fc_replay_state; 1839 for (i = 0; i < state->fc_regions_valid; i++) { 1840 if (state->fc_regions[i].ino == 0 || 1841 state->fc_regions[i].len == 0) 1842 continue; 1843 if (blk >= state->fc_regions[i].pblk && 1844 blk < state->fc_regions[i].pblk + state->fc_regions[i].len) 1845 return true; 1846 } 1847 return false; 1848 } 1849 1850 /* Cleanup function called after replay */ 1851 void ext4_fc_replay_cleanup(struct super_block *sb) 1852 { 1853 struct ext4_sb_info *sbi = EXT4_SB(sb); 1854 1855 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1856 kfree(sbi->s_fc_replay_state.fc_regions); 1857 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1858 } 1859 1860 /* 1861 * Recovery Scan phase handler 1862 * 1863 * This function is called during the scan phase and is responsible 1864 * for doing following things: 1865 * - Make sure the fast commit area has valid tags for replay 1866 * - Count number of tags that need to be replayed by the replay handler 1867 * - Verify CRC 1868 * - Create a list of excluded blocks for allocation during replay phase 1869 * 1870 * This function returns 
 * JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First block of the scan: reset all per-scan replay state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive in order; anything else means corruption. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Remember the region so the replay-phase allocator
			 * excludes it, then fall through to common counting.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * Tail carries the TID and CRC; only a matching pair
			 * validates the tags seen so far for replay.
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop if we have valid tags, else fail. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	/* The scan pass is handled entirely by ext4_fc_replay_scan(). */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of the replay pass: flag that replay is in progress. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* Dispatch each TLV to its replay handler, in on-disk order. */
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

/*
 * Register the fast commit callbacks with the journal and size the fast
 * commit area. Called at mount time.
 */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	int num_fc_blocks;

	/*
	 * We set replay callback even if fast commit disabled because we may
	 * could still have fast commit blocks that need to be replayed even if
	 * fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
	if (!buffer_uptodate(journal->j_sb_buffer)
		&& ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
					true)) {
		ext4_msg(sb, KERN_ERR, "I/O error on journal");
		return;
	}
	/* Use the on-disk FC area size if set, else the built-in default. */
	num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
	if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
					EXT4_NUM_FC_BLKS)) {
		pr_warn("Error while enabling fast commits, turning off.");
		ext4_clear_feature_fast_commit(sb);
	}
}

/*
 * Human-readable labels for the ineligibility counters, indexed by the
 * EXT4_FC_REASON_* values reported in ext4_fc_info_show().
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};

/* seq_file show callback: dump fast commit statistics (procfs/sysfs). */
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

/* Create the slab cache for in-memory dentry update records. */
int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}