1 /* 2 * linux/fs/jbd2/commit.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 5 * 6 * Copyright 1998 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Journal commit routines for the generic filesystem journaling code; 13 * part of the ext2fs journaling system. 14 */ 15 16 #include <linux/time.h> 17 #include <linux/fs.h> 18 #include <linux/jbd2.h> 19 #include <linux/errno.h> 20 #include <linux/slab.h> 21 #include <linux/mm.h> 22 #include <linux/pagemap.h> 23 #include <linux/jiffies.h> 24 #include <linux/crc32.h> 25 #include <linux/writeback.h> 26 #include <linux/backing-dev.h> 27 #include <linux/bio.h> 28 #include <linux/blkdev.h> 29 #include <linux/bitops.h> 30 #include <trace/events/jbd2.h> 31 32 /* 33 * IO end handler for temporary buffer_heads handling writes to the journal. 34 */ 35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 36 { 37 struct buffer_head *orig_bh = bh->b_private; 38 39 BUFFER_TRACE(bh, ""); 40 if (uptodate) 41 set_buffer_uptodate(bh); 42 else 43 clear_buffer_uptodate(bh); 44 if (orig_bh) { 45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state); 46 smp_mb__after_atomic(); 47 wake_up_bit(&orig_bh->b_state, BH_Shadow); 48 } 49 unlock_buffer(bh); 50 } 51 52 /* 53 * When an ext4 file is truncated, it is possible that some pages are not 54 * successfully freed, because they are attached to a committing transaction. 55 * After the transaction commits, these pages are left on the LRU, with no 56 * ->mapping, and with attached buffers. These pages are trivially reclaimable 57 * by the VM, but their apparent absence upsets the VM accounting, and it makes 58 * the numbers in /proc/meminfo look odd. 59 * 60 * So here, we have a buffer which has just come off the forget list. Look to 61 * see if we can strip all buffers from the backing page. 62 * 63 * Called under lock_journal(), and possibly under journal_datalist_lock. The 64 * caller provided us with a ref against the buffer, and we drop that here. 65 */ 66 static void release_buffer_page(struct buffer_head *bh) 67 { 68 struct page *page; 69 70 if (buffer_dirty(bh)) 71 goto nope; 72 if (atomic_read(&bh->b_count) != 1) 73 goto nope; 74 page = bh->b_page; 75 if (!page) 76 goto nope; 77 if (page->mapping) 78 goto nope; 79 80 /* OK, it's a truncated page */ 81 if (!trylock_page(page)) 82 goto nope; 83 84 page_cache_get(page); 85 __brelse(bh); 86 try_to_free_buffers(page); 87 unlock_page(page); 88 page_cache_release(page); 89 return; 90 91 nope: 92 __brelse(bh); 93 } 94 95 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) 96 { 97 struct commit_header *h; 98 __u32 csum; 99 100 if (!jbd2_journal_has_csum_v2or3(j)) 101 return; 102 103 h = (struct commit_header *)(bh->b_data); 104 h->h_chksum_type = 0; 105 h->h_chksum_size = 0; 106 h->h_chksum[0] = 0; 107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); 108 h->h_chksum[0] = cpu_to_be32(csum); 109 } 110 111 /* 112 * Done it all: now submit the commit record. We should have 113 * cleaned up our previous buffers by now, so if we are in abort 114 * mode we can now just skip the rest of the journal write 115 * entirely. 116 * 117 * Returns 1 if the journal needs to be aborted or 0 on success 118 */ 119 static int journal_submit_commit_record(journal_t *journal, 120 transaction_t *commit_transaction, 121 struct buffer_head **cbh, 122 __u32 crc32_sum) 123 { 124 struct commit_header *tmp; 125 struct buffer_head *bh; 126 int ret; 127 struct timespec now = current_kernel_time(); 128 129 *cbh = NULL; 130 131 if (is_journal_aborted(journal)) 132 return 0; 133 134 bh = jbd2_journal_get_descriptor_buffer(journal); 135 if (!bh) 136 return 1; 137 138 tmp = (struct commit_header *)bh->b_data; 139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 144 145 if (jbd2_has_feature_checksum(journal)) { 146 tmp->h_chksum_type = JBD2_CRC32_CHKSUM; 147 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 148 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 149 } 150 jbd2_commit_block_csum_set(journal, bh); 151 152 BUFFER_TRACE(bh, "submit commit block"); 153 lock_buffer(bh); 154 clear_buffer_dirty(bh); 155 set_buffer_uptodate(bh); 156 bh->b_end_io = journal_end_buffer_io_sync; 157 158 if (journal->j_flags & JBD2_BARRIER && 159 !jbd2_has_feature_async_commit(journal)) 160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); 161 else 162 ret = submit_bh(WRITE_SYNC, bh); 163 164 *cbh = bh; 165 return ret; 166 } 167 168 /* 169 * This function along with journal_submit_commit_record 170 * allows to write the commit record asynchronously. 171 */ 172 static int journal_wait_on_commit_record(journal_t *journal, 173 struct buffer_head *bh) 174 { 175 int ret = 0; 176 177 clear_buffer_dirty(bh); 178 wait_on_buffer(bh); 179 180 if (unlikely(!buffer_uptodate(bh))) 181 ret = -EIO; 182 put_bh(bh); /* One for getblk() */ 183 184 return ret; 185 } 186 187 /* 188 * write the filemap data using writepage() address_space_operations. 189 * We don't do block allocation here even for delalloc. We don't 190 * use writepages() because with dealyed allocation we may be doing 191 * block allocation in writepages(). 192 */ 193 static int journal_submit_inode_data_buffers(struct address_space *mapping) 194 { 195 int ret; 196 struct writeback_control wbc = { 197 .sync_mode = WB_SYNC_ALL, 198 .nr_to_write = mapping->nrpages * 2, 199 .range_start = 0, 200 .range_end = i_size_read(mapping->host), 201 }; 202 203 ret = generic_writepages(mapping, &wbc); 204 return ret; 205 } 206 207 /* 208 * Submit all the data buffers of inode associated with the transaction to 209 * disk. 210 * 211 * We are in a committing transaction. Therefore no new inode can be added to 212 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently 213 * operate on from being released while we write out pages. 214 */ 215 static int journal_submit_data_buffers(journal_t *journal, 216 transaction_t *commit_transaction) 217 { 218 struct jbd2_inode *jinode; 219 int err, ret = 0; 220 struct address_space *mapping; 221 222 spin_lock(&journal->j_list_lock); 223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 224 mapping = jinode->i_vfs_inode->i_mapping; 225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 226 spin_unlock(&journal->j_list_lock); 227 /* 228 * submit the inode data buffers. We use writepage 229 * instead of writepages. Because writepages can do 230 * block allocation with delalloc. We need to write 231 * only allocated blocks here. 232 */ 233 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 234 err = journal_submit_inode_data_buffers(mapping); 235 if (!ret) 236 ret = err; 237 spin_lock(&journal->j_list_lock); 238 J_ASSERT(jinode->i_transaction == commit_transaction); 239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 240 smp_mb__after_atomic(); 241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 242 } 243 spin_unlock(&journal->j_list_lock); 244 return ret; 245 } 246 247 /* 248 * Wait for data submitted for writeout, refile inodes to proper 249 * transaction if needed. 250 * 251 */ 252 static int journal_finish_inode_data_buffers(journal_t *journal, 253 transaction_t *commit_transaction) 254 { 255 struct jbd2_inode *jinode, *next_i; 256 int err, ret = 0; 257 258 /* For locking, see the comment in journal_submit_data_buffers() */ 259 spin_lock(&journal->j_list_lock); 260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 262 spin_unlock(&journal->j_list_lock); 263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 264 if (err) { 265 /* 266 * Because AS_EIO is cleared by 267 * filemap_fdatawait_range(), set it again so 268 * that user process can get -EIO from fsync(). 269 */ 270 set_bit(AS_EIO, 271 &jinode->i_vfs_inode->i_mapping->flags); 272 273 if (!ret) 274 ret = err; 275 } 276 spin_lock(&journal->j_list_lock); 277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 278 smp_mb__after_atomic(); 279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 280 } 281 282 /* Now refile inode to proper lists */ 283 list_for_each_entry_safe(jinode, next_i, 284 &commit_transaction->t_inode_list, i_list) { 285 list_del(&jinode->i_list); 286 if (jinode->i_next_transaction) { 287 jinode->i_transaction = jinode->i_next_transaction; 288 jinode->i_next_transaction = NULL; 289 list_add(&jinode->i_list, 290 &jinode->i_transaction->t_inode_list); 291 } else { 292 jinode->i_transaction = NULL; 293 } 294 } 295 spin_unlock(&journal->j_list_lock); 296 297 return ret; 298 } 299 300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 301 { 302 struct page *page = bh->b_page; 303 char *addr; 304 __u32 checksum; 305 306 addr = kmap_atomic(page); 307 checksum = crc32_be(crc32_sum, 308 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); 309 kunmap_atomic(addr); 310 311 return checksum; 312 } 313 314 static void write_tag_block(journal_t *j, journal_block_tag_t *tag, 315 unsigned long long block) 316 { 317 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 318 if (jbd2_has_feature_64bit(j)) 319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 320 } 321 322 static void jbd2_descr_block_csum_set(journal_t *j, 323 struct buffer_head *bh) 324 { 325 struct jbd2_journal_block_tail *tail; 326 __u32 csum; 327 328 if (!jbd2_journal_has_csum_v2or3(j)) 329 return; 330 331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 332 sizeof(struct jbd2_journal_block_tail)); 333 tail->t_checksum = 0; 334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); 335 tail->t_checksum = cpu_to_be32(csum); 336 } 337 338 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 339 struct buffer_head *bh, __u32 sequence) 340 { 341 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; 342 struct page *page = bh->b_page; 343 __u8 *addr; 344 __u32 csum32; 345 __be32 seq; 346 347 if (!jbd2_journal_has_csum_v2or3(j)) 348 return; 349 350 seq = cpu_to_be32(sequence); 351 addr = kmap_atomic(page); 352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 353 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), 354 bh->b_size); 355 kunmap_atomic(addr); 356 357 if (jbd2_has_feature_csum3(j)) 358 tag3->t_checksum = cpu_to_be32(csum32); 359 else 360 tag->t_checksum = cpu_to_be16(csum32); 361 } 362 /* 363 * jbd2_journal_commit_transaction 364 * 365 * The primary function for committing a transaction to the log. This 366 * function is called by the journal thread to begin a complete commit. 367 */ 368 void jbd2_journal_commit_transaction(journal_t *journal) 369 { 370 struct transaction_stats_s stats; 371 transaction_t *commit_transaction; 372 struct journal_head *jh; 373 struct buffer_head *descriptor; 374 struct buffer_head **wbuf = journal->j_wbuf; 375 int bufs; 376 int flags; 377 int err; 378 unsigned long long blocknr; 379 ktime_t start_time; 380 u64 commit_time; 381 char *tagp = NULL; 382 journal_header_t *header; 383 journal_block_tag_t *tag = NULL; 384 int space_left = 0; 385 int first_tag = 0; 386 int tag_flag; 387 int i; 388 int tag_bytes = journal_tag_bytes(journal); 389 struct buffer_head *cbh = NULL; /* For transactional checksums */ 390 __u32 crc32_sum = ~0; 391 struct blk_plug plug; 392 /* Tail of the journal */ 393 unsigned long first_block; 394 tid_t first_tid; 395 int update_tail; 396 int csum_size = 0; 397 LIST_HEAD(io_bufs); 398 LIST_HEAD(log_bufs); 399 400 if (jbd2_journal_has_csum_v2or3(journal)) 401 csum_size = sizeof(struct jbd2_journal_block_tail); 402 403 /* 404 * First job: lock down the current transaction and wait for 405 * all outstanding updates to complete. 406 */ 407 408 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 409 if (journal->j_flags & JBD2_FLUSHED) { 410 jbd_debug(3, "super block updated\n"); 411 mutex_lock(&journal->j_checkpoint_mutex); 412 /* 413 * We hold j_checkpoint_mutex so tail cannot change under us. 414 * We don't need any special data guarantees for writing sb 415 * since journal is empty and it is ok for write to be 416 * flushed only with transaction commit. 417 */ 418 jbd2_journal_update_sb_log_tail(journal, 419 journal->j_tail_sequence, 420 journal->j_tail, 421 WRITE_SYNC); 422 mutex_unlock(&journal->j_checkpoint_mutex); 423 } else { 424 jbd_debug(3, "superblock not updated\n"); 425 } 426 427 J_ASSERT(journal->j_running_transaction != NULL); 428 J_ASSERT(journal->j_committing_transaction == NULL); 429 430 commit_transaction = journal->j_running_transaction; 431 432 trace_jbd2_start_commit(journal, commit_transaction); 433 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 434 commit_transaction->t_tid); 435 436 write_lock(&journal->j_state_lock); 437 J_ASSERT(commit_transaction->t_state == T_RUNNING); 438 commit_transaction->t_state = T_LOCKED; 439 440 trace_jbd2_commit_locking(journal, commit_transaction); 441 stats.run.rs_wait = commit_transaction->t_max_wait; 442 stats.run.rs_request_delay = 0; 443 stats.run.rs_locked = jiffies; 444 if (commit_transaction->t_requested) 445 stats.run.rs_request_delay = 446 jbd2_time_diff(commit_transaction->t_requested, 447 stats.run.rs_locked); 448 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 449 stats.run.rs_locked); 450 451 spin_lock(&commit_transaction->t_handle_lock); 452 while (atomic_read(&commit_transaction->t_updates)) { 453 DEFINE_WAIT(wait); 454 455 prepare_to_wait(&journal->j_wait_updates, &wait, 456 TASK_UNINTERRUPTIBLE); 457 if (atomic_read(&commit_transaction->t_updates)) { 458 spin_unlock(&commit_transaction->t_handle_lock); 459 write_unlock(&journal->j_state_lock); 460 schedule(); 461 write_lock(&journal->j_state_lock); 462 spin_lock(&commit_transaction->t_handle_lock); 463 } 464 finish_wait(&journal->j_wait_updates, &wait); 465 } 466 spin_unlock(&commit_transaction->t_handle_lock); 467 468 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= 469 journal->j_max_transaction_buffers); 470 471 /* 472 * First thing we are allowed to do is to discard any remaining 473 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 474 * that there are no such buffers: if a large filesystem 475 * operation like a truncate needs to split itself over multiple 476 * transactions, then it may try to do a jbd2_journal_restart() while 477 * there are still BJ_Reserved buffers outstanding. These must 478 * be released cleanly from the current transaction. 479 * 480 * In this case, the filesystem must still reserve write access 481 * again before modifying the buffer in the new transaction, but 482 * we do not require it to remember exactly which old buffers it 483 * has reserved. This is consistent with the existing behaviour 484 * that multiple jbd2_journal_get_write_access() calls to the same 485 * buffer are perfectly permissible. 486 */ 487 while (commit_transaction->t_reserved_list) { 488 jh = commit_transaction->t_reserved_list; 489 JBUFFER_TRACE(jh, "reserved, unused: refile"); 490 /* 491 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may 492 * leave undo-committed data. 493 */ 494 if (jh->b_committed_data) { 495 struct buffer_head *bh = jh2bh(jh); 496 497 jbd_lock_bh_state(bh); 498 jbd2_free(jh->b_committed_data, bh->b_size); 499 jh->b_committed_data = NULL; 500 jbd_unlock_bh_state(bh); 501 } 502 jbd2_journal_refile_buffer(journal, jh); 503 } 504 505 /* 506 * Now try to drop any written-back buffers from the journal's 507 * checkpoint lists. We do this *before* commit because it potentially 508 * frees some memory 509 */ 510 spin_lock(&journal->j_list_lock); 511 __jbd2_journal_clean_checkpoint_list(journal, false); 512 spin_unlock(&journal->j_list_lock); 513 514 jbd_debug(3, "JBD2: commit phase 1\n"); 515 516 /* 517 * Clear revoked flag to reflect there is no revoked buffers 518 * in the next transaction which is going to be started. 519 */ 520 jbd2_clear_buffer_revoked_flags(journal); 521 522 /* 523 * Switch to a new revoke table. 524 */ 525 jbd2_journal_switch_revoke_table(journal); 526 527 /* 528 * Reserved credits cannot be claimed anymore, free them 529 */ 530 atomic_sub(atomic_read(&journal->j_reserved_credits), 531 &commit_transaction->t_outstanding_credits); 532 533 trace_jbd2_commit_flushing(journal, commit_transaction); 534 stats.run.rs_flushing = jiffies; 535 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 536 stats.run.rs_flushing); 537 538 commit_transaction->t_state = T_FLUSH; 539 journal->j_committing_transaction = commit_transaction; 540 journal->j_running_transaction = NULL; 541 start_time = ktime_get(); 542 commit_transaction->t_log_start = journal->j_head; 543 wake_up(&journal->j_wait_transaction_locked); 544 write_unlock(&journal->j_state_lock); 545 546 jbd_debug(3, "JBD2: commit phase 2a\n"); 547 548 /* 549 * Now start flushing things to disk, in the order they appear 550 * on the transaction lists. Data blocks go first. 551 */ 552 err = journal_submit_data_buffers(journal, commit_transaction); 553 if (err) 554 jbd2_journal_abort(journal, err); 555 556 blk_start_plug(&plug); 557 jbd2_journal_write_revoke_records(journal, commit_transaction, 558 &log_bufs, WRITE_SYNC); 559 560 jbd_debug(3, "JBD2: commit phase 2b\n"); 561 562 /* 563 * Way to go: we have now written out all of the data for a 564 * transaction! Now comes the tricky part: we need to write out 565 * metadata. Loop over the transaction's entire buffer list: 566 */ 567 write_lock(&journal->j_state_lock); 568 commit_transaction->t_state = T_COMMIT; 569 write_unlock(&journal->j_state_lock); 570 571 trace_jbd2_commit_logging(journal, commit_transaction); 572 stats.run.rs_logging = jiffies; 573 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 574 stats.run.rs_logging); 575 stats.run.rs_blocks = 576 atomic_read(&commit_transaction->t_outstanding_credits); 577 stats.run.rs_blocks_logged = 0; 578 579 J_ASSERT(commit_transaction->t_nr_buffers <= 580 atomic_read(&commit_transaction->t_outstanding_credits)); 581 582 err = 0; 583 bufs = 0; 584 descriptor = NULL; 585 while (commit_transaction->t_buffers) { 586 587 /* Find the next buffer to be journaled... */ 588 589 jh = commit_transaction->t_buffers; 590 591 /* If we're in abort mode, we just un-journal the buffer and 592 release it. */ 593 594 if (is_journal_aborted(journal)) { 595 clear_buffer_jbddirty(jh2bh(jh)); 596 JBUFFER_TRACE(jh, "journal is aborting: refile"); 597 jbd2_buffer_abort_trigger(jh, 598 jh->b_frozen_data ? 599 jh->b_frozen_triggers : 600 jh->b_triggers); 601 jbd2_journal_refile_buffer(journal, jh); 602 /* If that was the last one, we need to clean up 603 * any descriptor buffers which may have been 604 * already allocated, even if we are now 605 * aborting. */ 606 if (!commit_transaction->t_buffers) 607 goto start_journal_io; 608 continue; 609 } 610 611 /* Make sure we have a descriptor block in which to 612 record the metadata buffer. */ 613 614 if (!descriptor) { 615 J_ASSERT (bufs == 0); 616 617 jbd_debug(4, "JBD2: get descriptor\n"); 618 619 descriptor = jbd2_journal_get_descriptor_buffer(journal); 620 if (!descriptor) { 621 jbd2_journal_abort(journal, -EIO); 622 continue; 623 } 624 625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 626 (unsigned long long)descriptor->b_blocknr, 627 descriptor->b_data); 628 header = (journal_header_t *)descriptor->b_data; 629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 632 633 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 634 space_left = descriptor->b_size - 635 sizeof(journal_header_t); 636 first_tag = 1; 637 set_buffer_jwrite(descriptor); 638 set_buffer_dirty(descriptor); 639 wbuf[bufs++] = descriptor; 640 641 /* Record it so that we can wait for IO 642 completion later */ 643 BUFFER_TRACE(descriptor, "ph3: file as descriptor"); 644 jbd2_file_log_bh(&log_bufs, descriptor); 645 } 646 647 /* Where is the buffer to be written? */ 648 649 err = jbd2_journal_next_log_block(journal, &blocknr); 650 /* If the block mapping failed, just abandon the buffer 651 and repeat this loop: we'll fall into the 652 refile-on-abort condition above. */ 653 if (err) { 654 jbd2_journal_abort(journal, err); 655 continue; 656 } 657 658 /* 659 * start_this_handle() uses t_outstanding_credits to determine 660 * the free space in the log, but this counter is changed 661 * by jbd2_journal_next_log_block() also. 662 */ 663 atomic_dec(&commit_transaction->t_outstanding_credits); 664 665 /* Bump b_count to prevent truncate from stumbling over 666 the shadowed buffer! @@@ This can go if we ever get 667 rid of the shadow pairing of buffers. */ 668 atomic_inc(&jh2bh(jh)->b_count); 669 670 /* 671 * Make a temporary IO buffer with which to write it out 672 * (this will requeue the metadata buffer to BJ_Shadow). 673 */ 674 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 675 JBUFFER_TRACE(jh, "ph3: write metadata"); 676 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 677 jh, &wbuf[bufs], blocknr); 678 if (flags < 0) { 679 jbd2_journal_abort(journal, flags); 680 continue; 681 } 682 jbd2_file_log_bh(&io_bufs, wbuf[bufs]); 683 684 /* Record the new block's tag in the current descriptor 685 buffer */ 686 687 tag_flag = 0; 688 if (flags & 1) 689 tag_flag |= JBD2_FLAG_ESCAPE; 690 if (!first_tag) 691 tag_flag |= JBD2_FLAG_SAME_UUID; 692 693 tag = (journal_block_tag_t *) tagp; 694 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); 695 tag->t_flags = cpu_to_be16(tag_flag); 696 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 697 commit_transaction->t_tid); 698 tagp += tag_bytes; 699 space_left -= tag_bytes; 700 bufs++; 701 702 if (first_tag) { 703 memcpy (tagp, journal->j_uuid, 16); 704 tagp += 16; 705 space_left -= 16; 706 first_tag = 0; 707 } 708 709 /* If there's no more to do, or if the descriptor is full, 710 let the IO rip! */ 711 712 if (bufs == journal->j_wbufsize || 713 commit_transaction->t_buffers == NULL || 714 space_left < tag_bytes + 16 + csum_size) { 715 716 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); 717 718 /* Write an end-of-descriptor marker before 719 submitting the IOs. "tag" still points to 720 the last tag we set up. */ 721 722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 723 724 jbd2_descr_block_csum_set(journal, descriptor); 725 start_journal_io: 726 for (i = 0; i < bufs; i++) { 727 struct buffer_head *bh = wbuf[i]; 728 /* 729 * Compute checksum. 730 */ 731 if (jbd2_has_feature_checksum(journal)) { 732 crc32_sum = 733 jbd2_checksum_data(crc32_sum, bh); 734 } 735 736 lock_buffer(bh); 737 clear_buffer_dirty(bh); 738 set_buffer_uptodate(bh); 739 bh->b_end_io = journal_end_buffer_io_sync; 740 submit_bh(WRITE_SYNC, bh); 741 } 742 cond_resched(); 743 stats.run.rs_blocks_logged += bufs; 744 745 /* Force a new descriptor to be generated next 746 time round the loop. */ 747 descriptor = NULL; 748 bufs = 0; 749 } 750 } 751 752 err = journal_finish_inode_data_buffers(journal, commit_transaction); 753 if (err) { 754 printk(KERN_WARNING 755 "JBD2: Detected IO errors while flushing file data " 756 "on %s\n", journal->j_devname); 757 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) 758 jbd2_journal_abort(journal, err); 759 err = 0; 760 } 761 762 /* 763 * Get current oldest transaction in the log before we issue flush 764 * to the filesystem device. After the flush we can be sure that 765 * blocks of all older transactions are checkpointed to persistent 766 * storage and we will be safe to update journal start in the 767 * superblock with the numbers we get here. 768 */ 769 update_tail = 770 jbd2_journal_get_log_tail(journal, &first_tid, &first_block); 771 772 write_lock(&journal->j_state_lock); 773 if (update_tail) { 774 long freed = first_block - journal->j_tail; 775 776 if (first_block < journal->j_tail) 777 freed += journal->j_last - journal->j_first; 778 /* Update tail only if we free significant amount of space */ 779 if (freed < journal->j_maxlen / 4) 780 update_tail = 0; 781 } 782 J_ASSERT(commit_transaction->t_state == T_COMMIT); 783 commit_transaction->t_state = T_COMMIT_DFLUSH; 784 write_unlock(&journal->j_state_lock); 785 786 /* 787 * If the journal is not located on the file system device, 788 * then we must flush the file system device before we issue 789 * the commit record 790 */ 791 if (commit_transaction->t_need_data_flush && 792 (journal->j_fs_dev != journal->j_dev) && 793 (journal->j_flags & JBD2_BARRIER)) 794 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); 795 796 /* Done it all: now write the commit record asynchronously. */ 797 if (jbd2_has_feature_async_commit(journal)) { 798 err = journal_submit_commit_record(journal, commit_transaction, 799 &cbh, crc32_sum); 800 if (err) 801 __jbd2_journal_abort_hard(journal); 802 } 803 804 blk_finish_plug(&plug); 805 806 /* Lo and behold: we have just managed to send a transaction to 807 the log. Before we can commit it, wait for the IO so far to 808 complete. Control buffers being written are on the 809 transaction's t_log_list queue, and metadata buffers are on 810 the io_bufs list. 811 812 Wait for the buffers in reverse order. That way we are 813 less likely to be woken up until all IOs have completed, and 814 so we incur less scheduling load. 815 */ 816 817 jbd_debug(3, "JBD2: commit phase 3\n"); 818 819 while (!list_empty(&io_bufs)) { 820 struct buffer_head *bh = list_entry(io_bufs.prev, 821 struct buffer_head, 822 b_assoc_buffers); 823 824 wait_on_buffer(bh); 825 cond_resched(); 826 827 if (unlikely(!buffer_uptodate(bh))) 828 err = -EIO; 829 jbd2_unfile_log_bh(bh); 830 831 /* 832 * The list contains temporary buffer heads created by 833 * jbd2_journal_write_metadata_buffer(). 834 */ 835 BUFFER_TRACE(bh, "dumping temporary bh"); 836 __brelse(bh); 837 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 838 free_buffer_head(bh); 839 840 /* We also have to refile the corresponding shadowed buffer */ 841 jh = commit_transaction->t_shadow_list->b_tprev; 842 bh = jh2bh(jh); 843 clear_buffer_jwrite(bh); 844 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 845 J_ASSERT_BH(bh, !buffer_shadow(bh)); 846 847 /* The metadata is now released for reuse, but we need 848 to remember it against this transaction so that when 849 we finally commit, we can do any checkpointing 850 required. */ 851 JBUFFER_TRACE(jh, "file as BJ_Forget"); 852 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 853 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 854 __brelse(bh); 855 } 856 857 J_ASSERT (commit_transaction->t_shadow_list == NULL); 858 859 jbd_debug(3, "JBD2: commit phase 4\n"); 860 861 /* Here we wait for the revoke record and descriptor record buffers */ 862 while (!list_empty(&log_bufs)) { 863 struct buffer_head *bh; 864 865 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); 866 wait_on_buffer(bh); 867 cond_resched(); 868 869 if (unlikely(!buffer_uptodate(bh))) 870 err = -EIO; 871 872 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 873 clear_buffer_jwrite(bh); 874 jbd2_unfile_log_bh(bh); 875 __brelse(bh); /* One for getblk */ 876 /* AKPM: bforget here */ 877 } 878 879 if (err) 880 jbd2_journal_abort(journal, err); 881 882 jbd_debug(3, "JBD2: commit phase 5\n"); 883 write_lock(&journal->j_state_lock); 884 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 885 commit_transaction->t_state = T_COMMIT_JFLUSH; 886 write_unlock(&journal->j_state_lock); 887 888 if (!jbd2_has_feature_async_commit(journal)) { 889 err = journal_submit_commit_record(journal, commit_transaction, 890 &cbh, crc32_sum); 891 if (err) 892 __jbd2_journal_abort_hard(journal); 893 } 894 if (cbh) 895 err = journal_wait_on_commit_record(journal, cbh); 896 if (jbd2_has_feature_async_commit(journal) && 897 journal->j_flags & JBD2_BARRIER) { 898 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); 899 } 900 901 if (err) 902 jbd2_journal_abort(journal, err); 903 904 /* 905 * Now disk caches for filesystem device are flushed so we are safe to 906 * erase checkpointed transactions from the log by updating journal 907 * superblock. 908 */ 909 if (update_tail) 910 jbd2_update_log_tail(journal, first_tid, first_block); 911 912 /* End of a transaction! Finally, we can do checkpoint 913 processing: any buffers committed as a result of this 914 transaction can be removed from any checkpoint list it was on 915 before. */ 916 917 jbd_debug(3, "JBD2: commit phase 6\n"); 918 919 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 920 J_ASSERT(commit_transaction->t_buffers == NULL); 921 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 922 J_ASSERT(commit_transaction->t_shadow_list == NULL); 923 924 restart_loop: 925 /* 926 * As there are other places (journal_unmap_buffer()) adding buffers 927 * to this list we have to be careful and hold the j_list_lock. 928 */ 929 spin_lock(&journal->j_list_lock); 930 while (commit_transaction->t_forget) { 931 transaction_t *cp_transaction; 932 struct buffer_head *bh; 933 int try_to_free = 0; 934 935 jh = commit_transaction->t_forget; 936 spin_unlock(&journal->j_list_lock); 937 bh = jh2bh(jh); 938 /* 939 * Get a reference so that bh cannot be freed before we are 940 * done with it. 941 */ 942 get_bh(bh); 943 jbd_lock_bh_state(bh); 944 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); 945 946 /* 947 * If there is undo-protected committed data against 948 * this buffer, then we can remove it now. If it is a 949 * buffer needing such protection, the old frozen_data 950 * field now points to a committed version of the 951 * buffer, so rotate that field to the new committed 952 * data. 953 * 954 * Otherwise, we can just throw away the frozen data now. 955 * 956 * We also know that the frozen data has already fired 957 * its triggers if they exist, so we can clear that too. 958 */ 959 if (jh->b_committed_data) { 960 jbd2_free(jh->b_committed_data, bh->b_size); 961 jh->b_committed_data = NULL; 962 if (jh->b_frozen_data) { 963 jh->b_committed_data = jh->b_frozen_data; 964 jh->b_frozen_data = NULL; 965 jh->b_frozen_triggers = NULL; 966 } 967 } else if (jh->b_frozen_data) { 968 jbd2_free(jh->b_frozen_data, bh->b_size); 969 jh->b_frozen_data = NULL; 970 jh->b_frozen_triggers = NULL; 971 } 972 973 spin_lock(&journal->j_list_lock); 974 cp_transaction = jh->b_cp_transaction; 975 if (cp_transaction) { 976 JBUFFER_TRACE(jh, "remove from old cp transaction"); 977 cp_transaction->t_chp_stats.cs_dropped++; 978 __jbd2_journal_remove_checkpoint(jh); 979 } 980 981 /* Only re-checkpoint the buffer_head if it is marked 982 * dirty. If the buffer was added to the BJ_Forget list 983 * by jbd2_journal_forget, it may no longer be dirty and 984 * there's no point in keeping a checkpoint record for 985 * it. */ 986 987 /* 988 * A buffer which has been freed while still being journaled by 989 * a previous transaction. 990 */ 991 if (buffer_freed(bh)) { 992 /* 993 * If the running transaction is the one containing 994 * "add to orphan" operation (b_next_transaction != 995 * NULL), we have to wait for that transaction to 996 * commit before we can really get rid of the buffer. 997 * So just clear b_modified to not confuse transaction 998 * credit accounting and refile the buffer to 999 * BJ_Forget of the running transaction. If the just 1000 * committed transaction contains "add to orphan" 1001 * operation, we can completely invalidate the buffer 1002 * now. We are rather through in that since the 1003 * buffer may be still accessible when blocksize < 1004 * pagesize and it is attached to the last partial 1005 * page. 1006 */ 1007 jh->b_modified = 0; 1008 if (!jh->b_next_transaction) { 1009 clear_buffer_freed(bh); 1010 clear_buffer_jbddirty(bh); 1011 clear_buffer_mapped(bh); 1012 clear_buffer_new(bh); 1013 clear_buffer_req(bh); 1014 bh->b_bdev = NULL; 1015 } 1016 } 1017 1018 if (buffer_jbddirty(bh)) { 1019 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 1020 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 1021 if (is_journal_aborted(journal)) 1022 clear_buffer_jbddirty(bh); 1023 } else { 1024 J_ASSERT_BH(bh, !buffer_dirty(bh)); 1025 /* 1026 * The buffer on BJ_Forget list and not jbddirty means 1027 * it has been freed by this transaction and hence it 1028 * could not have been reallocated until this 1029 * transaction has committed. *BUT* it could be 1030 * reallocated once we have written all the data to 1031 * disk and before we process the buffer on BJ_Forget 1032 * list. 1033 */ 1034 if (!jh->b_next_transaction) 1035 try_to_free = 1; 1036 } 1037 JBUFFER_TRACE(jh, "refile or unfile buffer"); 1038 __jbd2_journal_refile_buffer(jh); 1039 jbd_unlock_bh_state(bh); 1040 if (try_to_free) 1041 release_buffer_page(bh); /* Drops bh reference */ 1042 else 1043 __brelse(bh); 1044 cond_resched_lock(&journal->j_list_lock); 1045 } 1046 spin_unlock(&journal->j_list_lock); 1047 /* 1048 * This is a bit sleazy. We use j_list_lock to protect transition 1049 * of a transaction into T_FINISHED state and calling 1050 * __jbd2_journal_drop_transaction(). Otherwise we could race with 1051 * other checkpointing code processing the transaction... 1052 */ 1053 write_lock(&journal->j_state_lock); 1054 spin_lock(&journal->j_list_lock); 1055 /* 1056 * Now recheck if some buffers did not get attached to the transaction 1057 * while the lock was dropped... 1058 */ 1059 if (commit_transaction->t_forget) { 1060 spin_unlock(&journal->j_list_lock); 1061 write_unlock(&journal->j_state_lock); 1062 goto restart_loop; 1063 } 1064 1065 /* Add the transaction to the checkpoint list 1066 * __journal_remove_checkpoint() can not destroy transaction 1067 * under us because it is not marked as T_FINISHED yet */ 1068 if (journal->j_checkpoint_transactions == NULL) { 1069 journal->j_checkpoint_transactions = commit_transaction; 1070 commit_transaction->t_cpnext = commit_transaction; 1071 commit_transaction->t_cpprev = commit_transaction; 1072 } else { 1073 commit_transaction->t_cpnext = 1074 journal->j_checkpoint_transactions; 1075 commit_transaction->t_cpprev = 1076 commit_transaction->t_cpnext->t_cpprev; 1077 commit_transaction->t_cpnext->t_cpprev = 1078 commit_transaction; 1079 commit_transaction->t_cpprev->t_cpnext = 1080 commit_transaction; 1081 } 1082 spin_unlock(&journal->j_list_lock); 1083 1084 /* Done with this transaction! */ 1085 1086 jbd_debug(3, "JBD2: commit phase 7\n"); 1087 1088 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 1089 1090 commit_transaction->t_start = jiffies; 1091 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 1092 commit_transaction->t_start); 1093 1094 /* 1095 * File the transaction statistics 1096 */ 1097 stats.ts_tid = commit_transaction->t_tid; 1098 stats.run.rs_handle_count = 1099 atomic_read(&commit_transaction->t_handle_count); 1100 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1101 commit_transaction->t_tid, &stats.run); 1102 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; 1103 1104 commit_transaction->t_state = T_COMMIT_CALLBACK; 1105 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1106 journal->j_commit_sequence = commit_transaction->t_tid; 1107 journal->j_committing_transaction = NULL; 1108 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1109 1110 /* 1111 * weight the commit time higher than the average time so we don't 1112 * react too strongly to vast changes in the commit time 1113 */ 1114 if (likely(journal->j_average_commit_time)) 1115 journal->j_average_commit_time = (commit_time + 1116 journal->j_average_commit_time*3) / 4; 1117 else 1118 journal->j_average_commit_time = commit_time; 1119 1120 write_unlock(&journal->j_state_lock); 1121 1122 if (journal->j_commit_callback) 1123 journal->j_commit_callback(journal, commit_transaction); 1124 1125 trace_jbd2_end_commit(journal, commit_transaction); 1126 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1127 journal->j_commit_sequence, journal->j_tail_sequence); 1128 1129 write_lock(&journal->j_state_lock); 1130 spin_lock(&journal->j_list_lock); 1131 commit_transaction->t_state = T_FINISHED; 1132 /* Check if the transaction can be dropped now that we are finished */ 1133 if (commit_transaction->t_checkpoint_list == NULL && 1134 commit_transaction->t_checkpoint_io_list == NULL) { 1135 __jbd2_journal_drop_transaction(journal, commit_transaction); 1136 jbd2_journal_free_transaction(commit_transaction); 1137 } 1138 spin_unlock(&journal->j_list_lock); 1139 write_unlock(&journal->j_state_lock); 1140 wake_up(&journal->j_wait_done_commit); 1141 1142 /* 1143 * Calculate overall stats 1144 */ 1145 spin_lock(&journal->j_history_lock); 1146 journal->j_stats.ts_tid++; 1147 journal->j_stats.ts_requested += stats.ts_requested; 1148 journal->j_stats.run.rs_wait += stats.run.rs_wait; 1149 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1150 journal->j_stats.run.rs_running += stats.run.rs_running; 1151 journal->j_stats.run.rs_locked += stats.run.rs_locked; 1152 journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1153 journal->j_stats.run.rs_logging += stats.run.rs_logging; 1154 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1155 journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1156 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1157 spin_unlock(&journal->j_history_lock); 1158 } 1159