1 /* 2 * linux/fs/jbd2/checkpoint.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 5 * 6 * Copyright 1999 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Checkpoint routines for the generic filesystem journaling code. 13 * Part of the ext2fs journaling system. 14 * 15 * Checkpointing is the process of ensuring that a section of the log is 16 * committed fully to disk, so that that portion of the log can be 17 * reused. 18 */ 19 20 #include <linux/time.h> 21 #include <linux/fs.h> 22 #include <linux/jbd2.h> 23 #include <linux/errno.h> 24 #include <linux/slab.h> 25 #include <linux/blkdev.h> 26 #include <trace/events/jbd2.h> 27 28 /* 29 * Unlink a buffer from a transaction checkpoint list. 30 * 31 * Called with j_list_lock held. 32 */ 33 static inline void __buffer_unlink_first(struct journal_head *jh) 34 { 35 transaction_t *transaction = jh->b_cp_transaction; 36 37 jh->b_cpnext->b_cpprev = jh->b_cpprev; 38 jh->b_cpprev->b_cpnext = jh->b_cpnext; 39 if (transaction->t_checkpoint_list == jh) { 40 transaction->t_checkpoint_list = jh->b_cpnext; 41 if (transaction->t_checkpoint_list == jh) 42 transaction->t_checkpoint_list = NULL; 43 } 44 } 45 46 /* 47 * Unlink a buffer from a transaction checkpoint(io) list. 48 * 49 * Called with j_list_lock held. 50 */ 51 static inline void __buffer_unlink(struct journal_head *jh) 52 { 53 transaction_t *transaction = jh->b_cp_transaction; 54 55 __buffer_unlink_first(jh); 56 if (transaction->t_checkpoint_io_list == jh) { 57 transaction->t_checkpoint_io_list = jh->b_cpnext; 58 if (transaction->t_checkpoint_io_list == jh) 59 transaction->t_checkpoint_io_list = NULL; 60 } 61 } 62 63 /* 64 * Move a buffer from the checkpoint list to the checkpoint io list 65 * 66 * Called with j_list_lock held 67 */ 68 static inline void __buffer_relink_io(struct journal_head *jh) 69 { 70 transaction_t *transaction = jh->b_cp_transaction; 71 72 __buffer_unlink_first(jh); 73 74 if (!transaction->t_checkpoint_io_list) { 75 jh->b_cpnext = jh->b_cpprev = jh; 76 } else { 77 jh->b_cpnext = transaction->t_checkpoint_io_list; 78 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 79 jh->b_cpprev->b_cpnext = jh; 80 jh->b_cpnext->b_cpprev = jh; 81 } 82 transaction->t_checkpoint_io_list = jh; 83 } 84 85 /* 86 * Try to release a checkpointed buffer from its transaction. 87 * Returns 1 if we released it and 2 if we also released the 88 * whole transaction. 89 * 90 * Requires j_list_lock 91 */ 92 static int __try_to_free_cp_buf(struct journal_head *jh) 93 { 94 int ret = 0; 95 struct buffer_head *bh = jh2bh(jh); 96 97 if (jh->b_transaction == NULL && !buffer_locked(bh) && 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 99 /* 100 * Get our reference so that bh cannot be freed before 101 * we unlock it 102 */ 103 get_bh(bh); 104 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 106 BUFFER_TRACE(bh, "release"); 107 __brelse(bh); 108 } 109 return ret; 110 } 111 112 /* 113 * __jbd2_log_wait_for_space: wait until there is space in the journal. 114 * 115 * Called under j-state_lock *only*. It will be unlocked if we have to wait 116 * for a checkpoint to free up some space in the log. 117 */ 118 void __jbd2_log_wait_for_space(journal_t *journal) 119 { 120 int nblocks, space_left; 121 /* assert_spin_locked(&journal->j_state_lock); */ 122 123 nblocks = jbd_space_needed(journal); 124 while (__jbd2_log_space_left(journal) < nblocks) { 125 if (journal->j_flags & JBD2_ABORT) 126 return; 127 write_unlock(&journal->j_state_lock); 128 mutex_lock(&journal->j_checkpoint_mutex); 129 130 /* 131 * Test again, another process may have checkpointed while we 132 * were waiting for the checkpoint lock. If there are no 133 * transactions ready to be checkpointed, try to recover 134 * journal space by calling cleanup_journal_tail(), and if 135 * that doesn't work, by waiting for the currently committing 136 * transaction to complete. If there is absolutely no way 137 * to make progress, this is either a BUG or corrupted 138 * filesystem, so abort the journal and leave a stack 139 * trace for forensic evidence. 140 */ 141 write_lock(&journal->j_state_lock); 142 spin_lock(&journal->j_list_lock); 143 nblocks = jbd_space_needed(journal); 144 space_left = __jbd2_log_space_left(journal); 145 if (space_left < nblocks) { 146 int chkpt = journal->j_checkpoint_transactions != NULL; 147 tid_t tid = 0; 148 149 if (journal->j_committing_transaction) 150 tid = journal->j_committing_transaction->t_tid; 151 spin_unlock(&journal->j_list_lock); 152 write_unlock(&journal->j_state_lock); 153 if (chkpt) { 154 jbd2_log_do_checkpoint(journal); 155 } else if (jbd2_cleanup_journal_tail(journal) == 0) { 156 /* We were able to recover space; yay! */ 157 ; 158 } else if (tid) { 159 jbd2_log_wait_commit(journal, tid); 160 } else { 161 printk(KERN_ERR "%s: needed %d blocks and " 162 "only had %d space available\n", 163 __func__, nblocks, space_left); 164 printk(KERN_ERR "%s: no way to get more " 165 "journal space in %s\n", __func__, 166 journal->j_devname); 167 WARN_ON(1); 168 jbd2_journal_abort(journal, 0); 169 } 170 write_lock(&journal->j_state_lock); 171 } else { 172 spin_unlock(&journal->j_list_lock); 173 } 174 mutex_unlock(&journal->j_checkpoint_mutex); 175 } 176 } 177 178 /* 179 * Clean up transaction's list of buffers submitted for io. 180 * We wait for any pending IO to complete and remove any clean 181 * buffers. Note that we take the buffers in the opposite ordering 182 * from the one in which they were submitted for IO. 183 * 184 * Return 0 on success, and return <0 if some buffers have failed 185 * to be written out. 186 * 187 * Called with j_list_lock held. 188 */ 189 static int __wait_cp_io(journal_t *journal, transaction_t *transaction) 190 { 191 struct journal_head *jh; 192 struct buffer_head *bh; 193 tid_t this_tid; 194 int released = 0; 195 int ret = 0; 196 197 this_tid = transaction->t_tid; 198 restart: 199 /* Did somebody clean up the transaction in the meanwhile? */ 200 if (journal->j_checkpoint_transactions != transaction || 201 transaction->t_tid != this_tid) 202 return ret; 203 while (!released && transaction->t_checkpoint_io_list) { 204 jh = transaction->t_checkpoint_io_list; 205 bh = jh2bh(jh); 206 get_bh(bh); 207 if (buffer_locked(bh)) { 208 spin_unlock(&journal->j_list_lock); 209 wait_on_buffer(bh); 210 /* the journal_head may have gone by now */ 211 BUFFER_TRACE(bh, "brelse"); 212 __brelse(bh); 213 spin_lock(&journal->j_list_lock); 214 goto restart; 215 } 216 if (unlikely(buffer_write_io_error(bh))) 217 ret = -EIO; 218 219 /* 220 * Now in whatever state the buffer currently is, we know that 221 * it has been written out and so we can drop it from the list 222 */ 223 released = __jbd2_journal_remove_checkpoint(jh); 224 __brelse(bh); 225 } 226 227 return ret; 228 } 229 230 static void 231 __flush_batch(journal_t *journal, int *batch_count) 232 { 233 int i; 234 struct blk_plug plug; 235 236 blk_start_plug(&plug); 237 for (i = 0; i < *batch_count; i++) 238 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); 239 blk_finish_plug(&plug); 240 241 for (i = 0; i < *batch_count; i++) { 242 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 243 BUFFER_TRACE(bh, "brelse"); 244 __brelse(bh); 245 } 246 *batch_count = 0; 247 } 248 249 /* 250 * Try to flush one buffer from the checkpoint list to disk. 251 * 252 * Return 1 if something happened which requires us to abort the current 253 * scan of the checkpoint list. Return <0 if the buffer has failed to 254 * be written out. 255 * 256 * Called with j_list_lock held and drops it if 1 is returned 257 */ 258 static int __process_buffer(journal_t *journal, struct journal_head *jh, 259 int *batch_count, transaction_t *transaction) 260 { 261 struct buffer_head *bh = jh2bh(jh); 262 int ret = 0; 263 264 if (buffer_locked(bh)) { 265 get_bh(bh); 266 spin_unlock(&journal->j_list_lock); 267 wait_on_buffer(bh); 268 /* the journal_head may have gone by now */ 269 BUFFER_TRACE(bh, "brelse"); 270 __brelse(bh); 271 ret = 1; 272 } else if (jh->b_transaction != NULL) { 273 transaction_t *t = jh->b_transaction; 274 tid_t tid = t->t_tid; 275 276 transaction->t_chp_stats.cs_forced_to_close++; 277 spin_unlock(&journal->j_list_lock); 278 if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 279 /* 280 * The journal thread is dead; so starting and 281 * waiting for a commit to finish will cause 282 * us to wait for a _very_ long time. 283 */ 284 printk(KERN_ERR "JBD2: %s: " 285 "Waiting for Godot: block %llu\n", 286 journal->j_devname, 287 (unsigned long long) bh->b_blocknr); 288 jbd2_log_start_commit(journal, tid); 289 jbd2_log_wait_commit(journal, tid); 290 ret = 1; 291 } else if (!buffer_dirty(bh)) { 292 ret = 1; 293 if (unlikely(buffer_write_io_error(bh))) 294 ret = -EIO; 295 get_bh(bh); 296 BUFFER_TRACE(bh, "remove from checkpoint"); 297 __jbd2_journal_remove_checkpoint(jh); 298 spin_unlock(&journal->j_list_lock); 299 __brelse(bh); 300 } else { 301 /* 302 * Important: we are about to write the buffer, and 303 * possibly block, while still holding the journal lock. 304 * We cannot afford to let the transaction logic start 305 * messing around with this buffer before we write it to 306 * disk, as that would break recoverability. 307 */ 308 BUFFER_TRACE(bh, "queue"); 309 get_bh(bh); 310 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 311 journal->j_chkpt_bhs[*batch_count] = bh; 312 __buffer_relink_io(jh); 313 transaction->t_chp_stats.cs_written++; 314 (*batch_count)++; 315 if (*batch_count == JBD2_NR_BATCH) { 316 spin_unlock(&journal->j_list_lock); 317 __flush_batch(journal, batch_count); 318 ret = 1; 319 } 320 } 321 return ret; 322 } 323 324 /* 325 * Perform an actual checkpoint. We take the first transaction on the 326 * list of transactions to be checkpointed and send all its buffers 327 * to disk. We submit larger chunks of data at once. 328 * 329 * The journal should be locked before calling this function. 330 * Called with j_checkpoint_mutex held. 331 */ 332 int jbd2_log_do_checkpoint(journal_t *journal) 333 { 334 transaction_t *transaction; 335 tid_t this_tid; 336 int result; 337 338 jbd_debug(1, "Start checkpoint\n"); 339 340 /* 341 * First thing: if there are any transactions in the log which 342 * don't need checkpointing, just eliminate them from the 343 * journal straight away. 344 */ 345 result = jbd2_cleanup_journal_tail(journal); 346 trace_jbd2_checkpoint(journal, result); 347 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 348 if (result <= 0) 349 return result; 350 351 /* 352 * OK, we need to start writing disk blocks. Take one transaction 353 * and write it. 354 */ 355 result = 0; 356 spin_lock(&journal->j_list_lock); 357 if (!journal->j_checkpoint_transactions) 358 goto out; 359 transaction = journal->j_checkpoint_transactions; 360 if (transaction->t_chp_stats.cs_chp_time == 0) 361 transaction->t_chp_stats.cs_chp_time = jiffies; 362 this_tid = transaction->t_tid; 363 restart: 364 /* 365 * If someone cleaned up this transaction while we slept, we're 366 * done (maybe it's a new transaction, but it fell at the same 367 * address). 368 */ 369 if (journal->j_checkpoint_transactions == transaction && 370 transaction->t_tid == this_tid) { 371 int batch_count = 0; 372 struct journal_head *jh; 373 int retry = 0, err; 374 375 while (!retry && transaction->t_checkpoint_list) { 376 jh = transaction->t_checkpoint_list; 377 retry = __process_buffer(journal, jh, &batch_count, 378 transaction); 379 if (retry < 0 && !result) 380 result = retry; 381 if (!retry && (need_resched() || 382 spin_needbreak(&journal->j_list_lock))) { 383 spin_unlock(&journal->j_list_lock); 384 retry = 1; 385 break; 386 } 387 } 388 389 if (batch_count) { 390 if (!retry) { 391 spin_unlock(&journal->j_list_lock); 392 retry = 1; 393 } 394 __flush_batch(journal, &batch_count); 395 } 396 397 if (retry) { 398 spin_lock(&journal->j_list_lock); 399 goto restart; 400 } 401 /* 402 * Now we have cleaned up the first transaction's checkpoint 403 * list. Let's clean up the second one 404 */ 405 err = __wait_cp_io(journal, transaction); 406 if (!result) 407 result = err; 408 } 409 out: 410 spin_unlock(&journal->j_list_lock); 411 if (result < 0) 412 jbd2_journal_abort(journal, result); 413 else 414 result = jbd2_cleanup_journal_tail(journal); 415 416 return (result < 0) ? result : 0; 417 } 418 419 /* 420 * Check the list of checkpoint transactions for the journal to see if 421 * we have already got rid of any since the last update of the log tail 422 * in the journal superblock. If so, we can instantly roll the 423 * superblock forward to remove those transactions from the log. 424 * 425 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 426 * 427 * Called with the journal lock held. 428 * 429 * This is the only part of the journaling code which really needs to be 430 * aware of transaction aborts. Checkpointing involves writing to the 431 * main filesystem area rather than to the journal, so it can proceed 432 * even in abort state, but we must not update the super block if 433 * checkpointing may have failed. Otherwise, we would lose some metadata 434 * buffers which should be written-back to the filesystem. 435 */ 436 437 int jbd2_cleanup_journal_tail(journal_t *journal) 438 { 439 tid_t first_tid; 440 unsigned long blocknr; 441 442 if (is_journal_aborted(journal)) 443 return 1; 444 445 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) 446 return 1; 447 J_ASSERT(blocknr != 0); 448 449 /* 450 * We need to make sure that any blocks that were recently written out 451 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before 452 * we drop the transactions from the journal. It's unlikely this will 453 * be necessary, especially with an appropriately sized journal, but we 454 * need this to guarantee correctness. Fortunately 455 * jbd2_cleanup_journal_tail() doesn't get called all that often. 456 */ 457 if (journal->j_flags & JBD2_BARRIER) 458 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 459 460 __jbd2_update_log_tail(journal, first_tid, blocknr); 461 return 0; 462 } 463 464 465 /* Checkpoint list management */ 466 467 /* 468 * journal_clean_one_cp_list 469 * 470 * Find all the written-back checkpoint buffers in the given list and 471 * release them. 472 * 473 * Called with the journal locked. 474 * Called with j_list_lock held. 475 * Returns number of buffers reaped (for debug) 476 */ 477 478 static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 479 { 480 struct journal_head *last_jh; 481 struct journal_head *next_jh = jh; 482 int ret, freed = 0; 483 484 *released = 0; 485 if (!jh) 486 return 0; 487 488 last_jh = jh->b_cpprev; 489 do { 490 jh = next_jh; 491 next_jh = jh->b_cpnext; 492 ret = __try_to_free_cp_buf(jh); 493 if (ret) { 494 freed++; 495 if (ret == 2) { 496 *released = 1; 497 return freed; 498 } 499 } 500 /* 501 * This function only frees up some memory 502 * if possible so we dont have an obligation 503 * to finish processing. Bail out if preemption 504 * requested: 505 */ 506 if (need_resched()) 507 return freed; 508 } while (jh != last_jh); 509 510 return freed; 511 } 512 513 /* 514 * journal_clean_checkpoint_list 515 * 516 * Find all the written-back checkpoint buffers in the journal and release them. 517 * 518 * Called with the journal locked. 519 * Called with j_list_lock held. 520 * Returns number of buffers reaped (for debug) 521 */ 522 523 int __jbd2_journal_clean_checkpoint_list(journal_t *journal) 524 { 525 transaction_t *transaction, *last_transaction, *next_transaction; 526 int ret = 0; 527 int released; 528 529 transaction = journal->j_checkpoint_transactions; 530 if (!transaction) 531 goto out; 532 533 last_transaction = transaction->t_cpprev; 534 next_transaction = transaction; 535 do { 536 transaction = next_transaction; 537 next_transaction = transaction->t_cpnext; 538 ret += journal_clean_one_cp_list(transaction-> 539 t_checkpoint_list, &released); 540 /* 541 * This function only frees up some memory if possible so we 542 * dont have an obligation to finish processing. Bail out if 543 * preemption requested: 544 */ 545 if (need_resched()) 546 goto out; 547 if (released) 548 continue; 549 /* 550 * It is essential that we are as careful as in the case of 551 * t_checkpoint_list with removing the buffer from the list as 552 * we can possibly see not yet submitted buffers on io_list 553 */ 554 ret += journal_clean_one_cp_list(transaction-> 555 t_checkpoint_io_list, &released); 556 if (need_resched()) 557 goto out; 558 } while (transaction != last_transaction); 559 out: 560 return ret; 561 } 562 563 /* 564 * journal_remove_checkpoint: called after a buffer has been committed 565 * to disk (either by being write-back flushed to disk, or being 566 * committed to the log). 567 * 568 * We cannot safely clean a transaction out of the log until all of the 569 * buffer updates committed in that transaction have safely been stored 570 * elsewhere on disk. To achieve this, all of the buffers in a 571 * transaction need to be maintained on the transaction's checkpoint 572 * lists until they have been rewritten, at which point this function is 573 * called to remove the buffer from the existing transaction's 574 * checkpoint lists. 575 * 576 * The function returns 1 if it frees the transaction, 0 otherwise. 577 * The function can free jh and bh. 578 * 579 * This function is called with j_list_lock held. 580 */ 581 int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 582 { 583 struct transaction_chp_stats_s *stats; 584 transaction_t *transaction; 585 journal_t *journal; 586 int ret = 0; 587 588 JBUFFER_TRACE(jh, "entry"); 589 590 if ((transaction = jh->b_cp_transaction) == NULL) { 591 JBUFFER_TRACE(jh, "not on transaction"); 592 goto out; 593 } 594 journal = transaction->t_journal; 595 596 JBUFFER_TRACE(jh, "removing from transaction"); 597 __buffer_unlink(jh); 598 jh->b_cp_transaction = NULL; 599 jbd2_journal_put_journal_head(jh); 600 601 if (transaction->t_checkpoint_list != NULL || 602 transaction->t_checkpoint_io_list != NULL) 603 goto out; 604 605 /* 606 * There is one special case to worry about: if we have just pulled the 607 * buffer off a running or committing transaction's checkpoing list, 608 * then even if the checkpoint list is empty, the transaction obviously 609 * cannot be dropped! 610 * 611 * The locking here around t_state is a bit sleazy. 612 * See the comment at the end of jbd2_journal_commit_transaction(). 613 */ 614 if (transaction->t_state != T_FINISHED) 615 goto out; 616 617 /* OK, that was the last buffer for the transaction: we can now 618 safely remove this transaction from the log */ 619 stats = &transaction->t_chp_stats; 620 if (stats->cs_chp_time) 621 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, 622 jiffies); 623 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, 624 transaction->t_tid, stats); 625 626 __jbd2_journal_drop_transaction(journal, transaction); 627 jbd2_journal_free_transaction(transaction); 628 629 /* Just in case anybody was waiting for more transactions to be 630 checkpointed... */ 631 wake_up(&journal->j_wait_logspace); 632 ret = 1; 633 out: 634 return ret; 635 } 636 637 /* 638 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 639 * list so that we know when it is safe to clean the transaction out of 640 * the log. 641 * 642 * Called with the journal locked. 643 * Called with j_list_lock held. 644 */ 645 void __jbd2_journal_insert_checkpoint(struct journal_head *jh, 646 transaction_t *transaction) 647 { 648 JBUFFER_TRACE(jh, "entry"); 649 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 650 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 651 652 /* Get reference for checkpointing transaction */ 653 jbd2_journal_grab_journal_head(jh2bh(jh)); 654 jh->b_cp_transaction = transaction; 655 656 if (!transaction->t_checkpoint_list) { 657 jh->b_cpnext = jh->b_cpprev = jh; 658 } else { 659 jh->b_cpnext = transaction->t_checkpoint_list; 660 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 661 jh->b_cpprev->b_cpnext = jh; 662 jh->b_cpnext->b_cpprev = jh; 663 } 664 transaction->t_checkpoint_list = jh; 665 } 666 667 /* 668 * We've finished with this transaction structure: adios... 669 * 670 * The transaction must have no links except for the checkpoint by this 671 * point. 672 * 673 * Called with the journal locked. 674 * Called with j_list_lock held. 675 */ 676 677 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 678 { 679 assert_spin_locked(&journal->j_list_lock); 680 if (transaction->t_cpnext) { 681 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 682 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 683 if (journal->j_checkpoint_transactions == transaction) 684 journal->j_checkpoint_transactions = 685 transaction->t_cpnext; 686 if (journal->j_checkpoint_transactions == transaction) 687 journal->j_checkpoint_transactions = NULL; 688 } 689 690 J_ASSERT(transaction->t_state == T_FINISHED); 691 J_ASSERT(transaction->t_buffers == NULL); 692 J_ASSERT(transaction->t_forget == NULL); 693 J_ASSERT(transaction->t_iobuf_list == NULL); 694 J_ASSERT(transaction->t_shadow_list == NULL); 695 J_ASSERT(transaction->t_log_list == NULL); 696 J_ASSERT(transaction->t_checkpoint_list == NULL); 697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 698 J_ASSERT(atomic_read(&transaction->t_updates) == 0); 699 J_ASSERT(journal->j_committing_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction); 701 702 trace_jbd2_drop_transaction(journal, transaction); 703 704 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 705 } 706