1 /* 2 * linux/fs/jbd2/checkpoint.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 5 * 6 * Copyright 1999 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Checkpoint routines for the generic filesystem journaling code. 13 * Part of the ext2fs journaling system. 14 * 15 * Checkpointing is the process of ensuring that a section of the log is 16 * committed fully to disk, so that that portion of the log can be 17 * reused. 18 */ 19 20 #include <linux/time.h> 21 #include <linux/fs.h> 22 #include <linux/jbd2.h> 23 #include <linux/errno.h> 24 #include <linux/slab.h> 25 26 /* 27 * Unlink a buffer from a transaction checkpoint list. 28 * 29 * Called with j_list_lock held. 30 */ 31 static inline void __buffer_unlink_first(struct journal_head *jh) 32 { 33 transaction_t *transaction = jh->b_cp_transaction; 34 35 jh->b_cpnext->b_cpprev = jh->b_cpprev; 36 jh->b_cpprev->b_cpnext = jh->b_cpnext; 37 if (transaction->t_checkpoint_list == jh) { 38 transaction->t_checkpoint_list = jh->b_cpnext; 39 if (transaction->t_checkpoint_list == jh) 40 transaction->t_checkpoint_list = NULL; 41 } 42 } 43 44 /* 45 * Unlink a buffer from a transaction checkpoint(io) list. 46 * 47 * Called with j_list_lock held. 48 */ 49 static inline void __buffer_unlink(struct journal_head *jh) 50 { 51 transaction_t *transaction = jh->b_cp_transaction; 52 53 __buffer_unlink_first(jh); 54 if (transaction->t_checkpoint_io_list == jh) { 55 transaction->t_checkpoint_io_list = jh->b_cpnext; 56 if (transaction->t_checkpoint_io_list == jh) 57 transaction->t_checkpoint_io_list = NULL; 58 } 59 } 60 61 /* 62 * Move a buffer from the checkpoint list to the checkpoint io list 63 * 64 * Called with j_list_lock held 65 */ 66 static inline void __buffer_relink_io(struct journal_head *jh) 67 { 68 transaction_t *transaction = jh->b_cp_transaction; 69 70 __buffer_unlink_first(jh); 71 72 if (!transaction->t_checkpoint_io_list) { 73 jh->b_cpnext = jh->b_cpprev = jh; 74 } else { 75 jh->b_cpnext = transaction->t_checkpoint_io_list; 76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 77 jh->b_cpprev->b_cpnext = jh; 78 jh->b_cpnext->b_cpprev = jh; 79 } 80 transaction->t_checkpoint_io_list = jh; 81 } 82 83 /* 84 * Try to release a checkpointed buffer from its transaction. 85 * Returns 1 if we released it and 2 if we also released the 86 * whole transaction. 87 * 88 * Requires j_list_lock 89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 90 */ 91 static int __try_to_free_cp_buf(struct journal_head *jh) 92 { 93 int ret = 0; 94 struct buffer_head *bh = jh2bh(jh); 95 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 99 jbd_unlock_bh_state(bh); 100 jbd2_journal_remove_journal_head(bh); 101 BUFFER_TRACE(bh, "release"); 102 __brelse(bh); 103 } else { 104 jbd_unlock_bh_state(bh); 105 } 106 return ret; 107 } 108 109 /* 110 * __jbd2_log_wait_for_space: wait until there is space in the journal. 111 * 112 * Called under j-state_lock *only*. It will be unlocked if we have to wait 113 * for a checkpoint to free up some space in the log. 114 */ 115 void __jbd2_log_wait_for_space(journal_t *journal) 116 { 117 int nblocks; 118 assert_spin_locked(&journal->j_state_lock); 119 120 nblocks = jbd_space_needed(journal); 121 while (__jbd2_log_space_left(journal) < nblocks) { 122 if (journal->j_flags & JBD2_ABORT) 123 return; 124 spin_unlock(&journal->j_state_lock); 125 mutex_lock(&journal->j_checkpoint_mutex); 126 127 /* 128 * Test again, another process may have checkpointed while we 129 * were waiting for the checkpoint lock 130 */ 131 spin_lock(&journal->j_state_lock); 132 nblocks = jbd_space_needed(journal); 133 if (__jbd2_log_space_left(journal) < nblocks) { 134 spin_unlock(&journal->j_state_lock); 135 jbd2_log_do_checkpoint(journal); 136 spin_lock(&journal->j_state_lock); 137 } 138 mutex_unlock(&journal->j_checkpoint_mutex); 139 } 140 } 141 142 /* 143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 144 * The caller must restart a list walk. Wait for someone else to run 145 * jbd_unlock_bh_state(). 146 */ 147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 148 __releases(journal->j_list_lock) 149 { 150 get_bh(bh); 151 spin_unlock(&journal->j_list_lock); 152 jbd_lock_bh_state(bh); 153 jbd_unlock_bh_state(bh); 154 put_bh(bh); 155 } 156 157 /* 158 * Clean up transaction's list of buffers submitted for io. 159 * We wait for any pending IO to complete and remove any clean 160 * buffers. Note that we take the buffers in the opposite ordering 161 * from the one in which they were submitted for IO. 162 * 163 * Called with j_list_lock held. 164 */ 165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 166 { 167 struct journal_head *jh; 168 struct buffer_head *bh; 169 tid_t this_tid; 170 int released = 0; 171 172 this_tid = transaction->t_tid; 173 restart: 174 /* Did somebody clean up the transaction in the meanwhile? */ 175 if (journal->j_checkpoint_transactions != transaction || 176 transaction->t_tid != this_tid) 177 return; 178 while (!released && transaction->t_checkpoint_io_list) { 179 jh = transaction->t_checkpoint_io_list; 180 bh = jh2bh(jh); 181 if (!jbd_trylock_bh_state(bh)) { 182 jbd_sync_bh(journal, bh); 183 spin_lock(&journal->j_list_lock); 184 goto restart; 185 } 186 if (buffer_locked(bh)) { 187 atomic_inc(&bh->b_count); 188 spin_unlock(&journal->j_list_lock); 189 jbd_unlock_bh_state(bh); 190 wait_on_buffer(bh); 191 /* the journal_head may have gone by now */ 192 BUFFER_TRACE(bh, "brelse"); 193 __brelse(bh); 194 spin_lock(&journal->j_list_lock); 195 goto restart; 196 } 197 /* 198 * Now in whatever state the buffer currently is, we know that 199 * it has been written out and so we can drop it from the list 200 */ 201 released = __jbd2_journal_remove_checkpoint(jh); 202 jbd_unlock_bh_state(bh); 203 jbd2_journal_remove_journal_head(bh); 204 __brelse(bh); 205 } 206 } 207 208 #define NR_BATCH 64 209 210 static void 211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 212 { 213 int i; 214 215 ll_rw_block(SWRITE, *batch_count, bhs); 216 for (i = 0; i < *batch_count; i++) { 217 struct buffer_head *bh = bhs[i]; 218 clear_buffer_jwrite(bh); 219 BUFFER_TRACE(bh, "brelse"); 220 __brelse(bh); 221 } 222 *batch_count = 0; 223 } 224 225 /* 226 * Try to flush one buffer from the checkpoint list to disk. 227 * 228 * Return 1 if something happened which requires us to abort the current 229 * scan of the checkpoint list. 230 * 231 * Called with j_list_lock held and drops it if 1 is returned 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 233 */ 234 static int __process_buffer(journal_t *journal, struct journal_head *jh, 235 struct buffer_head **bhs, int *batch_count, 236 transaction_t *transaction) 237 { 238 struct buffer_head *bh = jh2bh(jh); 239 int ret = 0; 240 241 if (buffer_locked(bh)) { 242 atomic_inc(&bh->b_count); 243 spin_unlock(&journal->j_list_lock); 244 jbd_unlock_bh_state(bh); 245 wait_on_buffer(bh); 246 /* the journal_head may have gone by now */ 247 BUFFER_TRACE(bh, "brelse"); 248 __brelse(bh); 249 ret = 1; 250 } else if (jh->b_transaction != NULL) { 251 transaction_t *t = jh->b_transaction; 252 tid_t tid = t->t_tid; 253 254 transaction->t_chp_stats.cs_forced_to_close++; 255 spin_unlock(&journal->j_list_lock); 256 jbd_unlock_bh_state(bh); 257 jbd2_log_start_commit(journal, tid); 258 jbd2_log_wait_commit(journal, tid); 259 ret = 1; 260 } else if (!buffer_dirty(bh)) { 261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 262 BUFFER_TRACE(bh, "remove from checkpoint"); 263 __jbd2_journal_remove_checkpoint(jh); 264 spin_unlock(&journal->j_list_lock); 265 jbd_unlock_bh_state(bh); 266 jbd2_journal_remove_journal_head(bh); 267 __brelse(bh); 268 ret = 1; 269 } else { 270 /* 271 * Important: we are about to write the buffer, and 272 * possibly block, while still holding the journal lock. 273 * We cannot afford to let the transaction logic start 274 * messing around with this buffer before we write it to 275 * disk, as that would break recoverability. 276 */ 277 BUFFER_TRACE(bh, "queue"); 278 get_bh(bh); 279 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 280 set_buffer_jwrite(bh); 281 bhs[*batch_count] = bh; 282 __buffer_relink_io(jh); 283 jbd_unlock_bh_state(bh); 284 transaction->t_chp_stats.cs_written++; 285 (*batch_count)++; 286 if (*batch_count == NR_BATCH) { 287 spin_unlock(&journal->j_list_lock); 288 __flush_batch(journal, bhs, batch_count); 289 ret = 1; 290 } 291 } 292 return ret; 293 } 294 295 /* 296 * Perform an actual checkpoint. We take the first transaction on the 297 * list of transactions to be checkpointed and send all its buffers 298 * to disk. We submit larger chunks of data at once. 299 * 300 * The journal should be locked before calling this function. 301 */ 302 int jbd2_log_do_checkpoint(journal_t *journal) 303 { 304 transaction_t *transaction; 305 tid_t this_tid; 306 int result; 307 308 jbd_debug(1, "Start checkpoint\n"); 309 310 /* 311 * First thing: if there are any transactions in the log which 312 * don't need checkpointing, just eliminate them from the 313 * journal straight away. 314 */ 315 result = jbd2_cleanup_journal_tail(journal); 316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 317 if (result <= 0) 318 return result; 319 320 /* 321 * OK, we need to start writing disk blocks. Take one transaction 322 * and write it. 323 */ 324 spin_lock(&journal->j_list_lock); 325 if (!journal->j_checkpoint_transactions) 326 goto out; 327 transaction = journal->j_checkpoint_transactions; 328 if (transaction->t_chp_stats.cs_chp_time == 0) 329 transaction->t_chp_stats.cs_chp_time = jiffies; 330 this_tid = transaction->t_tid; 331 restart: 332 /* 333 * If someone cleaned up this transaction while we slept, we're 334 * done (maybe it's a new transaction, but it fell at the same 335 * address). 336 */ 337 if (journal->j_checkpoint_transactions == transaction && 338 transaction->t_tid == this_tid) { 339 int batch_count = 0; 340 struct buffer_head *bhs[NR_BATCH]; 341 struct journal_head *jh; 342 int retry = 0; 343 344 while (!retry && transaction->t_checkpoint_list) { 345 struct buffer_head *bh; 346 347 jh = transaction->t_checkpoint_list; 348 bh = jh2bh(jh); 349 if (!jbd_trylock_bh_state(bh)) { 350 jbd_sync_bh(journal, bh); 351 retry = 1; 352 break; 353 } 354 retry = __process_buffer(journal, jh, bhs, &batch_count, 355 transaction); 356 if (!retry && (need_resched() || 357 spin_needbreak(&journal->j_list_lock))) { 358 spin_unlock(&journal->j_list_lock); 359 retry = 1; 360 break; 361 } 362 } 363 364 if (batch_count) { 365 if (!retry) { 366 spin_unlock(&journal->j_list_lock); 367 retry = 1; 368 } 369 __flush_batch(journal, bhs, &batch_count); 370 } 371 372 if (retry) { 373 spin_lock(&journal->j_list_lock); 374 goto restart; 375 } 376 /* 377 * Now we have cleaned up the first transaction's checkpoint 378 * list. Let's clean up the second one 379 */ 380 __wait_cp_io(journal, transaction); 381 } 382 out: 383 spin_unlock(&journal->j_list_lock); 384 result = jbd2_cleanup_journal_tail(journal); 385 if (result < 0) 386 return result; 387 return 0; 388 } 389 390 /* 391 * Check the list of checkpoint transactions for the journal to see if 392 * we have already got rid of any since the last update of the log tail 393 * in the journal superblock. If so, we can instantly roll the 394 * superblock forward to remove those transactions from the log. 395 * 396 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 397 * 398 * Called with the journal lock held. 399 * 400 * This is the only part of the journaling code which really needs to be 401 * aware of transaction aborts. Checkpointing involves writing to the 402 * main filesystem area rather than to the journal, so it can proceed 403 * even in abort state, but we must not update the journal superblock if 404 * we have an abort error outstanding. 405 */ 406 407 int jbd2_cleanup_journal_tail(journal_t *journal) 408 { 409 transaction_t * transaction; 410 tid_t first_tid; 411 unsigned long blocknr, freed; 412 413 /* OK, work out the oldest transaction remaining in the log, and 414 * the log block it starts at. 415 * 416 * If the log is now empty, we need to work out which is the 417 * next transaction ID we will write, and where it will 418 * start. */ 419 420 spin_lock(&journal->j_state_lock); 421 spin_lock(&journal->j_list_lock); 422 transaction = journal->j_checkpoint_transactions; 423 if (transaction) { 424 first_tid = transaction->t_tid; 425 blocknr = transaction->t_log_start; 426 } else if ((transaction = journal->j_committing_transaction) != NULL) { 427 first_tid = transaction->t_tid; 428 blocknr = transaction->t_log_start; 429 } else if ((transaction = journal->j_running_transaction) != NULL) { 430 first_tid = transaction->t_tid; 431 blocknr = journal->j_head; 432 } else { 433 first_tid = journal->j_transaction_sequence; 434 blocknr = journal->j_head; 435 } 436 spin_unlock(&journal->j_list_lock); 437 J_ASSERT(blocknr != 0); 438 439 /* If the oldest pinned transaction is at the tail of the log 440 already then there's not much we can do right now. */ 441 if (journal->j_tail_sequence == first_tid) { 442 spin_unlock(&journal->j_state_lock); 443 return 1; 444 } 445 446 /* OK, update the superblock to recover the freed space. 447 * Physical blocks come first: have we wrapped beyond the end of 448 * the log? */ 449 freed = blocknr - journal->j_tail; 450 if (blocknr < journal->j_tail) 451 freed = freed + journal->j_last - journal->j_first; 452 453 jbd_debug(1, 454 "Cleaning journal tail from %d to %d (offset %lu), " 455 "freeing %lu\n", 456 journal->j_tail_sequence, first_tid, blocknr, freed); 457 458 journal->j_free += freed; 459 journal->j_tail_sequence = first_tid; 460 journal->j_tail = blocknr; 461 spin_unlock(&journal->j_state_lock); 462 if (!(journal->j_flags & JBD2_ABORT)) 463 jbd2_journal_update_superblock(journal, 1); 464 return 0; 465 } 466 467 468 /* Checkpoint list management */ 469 470 /* 471 * journal_clean_one_cp_list 472 * 473 * Find all the written-back checkpoint buffers in the given list and release them. 474 * 475 * Called with the journal locked. 476 * Called with j_list_lock held. 477 * Returns number of bufers reaped (for debug) 478 */ 479 480 static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 481 { 482 struct journal_head *last_jh; 483 struct journal_head *next_jh = jh; 484 int ret, freed = 0; 485 486 *released = 0; 487 if (!jh) 488 return 0; 489 490 last_jh = jh->b_cpprev; 491 do { 492 jh = next_jh; 493 next_jh = jh->b_cpnext; 494 /* Use trylock because of the ranking */ 495 if (jbd_trylock_bh_state(jh2bh(jh))) { 496 ret = __try_to_free_cp_buf(jh); 497 if (ret) { 498 freed++; 499 if (ret == 2) { 500 *released = 1; 501 return freed; 502 } 503 } 504 } 505 /* 506 * This function only frees up some memory 507 * if possible so we dont have an obligation 508 * to finish processing. Bail out if preemption 509 * requested: 510 */ 511 if (need_resched()) 512 return freed; 513 } while (jh != last_jh); 514 515 return freed; 516 } 517 518 /* 519 * journal_clean_checkpoint_list 520 * 521 * Find all the written-back checkpoint buffers in the journal and release them. 522 * 523 * Called with the journal locked. 524 * Called with j_list_lock held. 525 * Returns number of buffers reaped (for debug) 526 */ 527 528 int __jbd2_journal_clean_checkpoint_list(journal_t *journal) 529 { 530 transaction_t *transaction, *last_transaction, *next_transaction; 531 int ret = 0; 532 int released; 533 534 transaction = journal->j_checkpoint_transactions; 535 if (!transaction) 536 goto out; 537 538 last_transaction = transaction->t_cpprev; 539 next_transaction = transaction; 540 do { 541 transaction = next_transaction; 542 next_transaction = transaction->t_cpnext; 543 ret += journal_clean_one_cp_list(transaction-> 544 t_checkpoint_list, &released); 545 /* 546 * This function only frees up some memory if possible so we 547 * dont have an obligation to finish processing. Bail out if 548 * preemption requested: 549 */ 550 if (need_resched()) 551 goto out; 552 if (released) 553 continue; 554 /* 555 * It is essential that we are as careful as in the case of 556 * t_checkpoint_list with removing the buffer from the list as 557 * we can possibly see not yet submitted buffers on io_list 558 */ 559 ret += journal_clean_one_cp_list(transaction-> 560 t_checkpoint_io_list, &released); 561 if (need_resched()) 562 goto out; 563 } while (transaction != last_transaction); 564 out: 565 return ret; 566 } 567 568 /* 569 * journal_remove_checkpoint: called after a buffer has been committed 570 * to disk (either by being write-back flushed to disk, or being 571 * committed to the log). 572 * 573 * We cannot safely clean a transaction out of the log until all of the 574 * buffer updates committed in that transaction have safely been stored 575 * elsewhere on disk. To achieve this, all of the buffers in a 576 * transaction need to be maintained on the transaction's checkpoint 577 * lists until they have been rewritten, at which point this function is 578 * called to remove the buffer from the existing transaction's 579 * checkpoint lists. 580 * 581 * The function returns 1 if it frees the transaction, 0 otherwise. 582 * 583 * This function is called with the journal locked. 584 * This function is called with j_list_lock held. 585 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 586 */ 587 588 int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 589 { 590 transaction_t *transaction; 591 journal_t *journal; 592 int ret = 0; 593 594 JBUFFER_TRACE(jh, "entry"); 595 596 if ((transaction = jh->b_cp_transaction) == NULL) { 597 JBUFFER_TRACE(jh, "not on transaction"); 598 goto out; 599 } 600 journal = transaction->t_journal; 601 602 __buffer_unlink(jh); 603 jh->b_cp_transaction = NULL; 604 605 if (transaction->t_checkpoint_list != NULL || 606 transaction->t_checkpoint_io_list != NULL) 607 goto out; 608 JBUFFER_TRACE(jh, "transaction has no more buffers"); 609 610 /* 611 * There is one special case to worry about: if we have just pulled the 612 * buffer off a running or committing transaction's checkpoing list, 613 * then even if the checkpoint list is empty, the transaction obviously 614 * cannot be dropped! 615 * 616 * The locking here around t_state is a bit sleazy. 617 * See the comment at the end of jbd2_journal_commit_transaction(). 618 */ 619 if (transaction->t_state != T_FINISHED) { 620 JBUFFER_TRACE(jh, "belongs to running/committing transaction"); 621 goto out; 622 } 623 624 /* OK, that was the last buffer for the transaction: we can now 625 safely remove this transaction from the log */ 626 627 __jbd2_journal_drop_transaction(journal, transaction); 628 629 /* Just in case anybody was waiting for more transactions to be 630 checkpointed... */ 631 wake_up(&journal->j_wait_logspace); 632 ret = 1; 633 out: 634 JBUFFER_TRACE(jh, "exit"); 635 return ret; 636 } 637 638 /* 639 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 640 * list so that we know when it is safe to clean the transaction out of 641 * the log. 642 * 643 * Called with the journal locked. 644 * Called with j_list_lock held. 645 */ 646 void __jbd2_journal_insert_checkpoint(struct journal_head *jh, 647 transaction_t *transaction) 648 { 649 JBUFFER_TRACE(jh, "entry"); 650 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 651 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 652 653 jh->b_cp_transaction = transaction; 654 655 if (!transaction->t_checkpoint_list) { 656 jh->b_cpnext = jh->b_cpprev = jh; 657 } else { 658 jh->b_cpnext = transaction->t_checkpoint_list; 659 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 660 jh->b_cpprev->b_cpnext = jh; 661 jh->b_cpnext->b_cpprev = jh; 662 } 663 transaction->t_checkpoint_list = jh; 664 } 665 666 /* 667 * We've finished with this transaction structure: adios... 668 * 669 * The transaction must have no links except for the checkpoint by this 670 * point. 671 * 672 * Called with the journal locked. 673 * Called with j_list_lock held. 674 */ 675 676 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 677 { 678 assert_spin_locked(&journal->j_list_lock); 679 if (transaction->t_cpnext) { 680 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 681 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 682 if (journal->j_checkpoint_transactions == transaction) 683 journal->j_checkpoint_transactions = 684 transaction->t_cpnext; 685 if (journal->j_checkpoint_transactions == transaction) 686 journal->j_checkpoint_transactions = NULL; 687 } 688 689 J_ASSERT(transaction->t_state == T_FINISHED); 690 J_ASSERT(transaction->t_buffers == NULL); 691 J_ASSERT(transaction->t_sync_datalist == NULL); 692 J_ASSERT(transaction->t_forget == NULL); 693 J_ASSERT(transaction->t_iobuf_list == NULL); 694 J_ASSERT(transaction->t_shadow_list == NULL); 695 J_ASSERT(transaction->t_log_list == NULL); 696 J_ASSERT(transaction->t_checkpoint_list == NULL); 697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 698 J_ASSERT(transaction->t_updates == 0); 699 J_ASSERT(journal->j_committing_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction); 701 702 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 703 kfree(transaction); 704 } 705