1 /* 2 * linux/fs/jbd2/checkpoint.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 5 * 6 * Copyright 1999 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Checkpoint routines for the generic filesystem journaling code. 13 * Part of the ext2fs journaling system. 14 * 15 * Checkpointing is the process of ensuring that a section of the log is 16 * committed fully to disk, so that that portion of the log can be 17 * reused. 18 */ 19 20 #include <linux/time.h> 21 #include <linux/fs.h> 22 #include <linux/jbd2.h> 23 #include <linux/errno.h> 24 #include <linux/slab.h> 25 26 /* 27 * Unlink a buffer from a transaction checkpoint list. 28 * 29 * Called with j_list_lock held. 30 */ 31 static inline void __buffer_unlink_first(struct journal_head *jh) 32 { 33 transaction_t *transaction = jh->b_cp_transaction; 34 35 jh->b_cpnext->b_cpprev = jh->b_cpprev; 36 jh->b_cpprev->b_cpnext = jh->b_cpnext; 37 if (transaction->t_checkpoint_list == jh) { 38 transaction->t_checkpoint_list = jh->b_cpnext; 39 if (transaction->t_checkpoint_list == jh) 40 transaction->t_checkpoint_list = NULL; 41 } 42 } 43 44 /* 45 * Unlink a buffer from a transaction checkpoint(io) list. 46 * 47 * Called with j_list_lock held. 48 */ 49 static inline void __buffer_unlink(struct journal_head *jh) 50 { 51 transaction_t *transaction = jh->b_cp_transaction; 52 53 __buffer_unlink_first(jh); 54 if (transaction->t_checkpoint_io_list == jh) { 55 transaction->t_checkpoint_io_list = jh->b_cpnext; 56 if (transaction->t_checkpoint_io_list == jh) 57 transaction->t_checkpoint_io_list = NULL; 58 } 59 } 60 61 /* 62 * Move a buffer from the checkpoint list to the checkpoint io list 63 * 64 * Called with j_list_lock held 65 */ 66 static inline void __buffer_relink_io(struct journal_head *jh) 67 { 68 transaction_t *transaction = jh->b_cp_transaction; 69 70 __buffer_unlink_first(jh); 71 72 if (!transaction->t_checkpoint_io_list) { 73 jh->b_cpnext = jh->b_cpprev = jh; 74 } else { 75 jh->b_cpnext = transaction->t_checkpoint_io_list; 76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 77 jh->b_cpprev->b_cpnext = jh; 78 jh->b_cpnext->b_cpprev = jh; 79 } 80 transaction->t_checkpoint_io_list = jh; 81 } 82 83 /* 84 * Try to release a checkpointed buffer from its transaction. 85 * Returns 1 if we released it and 2 if we also released the 86 * whole transaction. 87 * 88 * Requires j_list_lock 89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 90 */ 91 static int __try_to_free_cp_buf(struct journal_head *jh) 92 { 93 int ret = 0; 94 struct buffer_head *bh = jh2bh(jh); 95 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 99 jbd_unlock_bh_state(bh); 100 jbd2_journal_remove_journal_head(bh); 101 BUFFER_TRACE(bh, "release"); 102 __brelse(bh); 103 } else { 104 jbd_unlock_bh_state(bh); 105 } 106 return ret; 107 } 108 109 /* 110 * __jbd2_log_wait_for_space: wait until there is space in the journal. 111 * 112 * Called under j-state_lock *only*. It will be unlocked if we have to wait 113 * for a checkpoint to free up some space in the log. 114 */ 115 void __jbd2_log_wait_for_space(journal_t *journal) 116 { 117 int nblocks; 118 assert_spin_locked(&journal->j_state_lock); 119 120 nblocks = jbd_space_needed(journal); 121 while (__jbd2_log_space_left(journal) < nblocks) { 122 if (journal->j_flags & JBD2_ABORT) 123 return; 124 spin_unlock(&journal->j_state_lock); 125 mutex_lock(&journal->j_checkpoint_mutex); 126 127 /* 128 * Test again, another process may have checkpointed while we 129 * were waiting for the checkpoint lock 130 */ 131 spin_lock(&journal->j_state_lock); 132 nblocks = jbd_space_needed(journal); 133 if (__jbd2_log_space_left(journal) < nblocks) { 134 spin_unlock(&journal->j_state_lock); 135 jbd2_log_do_checkpoint(journal); 136 spin_lock(&journal->j_state_lock); 137 } 138 mutex_unlock(&journal->j_checkpoint_mutex); 139 } 140 } 141 142 /* 143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 144 * The caller must restart a list walk. Wait for someone else to run 145 * jbd_unlock_bh_state(). 146 */ 147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 148 __releases(journal->j_list_lock) 149 { 150 get_bh(bh); 151 spin_unlock(&journal->j_list_lock); 152 jbd_lock_bh_state(bh); 153 jbd_unlock_bh_state(bh); 154 put_bh(bh); 155 } 156 157 /* 158 * Clean up transaction's list of buffers submitted for io. 159 * We wait for any pending IO to complete and remove any clean 160 * buffers. Note that we take the buffers in the opposite ordering 161 * from the one in which they were submitted for IO. 162 * 163 * Called with j_list_lock held. 164 */ 165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 166 { 167 struct journal_head *jh; 168 struct buffer_head *bh; 169 tid_t this_tid; 170 int released = 0; 171 172 this_tid = transaction->t_tid; 173 restart: 174 /* Did somebody clean up the transaction in the meanwhile? */ 175 if (journal->j_checkpoint_transactions != transaction || 176 transaction->t_tid != this_tid) 177 return; 178 while (!released && transaction->t_checkpoint_io_list) { 179 jh = transaction->t_checkpoint_io_list; 180 bh = jh2bh(jh); 181 if (!jbd_trylock_bh_state(bh)) { 182 jbd_sync_bh(journal, bh); 183 spin_lock(&journal->j_list_lock); 184 goto restart; 185 } 186 if (buffer_locked(bh)) { 187 atomic_inc(&bh->b_count); 188 spin_unlock(&journal->j_list_lock); 189 jbd_unlock_bh_state(bh); 190 wait_on_buffer(bh); 191 /* the journal_head may have gone by now */ 192 BUFFER_TRACE(bh, "brelse"); 193 __brelse(bh); 194 spin_lock(&journal->j_list_lock); 195 goto restart; 196 } 197 /* 198 * Now in whatever state the buffer currently is, we know that 199 * it has been written out and so we can drop it from the list 200 */ 201 released = __jbd2_journal_remove_checkpoint(jh); 202 jbd_unlock_bh_state(bh); 203 jbd2_journal_remove_journal_head(bh); 204 __brelse(bh); 205 } 206 } 207 208 #define NR_BATCH 64 209 210 static void 211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 212 { 213 int i; 214 215 ll_rw_block(SWRITE, *batch_count, bhs); 216 for (i = 0; i < *batch_count; i++) { 217 struct buffer_head *bh = bhs[i]; 218 clear_buffer_jwrite(bh); 219 BUFFER_TRACE(bh, "brelse"); 220 __brelse(bh); 221 } 222 *batch_count = 0; 223 } 224 225 /* 226 * Try to flush one buffer from the checkpoint list to disk. 227 * 228 * Return 1 if something happened which requires us to abort the current 229 * scan of the checkpoint list. 230 * 231 * Called with j_list_lock held and drops it if 1 is returned 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 233 */ 234 static int __process_buffer(journal_t *journal, struct journal_head *jh, 235 struct buffer_head **bhs, int *batch_count) 236 { 237 struct buffer_head *bh = jh2bh(jh); 238 int ret = 0; 239 240 if (buffer_locked(bh)) { 241 atomic_inc(&bh->b_count); 242 spin_unlock(&journal->j_list_lock); 243 jbd_unlock_bh_state(bh); 244 wait_on_buffer(bh); 245 /* the journal_head may have gone by now */ 246 BUFFER_TRACE(bh, "brelse"); 247 __brelse(bh); 248 ret = 1; 249 } else if (jh->b_transaction != NULL) { 250 transaction_t *t = jh->b_transaction; 251 tid_t tid = t->t_tid; 252 253 spin_unlock(&journal->j_list_lock); 254 jbd_unlock_bh_state(bh); 255 jbd2_log_start_commit(journal, tid); 256 jbd2_log_wait_commit(journal, tid); 257 ret = 1; 258 } else if (!buffer_dirty(bh)) { 259 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 260 BUFFER_TRACE(bh, "remove from checkpoint"); 261 __jbd2_journal_remove_checkpoint(jh); 262 spin_unlock(&journal->j_list_lock); 263 jbd_unlock_bh_state(bh); 264 jbd2_journal_remove_journal_head(bh); 265 __brelse(bh); 266 ret = 1; 267 } else { 268 /* 269 * Important: we are about to write the buffer, and 270 * possibly block, while still holding the journal lock. 271 * We cannot afford to let the transaction logic start 272 * messing around with this buffer before we write it to 273 * disk, as that would break recoverability. 274 */ 275 BUFFER_TRACE(bh, "queue"); 276 get_bh(bh); 277 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 278 set_buffer_jwrite(bh); 279 bhs[*batch_count] = bh; 280 __buffer_relink_io(jh); 281 jbd_unlock_bh_state(bh); 282 (*batch_count)++; 283 if (*batch_count == NR_BATCH) { 284 spin_unlock(&journal->j_list_lock); 285 __flush_batch(journal, bhs, batch_count); 286 ret = 1; 287 } 288 } 289 return ret; 290 } 291 292 /* 293 * Perform an actual checkpoint. We take the first transaction on the 294 * list of transactions to be checkpointed and send all its buffers 295 * to disk. We submit larger chunks of data at once. 296 * 297 * The journal should be locked before calling this function. 298 */ 299 int jbd2_log_do_checkpoint(journal_t *journal) 300 { 301 transaction_t *transaction; 302 tid_t this_tid; 303 int result; 304 305 jbd_debug(1, "Start checkpoint\n"); 306 307 /* 308 * First thing: if there are any transactions in the log which 309 * don't need checkpointing, just eliminate them from the 310 * journal straight away. 311 */ 312 result = jbd2_cleanup_journal_tail(journal); 313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 314 if (result <= 0) 315 return result; 316 317 /* 318 * OK, we need to start writing disk blocks. Take one transaction 319 * and write it. 320 */ 321 spin_lock(&journal->j_list_lock); 322 if (!journal->j_checkpoint_transactions) 323 goto out; 324 transaction = journal->j_checkpoint_transactions; 325 this_tid = transaction->t_tid; 326 restart: 327 /* 328 * If someone cleaned up this transaction while we slept, we're 329 * done (maybe it's a new transaction, but it fell at the same 330 * address). 331 */ 332 if (journal->j_checkpoint_transactions == transaction && 333 transaction->t_tid == this_tid) { 334 int batch_count = 0; 335 struct buffer_head *bhs[NR_BATCH]; 336 struct journal_head *jh; 337 int retry = 0; 338 339 while (!retry && transaction->t_checkpoint_list) { 340 struct buffer_head *bh; 341 342 jh = transaction->t_checkpoint_list; 343 bh = jh2bh(jh); 344 if (!jbd_trylock_bh_state(bh)) { 345 jbd_sync_bh(journal, bh); 346 retry = 1; 347 break; 348 } 349 retry = __process_buffer(journal, jh, bhs,&batch_count); 350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 351 spin_unlock(&journal->j_list_lock); 352 retry = 1; 353 break; 354 } 355 } 356 357 if (batch_count) { 358 if (!retry) { 359 spin_unlock(&journal->j_list_lock); 360 retry = 1; 361 } 362 __flush_batch(journal, bhs, &batch_count); 363 } 364 365 if (retry) { 366 spin_lock(&journal->j_list_lock); 367 goto restart; 368 } 369 /* 370 * Now we have cleaned up the first transaction's checkpoint 371 * list. Let's clean up the second one 372 */ 373 __wait_cp_io(journal, transaction); 374 } 375 out: 376 spin_unlock(&journal->j_list_lock); 377 result = jbd2_cleanup_journal_tail(journal); 378 if (result < 0) 379 return result; 380 return 0; 381 } 382 383 /* 384 * Check the list of checkpoint transactions for the journal to see if 385 * we have already got rid of any since the last update of the log tail 386 * in the journal superblock. If so, we can instantly roll the 387 * superblock forward to remove those transactions from the log. 388 * 389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 390 * 391 * Called with the journal lock held. 392 * 393 * This is the only part of the journaling code which really needs to be 394 * aware of transaction aborts. Checkpointing involves writing to the 395 * main filesystem area rather than to the journal, so it can proceed 396 * even in abort state, but we must not update the journal superblock if 397 * we have an abort error outstanding. 398 */ 399 400 int jbd2_cleanup_journal_tail(journal_t *journal) 401 { 402 transaction_t * transaction; 403 tid_t first_tid; 404 unsigned long blocknr, freed; 405 406 /* OK, work out the oldest transaction remaining in the log, and 407 * the log block it starts at. 408 * 409 * If the log is now empty, we need to work out which is the 410 * next transaction ID we will write, and where it will 411 * start. */ 412 413 spin_lock(&journal->j_state_lock); 414 spin_lock(&journal->j_list_lock); 415 transaction = journal->j_checkpoint_transactions; 416 if (transaction) { 417 first_tid = transaction->t_tid; 418 blocknr = transaction->t_log_start; 419 } else if ((transaction = journal->j_committing_transaction) != NULL) { 420 first_tid = transaction->t_tid; 421 blocknr = transaction->t_log_start; 422 } else if ((transaction = journal->j_running_transaction) != NULL) { 423 first_tid = transaction->t_tid; 424 blocknr = journal->j_head; 425 } else { 426 first_tid = journal->j_transaction_sequence; 427 blocknr = journal->j_head; 428 } 429 spin_unlock(&journal->j_list_lock); 430 J_ASSERT(blocknr != 0); 431 432 /* If the oldest pinned transaction is at the tail of the log 433 already then there's not much we can do right now. */ 434 if (journal->j_tail_sequence == first_tid) { 435 spin_unlock(&journal->j_state_lock); 436 return 1; 437 } 438 439 /* OK, update the superblock to recover the freed space. 440 * Physical blocks come first: have we wrapped beyond the end of 441 * the log? */ 442 freed = blocknr - journal->j_tail; 443 if (blocknr < journal->j_tail) 444 freed = freed + journal->j_last - journal->j_first; 445 446 jbd_debug(1, 447 "Cleaning journal tail from %d to %d (offset %lu), " 448 "freeing %lu\n", 449 journal->j_tail_sequence, first_tid, blocknr, freed); 450 451 journal->j_free += freed; 452 journal->j_tail_sequence = first_tid; 453 journal->j_tail = blocknr; 454 spin_unlock(&journal->j_state_lock); 455 if (!(journal->j_flags & JBD2_ABORT)) 456 jbd2_journal_update_superblock(journal, 1); 457 return 0; 458 } 459 460 461 /* Checkpoint list management */ 462 463 /* 464 * journal_clean_one_cp_list 465 * 466 * Find all the written-back checkpoint buffers in the given list and release them. 467 * 468 * Called with the journal locked. 469 * Called with j_list_lock held. 470 * Returns number of bufers reaped (for debug) 471 */ 472 473 static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 474 { 475 struct journal_head *last_jh; 476 struct journal_head *next_jh = jh; 477 int ret, freed = 0; 478 479 *released = 0; 480 if (!jh) 481 return 0; 482 483 last_jh = jh->b_cpprev; 484 do { 485 jh = next_jh; 486 next_jh = jh->b_cpnext; 487 /* Use trylock because of the ranking */ 488 if (jbd_trylock_bh_state(jh2bh(jh))) { 489 ret = __try_to_free_cp_buf(jh); 490 if (ret) { 491 freed++; 492 if (ret == 2) { 493 *released = 1; 494 return freed; 495 } 496 } 497 } 498 /* 499 * This function only frees up some memory 500 * if possible so we dont have an obligation 501 * to finish processing. Bail out if preemption 502 * requested: 503 */ 504 if (need_resched()) 505 return freed; 506 } while (jh != last_jh); 507 508 return freed; 509 } 510 511 /* 512 * journal_clean_checkpoint_list 513 * 514 * Find all the written-back checkpoint buffers in the journal and release them. 515 * 516 * Called with the journal locked. 517 * Called with j_list_lock held. 518 * Returns number of buffers reaped (for debug) 519 */ 520 521 int __jbd2_journal_clean_checkpoint_list(journal_t *journal) 522 { 523 transaction_t *transaction, *last_transaction, *next_transaction; 524 int ret = 0; 525 int released; 526 527 transaction = journal->j_checkpoint_transactions; 528 if (!transaction) 529 goto out; 530 531 last_transaction = transaction->t_cpprev; 532 next_transaction = transaction; 533 do { 534 transaction = next_transaction; 535 next_transaction = transaction->t_cpnext; 536 ret += journal_clean_one_cp_list(transaction-> 537 t_checkpoint_list, &released); 538 /* 539 * This function only frees up some memory if possible so we 540 * dont have an obligation to finish processing. Bail out if 541 * preemption requested: 542 */ 543 if (need_resched()) 544 goto out; 545 if (released) 546 continue; 547 /* 548 * It is essential that we are as careful as in the case of 549 * t_checkpoint_list with removing the buffer from the list as 550 * we can possibly see not yet submitted buffers on io_list 551 */ 552 ret += journal_clean_one_cp_list(transaction-> 553 t_checkpoint_io_list, &released); 554 if (need_resched()) 555 goto out; 556 } while (transaction != last_transaction); 557 out: 558 return ret; 559 } 560 561 /* 562 * journal_remove_checkpoint: called after a buffer has been committed 563 * to disk (either by being write-back flushed to disk, or being 564 * committed to the log). 565 * 566 * We cannot safely clean a transaction out of the log until all of the 567 * buffer updates committed in that transaction have safely been stored 568 * elsewhere on disk. To achieve this, all of the buffers in a 569 * transaction need to be maintained on the transaction's checkpoint 570 * lists until they have been rewritten, at which point this function is 571 * called to remove the buffer from the existing transaction's 572 * checkpoint lists. 573 * 574 * The function returns 1 if it frees the transaction, 0 otherwise. 575 * 576 * This function is called with the journal locked. 577 * This function is called with j_list_lock held. 578 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 579 */ 580 581 int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 582 { 583 transaction_t *transaction; 584 journal_t *journal; 585 int ret = 0; 586 587 JBUFFER_TRACE(jh, "entry"); 588 589 if ((transaction = jh->b_cp_transaction) == NULL) { 590 JBUFFER_TRACE(jh, "not on transaction"); 591 goto out; 592 } 593 journal = transaction->t_journal; 594 595 __buffer_unlink(jh); 596 jh->b_cp_transaction = NULL; 597 598 if (transaction->t_checkpoint_list != NULL || 599 transaction->t_checkpoint_io_list != NULL) 600 goto out; 601 JBUFFER_TRACE(jh, "transaction has no more buffers"); 602 603 /* 604 * There is one special case to worry about: if we have just pulled the 605 * buffer off a committing transaction's forget list, then even if the 606 * checkpoint list is empty, the transaction obviously cannot be 607 * dropped! 608 * 609 * The locking here around j_committing_transaction is a bit sleazy. 610 * See the comment at the end of jbd2_journal_commit_transaction(). 611 */ 612 if (transaction == journal->j_committing_transaction) { 613 JBUFFER_TRACE(jh, "belongs to committing transaction"); 614 goto out; 615 } 616 617 /* OK, that was the last buffer for the transaction: we can now 618 safely remove this transaction from the log */ 619 620 __jbd2_journal_drop_transaction(journal, transaction); 621 622 /* Just in case anybody was waiting for more transactions to be 623 checkpointed... */ 624 wake_up(&journal->j_wait_logspace); 625 ret = 1; 626 out: 627 JBUFFER_TRACE(jh, "exit"); 628 return ret; 629 } 630 631 /* 632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 633 * list so that we know when it is safe to clean the transaction out of 634 * the log. 635 * 636 * Called with the journal locked. 637 * Called with j_list_lock held. 638 */ 639 void __jbd2_journal_insert_checkpoint(struct journal_head *jh, 640 transaction_t *transaction) 641 { 642 JBUFFER_TRACE(jh, "entry"); 643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 645 646 jh->b_cp_transaction = transaction; 647 648 if (!transaction->t_checkpoint_list) { 649 jh->b_cpnext = jh->b_cpprev = jh; 650 } else { 651 jh->b_cpnext = transaction->t_checkpoint_list; 652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 653 jh->b_cpprev->b_cpnext = jh; 654 jh->b_cpnext->b_cpprev = jh; 655 } 656 transaction->t_checkpoint_list = jh; 657 } 658 659 /* 660 * We've finished with this transaction structure: adios... 661 * 662 * The transaction must have no links except for the checkpoint by this 663 * point. 664 * 665 * Called with the journal locked. 666 * Called with j_list_lock held. 667 */ 668 669 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 670 { 671 assert_spin_locked(&journal->j_list_lock); 672 if (transaction->t_cpnext) { 673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 675 if (journal->j_checkpoint_transactions == transaction) 676 journal->j_checkpoint_transactions = 677 transaction->t_cpnext; 678 if (journal->j_checkpoint_transactions == transaction) 679 journal->j_checkpoint_transactions = NULL; 680 } 681 682 J_ASSERT(transaction->t_state == T_FINISHED); 683 J_ASSERT(transaction->t_buffers == NULL); 684 J_ASSERT(transaction->t_sync_datalist == NULL); 685 J_ASSERT(transaction->t_forget == NULL); 686 J_ASSERT(transaction->t_iobuf_list == NULL); 687 J_ASSERT(transaction->t_shadow_list == NULL); 688 J_ASSERT(transaction->t_log_list == NULL); 689 J_ASSERT(transaction->t_checkpoint_list == NULL); 690 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 691 J_ASSERT(transaction->t_updates == 0); 692 J_ASSERT(journal->j_committing_transaction != transaction); 693 J_ASSERT(journal->j_running_transaction != transaction); 694 695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 696 kfree(transaction); 697 } 698