1 /* 2 * linux/fs/jbd2/journal.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 5 * 6 * Copyright 1998 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Generic filesystem journal-writing code; part of the ext2fs 13 * journaling system. 14 * 15 * This file manages journals: areas of disk reserved for logging 16 * transactional updates. This includes the kernel journaling thread 17 * which is responsible for scheduling updates to the log. 18 * 19 * We do not actually manage the physical storage of the journal in this 20 * file: that is left to a per-journal policy function, which allows us 21 * to store the journal within a filesystem-specified area for ext2 22 * journaling (ext2 can use a reserved inode for storing the log). 23 */ 24 25 #include <linux/module.h> 26 #include <linux/time.h> 27 #include <linux/fs.h> 28 #include <linux/jbd2.h> 29 #include <linux/errno.h> 30 #include <linux/slab.h> 31 #include <linux/init.h> 32 #include <linux/mm.h> 33 #include <linux/freezer.h> 34 #include <linux/pagemap.h> 35 #include <linux/kthread.h> 36 #include <linux/poison.h> 37 #include <linux/proc_fs.h> 38 #include <linux/debugfs.h> 39 #include <linux/seq_file.h> 40 #include <linux/math64.h> 41 42 #include <asm/uaccess.h> 43 #include <asm/page.h> 44 45 EXPORT_SYMBOL(jbd2_journal_start); 46 EXPORT_SYMBOL(jbd2_journal_restart); 47 EXPORT_SYMBOL(jbd2_journal_extend); 48 EXPORT_SYMBOL(jbd2_journal_stop); 49 EXPORT_SYMBOL(jbd2_journal_lock_updates); 50 EXPORT_SYMBOL(jbd2_journal_unlock_updates); 51 EXPORT_SYMBOL(jbd2_journal_get_write_access); 52 EXPORT_SYMBOL(jbd2_journal_get_create_access); 53 EXPORT_SYMBOL(jbd2_journal_get_undo_access); 54 EXPORT_SYMBOL(jbd2_journal_set_triggers); 55 EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 56 
EXPORT_SYMBOL(jbd2_journal_release_buffer); 57 EXPORT_SYMBOL(jbd2_journal_forget); 58 #if 0 59 EXPORT_SYMBOL(journal_sync_buffer); 60 #endif 61 EXPORT_SYMBOL(jbd2_journal_flush); 62 EXPORT_SYMBOL(jbd2_journal_revoke); 63 64 EXPORT_SYMBOL(jbd2_journal_init_dev); 65 EXPORT_SYMBOL(jbd2_journal_init_inode); 66 EXPORT_SYMBOL(jbd2_journal_update_format); 67 EXPORT_SYMBOL(jbd2_journal_check_used_features); 68 EXPORT_SYMBOL(jbd2_journal_check_available_features); 69 EXPORT_SYMBOL(jbd2_journal_set_features); 70 EXPORT_SYMBOL(jbd2_journal_load); 71 EXPORT_SYMBOL(jbd2_journal_destroy); 72 EXPORT_SYMBOL(jbd2_journal_abort); 73 EXPORT_SYMBOL(jbd2_journal_errno); 74 EXPORT_SYMBOL(jbd2_journal_ack_err); 75 EXPORT_SYMBOL(jbd2_journal_clear_err); 76 EXPORT_SYMBOL(jbd2_log_wait_commit); 77 EXPORT_SYMBOL(jbd2_journal_start_commit); 78 EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 79 EXPORT_SYMBOL(jbd2_journal_wipe); 80 EXPORT_SYMBOL(jbd2_journal_blocks_per_page); 81 EXPORT_SYMBOL(jbd2_journal_invalidatepage); 82 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 83 EXPORT_SYMBOL(jbd2_journal_force_commit); 84 EXPORT_SYMBOL(jbd2_journal_file_inode); 85 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 86 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 87 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 88 89 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 90 static void __journal_abort_soft (journal_t *journal, int errno); 91 92 /* 93 * Helper function used to manage commit timeouts 94 */ 95 96 static void commit_timeout(unsigned long __data) 97 { 98 struct task_struct * p = (struct task_struct *) __data; 99 100 wake_up_process(p); 101 } 102 103 /* 104 * kjournald2: The main thread function used to manage a logging device 105 * journal. 106 * 107 * This kernel thread is responsible for two things: 108 * 109 * 1) COMMIT: Every so often we need to commit the current state of the 110 * filesystem to disk. 
The journal thread is responsible for writing 111 * all of the metadata buffers to disk. 112 * 113 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all 114 * of the data in that part of the log has been rewritten elsewhere on 115 * the disk. Flushing these old buffers to reclaim space in the log is 116 * known as checkpointing, and this thread is responsible for that job. 117 */ 118 119 static int kjournald2(void *arg) 120 { 121 journal_t *journal = arg; 122 transaction_t *transaction; 123 124 /* 125 * Set up an interval timer which can be used to trigger a commit wakeup 126 * after the commit interval expires 127 */ 128 setup_timer(&journal->j_commit_timer, commit_timeout, 129 (unsigned long)current); 130 131 /* Record that the journal thread is running */ 132 journal->j_task = current; 133 wake_up(&journal->j_wait_done_commit); 134 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " 136 "commit interval %ld seconds\n", current->pid, 137 journal->j_devname, journal->j_commit_interval / HZ); 138 139 /* 140 * And now, wait forever for commit wakeup events. 141 */ 142 spin_lock(&journal->j_state_lock); 143 144 loop: 145 if (journal->j_flags & JBD2_UNMOUNT) 146 goto end_loop; 147 148 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", 149 journal->j_commit_sequence, journal->j_commit_request); 150 151 if (journal->j_commit_sequence != journal->j_commit_request) { 152 jbd_debug(1, "OK, requests differ\n"); 153 spin_unlock(&journal->j_state_lock); 154 del_timer_sync(&journal->j_commit_timer); 155 jbd2_journal_commit_transaction(journal); 156 spin_lock(&journal->j_state_lock); 157 goto loop; 158 } 159 160 wake_up(&journal->j_wait_done_commit); 161 if (freezing(current)) { 162 /* 163 * The simpler the better. Flushing journal isn't a 164 * good idea, because that depends on threads that may 165 * be already stopped. 
166 */ 167 jbd_debug(1, "Now suspending kjournald2\n"); 168 spin_unlock(&journal->j_state_lock); 169 refrigerator(); 170 spin_lock(&journal->j_state_lock); 171 } else { 172 /* 173 * We assume on resume that commits are already there, 174 * so we don't sleep 175 */ 176 DEFINE_WAIT(wait); 177 int should_sleep = 1; 178 179 prepare_to_wait(&journal->j_wait_commit, &wait, 180 TASK_INTERRUPTIBLE); 181 if (journal->j_commit_sequence != journal->j_commit_request) 182 should_sleep = 0; 183 transaction = journal->j_running_transaction; 184 if (transaction && time_after_eq(jiffies, 185 transaction->t_expires)) 186 should_sleep = 0; 187 if (journal->j_flags & JBD2_UNMOUNT) 188 should_sleep = 0; 189 if (should_sleep) { 190 spin_unlock(&journal->j_state_lock); 191 schedule(); 192 spin_lock(&journal->j_state_lock); 193 } 194 finish_wait(&journal->j_wait_commit, &wait); 195 } 196 197 jbd_debug(1, "kjournald2 wakes\n"); 198 199 /* 200 * Were we woken up by a commit wakeup event? 201 */ 202 transaction = journal->j_running_transaction; 203 if (transaction && time_after_eq(jiffies, transaction->t_expires)) { 204 journal->j_commit_request = transaction->t_tid; 205 jbd_debug(1, "woke because of timeout\n"); 206 } 207 goto loop; 208 209 end_loop: 210 spin_unlock(&journal->j_state_lock); 211 del_timer_sync(&journal->j_commit_timer); 212 journal->j_task = NULL; 213 wake_up(&journal->j_wait_done_commit); 214 jbd_debug(1, "Journal thread exiting.\n"); 215 return 0; 216 } 217 218 static int jbd2_journal_start_thread(journal_t *journal) 219 { 220 struct task_struct *t; 221 222 t = kthread_run(kjournald2, journal, "kjournald2"); 223 if (IS_ERR(t)) 224 return PTR_ERR(t); 225 226 wait_event(journal->j_wait_done_commit, journal->j_task != NULL); 227 return 0; 228 } 229 230 static void journal_kill_thread(journal_t *journal) 231 { 232 spin_lock(&journal->j_state_lock); 233 journal->j_flags |= JBD2_UNMOUNT; 234 235 while (journal->j_task) { 236 wake_up(&journal->j_wait_commit); 237 
spin_unlock(&journal->j_state_lock); 238 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 239 spin_lock(&journal->j_state_lock); 240 } 241 spin_unlock(&journal->j_state_lock); 242 } 243 244 /* 245 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. 246 * 247 * Writes a metadata buffer to a given disk block. The actual IO is not 248 * performed but a new buffer_head is constructed which labels the data 249 * to be written with the correct destination disk block. 250 * 251 * Any magic-number escaping which needs to be done will cause a 252 * copy-out here. If the buffer happens to start with the 253 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the 254 * magic number is only written to the log for descripter blocks. In 255 * this case, we copy the data and replace the first word with 0, and we 256 * return a result code which indicates that this buffer needs to be 257 * marked as an escaped buffer in the corresponding log descriptor 258 * block. The missing word can then be restored when the block is read 259 * during recovery. 260 * 261 * If the source buffer has already been modified by a new transaction 262 * since we took the last commit snapshot, we use the frozen copy of 263 * that data for IO. If we end up using the existing buffer_head's data 264 * for the write, then we *have* to lock the buffer to prevent anyone 265 * else from using and possibly modifying it while the IO is in 266 * progress. 267 * 268 * The function returns a pointer to the buffer_heads to be used for IO. 269 * 270 * We assume that the journal has already been locked in this function. 
271 * 272 * Return value: 273 * <0: Error 274 * >=0: Finished OK 275 * 276 * On success: 277 * Bit 0 set == escape performed on the data 278 * Bit 1 set == buffer copy-out performed (kfree the data after IO) 279 */ 280 281 int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 282 struct journal_head *jh_in, 283 struct journal_head **jh_out, 284 unsigned long long blocknr) 285 { 286 int need_copy_out = 0; 287 int done_copy_out = 0; 288 int do_escape = 0; 289 char *mapped_data; 290 struct buffer_head *new_bh; 291 struct journal_head *new_jh; 292 struct page *new_page; 293 unsigned int new_offset; 294 struct buffer_head *bh_in = jh2bh(jh_in); 295 struct jbd2_buffer_trigger_type *triggers; 296 297 /* 298 * The buffer really shouldn't be locked: only the current committing 299 * transaction is allowed to write it, so nobody else is allowed 300 * to do any IO. 301 * 302 * akpm: except if we're journalling data, and write() output is 303 * also part of a shared mapping, and another thread has 304 * decided to launch a writepage() against this buffer. 305 */ 306 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 307 308 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 309 310 /* 311 * If a new transaction has already done a buffer copy-out, then 312 * we use that version of the data for the commit. 313 */ 314 jbd_lock_bh_state(bh_in); 315 repeat: 316 if (jh_in->b_frozen_data) { 317 done_copy_out = 1; 318 new_page = virt_to_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data); 320 triggers = jh_in->b_frozen_triggers; 321 } else { 322 new_page = jh2bh(jh_in)->b_page; 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 324 triggers = jh_in->b_triggers; 325 } 326 327 mapped_data = kmap_atomic(new_page, KM_USER0); 328 /* 329 * Fire any commit trigger. Do this before checking for escaping, 330 * as the trigger may modify the magic offset. If a copy-out 331 * happens afterwards, it will have the correct data in the buffer. 
332 */ 333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, 334 triggers); 335 336 /* 337 * Check for escaping 338 */ 339 if (*((__be32 *)(mapped_data + new_offset)) == 340 cpu_to_be32(JBD2_MAGIC_NUMBER)) { 341 need_copy_out = 1; 342 do_escape = 1; 343 } 344 kunmap_atomic(mapped_data, KM_USER0); 345 346 /* 347 * Do we need to do a data copy? 348 */ 349 if (need_copy_out && !done_copy_out) { 350 char *tmp; 351 352 jbd_unlock_bh_state(bh_in); 353 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 354 jbd_lock_bh_state(bh_in); 355 if (jh_in->b_frozen_data) { 356 jbd2_free(tmp, bh_in->b_size); 357 goto repeat; 358 } 359 360 jh_in->b_frozen_data = tmp; 361 mapped_data = kmap_atomic(new_page, KM_USER0); 362 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 363 kunmap_atomic(mapped_data, KM_USER0); 364 365 new_page = virt_to_page(tmp); 366 new_offset = offset_in_page(tmp); 367 done_copy_out = 1; 368 369 /* 370 * This isn't strictly necessary, as we're using frozen 371 * data for the escaping, but it keeps consistency with 372 * b_frozen_data usage. 373 */ 374 jh_in->b_frozen_triggers = jh_in->b_triggers; 375 } 376 377 /* 378 * Did we need to do an escaping? Now we've done all the 379 * copying, we can finally do so. 
380 */ 381 if (do_escape) { 382 mapped_data = kmap_atomic(new_page, KM_USER0); 383 *((unsigned int *)(mapped_data + new_offset)) = 0; 384 kunmap_atomic(mapped_data, KM_USER0); 385 } 386 387 /* keep subsequent assertions sane */ 388 new_bh->b_state = 0; 389 init_buffer(new_bh, NULL, NULL); 390 atomic_set(&new_bh->b_count, 1); 391 jbd_unlock_bh_state(bh_in); 392 393 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ 394 395 set_bh_page(new_bh, new_page, new_offset); 396 new_jh->b_transaction = NULL; 397 new_bh->b_size = jh2bh(jh_in)->b_size; 398 new_bh->b_bdev = transaction->t_journal->j_dev; 399 new_bh->b_blocknr = blocknr; 400 set_buffer_mapped(new_bh); 401 set_buffer_dirty(new_bh); 402 403 *jh_out = new_jh; 404 405 /* 406 * The to-be-written buffer needs to get moved to the io queue, 407 * and the original buffer whose contents we are shadowing or 408 * copying is moved to the transaction's shadow queue. 409 */ 410 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 411 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 412 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 413 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); 414 415 return do_escape | (done_copy_out << 1); 416 } 417 418 /* 419 * Allocation code for the journal file. Manage the space left in the 420 * journal, so that we can begin checkpointing when appropriate. 421 */ 422 423 /* 424 * __jbd2_log_space_left: Return the number of free blocks left in the journal. 425 * 426 * Called with the journal already locked. 427 * 428 * Called under j_state_lock 429 */ 430 431 int __jbd2_log_space_left(journal_t *journal) 432 { 433 int left = journal->j_free; 434 435 assert_spin_locked(&journal->j_state_lock); 436 437 /* 438 * Be pessimistic here about the number of those free blocks which 439 * might be required for log descriptor control blocks. 
440 */ 441 442 #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ 443 444 left -= MIN_LOG_RESERVED_BLOCKS; 445 446 if (left <= 0) 447 return 0; 448 left -= (left >> 3); 449 return left; 450 } 451 452 /* 453 * Called under j_state_lock. Returns true if a transaction commit was started. 454 */ 455 int __jbd2_log_start_commit(journal_t *journal, tid_t target) 456 { 457 /* 458 * Are we already doing a recent enough commit? 459 */ 460 if (!tid_geq(journal->j_commit_request, target)) { 461 /* 462 * We want a new commit: OK, mark the request and wakup the 463 * commit thread. We do _not_ do the commit ourselves. 464 */ 465 466 journal->j_commit_request = target; 467 jbd_debug(1, "JBD: requesting commit %d/%d\n", 468 journal->j_commit_request, 469 journal->j_commit_sequence); 470 wake_up(&journal->j_wait_commit); 471 return 1; 472 } 473 return 0; 474 } 475 476 int jbd2_log_start_commit(journal_t *journal, tid_t tid) 477 { 478 int ret; 479 480 spin_lock(&journal->j_state_lock); 481 ret = __jbd2_log_start_commit(journal, tid); 482 spin_unlock(&journal->j_state_lock); 483 return ret; 484 } 485 486 /* 487 * Force and wait upon a commit if the calling process is not within 488 * transaction. This is used for forcing out undo-protected data which contains 489 * bitmaps, when the fs is running out of space. 490 * 491 * We can only force the running transaction if we don't have an active handle; 492 * otherwise, we will deadlock. 493 * 494 * Returns true if a transaction was started. 
495 */ 496 int jbd2_journal_force_commit_nested(journal_t *journal) 497 { 498 transaction_t *transaction = NULL; 499 tid_t tid; 500 501 spin_lock(&journal->j_state_lock); 502 if (journal->j_running_transaction && !current->journal_info) { 503 transaction = journal->j_running_transaction; 504 __jbd2_log_start_commit(journal, transaction->t_tid); 505 } else if (journal->j_committing_transaction) 506 transaction = journal->j_committing_transaction; 507 508 if (!transaction) { 509 spin_unlock(&journal->j_state_lock); 510 return 0; /* Nothing to retry */ 511 } 512 513 tid = transaction->t_tid; 514 spin_unlock(&journal->j_state_lock); 515 jbd2_log_wait_commit(journal, tid); 516 return 1; 517 } 518 519 /* 520 * Start a commit of the current running transaction (if any). Returns true 521 * if a transaction is going to be committed (or is currently already 522 * committing), and fills its tid in at *ptid 523 */ 524 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) 525 { 526 int ret = 0; 527 528 spin_lock(&journal->j_state_lock); 529 if (journal->j_running_transaction) { 530 tid_t tid = journal->j_running_transaction->t_tid; 531 532 __jbd2_log_start_commit(journal, tid); 533 /* There's a running transaction and we've just made sure 534 * it's commit has been scheduled. */ 535 if (ptid) 536 *ptid = tid; 537 ret = 1; 538 } else if (journal->j_committing_transaction) { 539 /* 540 * If ext3_write_super() recently started a commit, then we 541 * have to wait for completion of that transaction 542 */ 543 if (ptid) 544 *ptid = journal->j_committing_transaction->t_tid; 545 ret = 1; 546 } 547 spin_unlock(&journal->j_state_lock); 548 return ret; 549 } 550 551 /* 552 * Wait for a specified commit to complete. 553 * The caller may not hold the journal lock. 
554 */ 555 int jbd2_log_wait_commit(journal_t *journal, tid_t tid) 556 { 557 int err = 0; 558 559 #ifdef CONFIG_JBD2_DEBUG 560 spin_lock(&journal->j_state_lock); 561 if (!tid_geq(journal->j_commit_request, tid)) { 562 printk(KERN_EMERG 563 "%s: error: j_commit_request=%d, tid=%d\n", 564 __func__, journal->j_commit_request, tid); 565 } 566 spin_unlock(&journal->j_state_lock); 567 #endif 568 spin_lock(&journal->j_state_lock); 569 while (tid_gt(tid, journal->j_commit_sequence)) { 570 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 571 tid, journal->j_commit_sequence); 572 wake_up(&journal->j_wait_commit); 573 spin_unlock(&journal->j_state_lock); 574 wait_event(journal->j_wait_done_commit, 575 !tid_gt(tid, journal->j_commit_sequence)); 576 spin_lock(&journal->j_state_lock); 577 } 578 spin_unlock(&journal->j_state_lock); 579 580 if (unlikely(is_journal_aborted(journal))) { 581 printk(KERN_EMERG "journal commit I/O error\n"); 582 err = -EIO; 583 } 584 return err; 585 } 586 587 /* 588 * Log buffer allocation routines: 589 */ 590 591 int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) 592 { 593 unsigned long blocknr; 594 595 spin_lock(&journal->j_state_lock); 596 J_ASSERT(journal->j_free > 1); 597 598 blocknr = journal->j_head; 599 journal->j_head++; 600 journal->j_free--; 601 if (journal->j_head == journal->j_last) 602 journal->j_head = journal->j_first; 603 spin_unlock(&journal->j_state_lock); 604 return jbd2_journal_bmap(journal, blocknr, retp); 605 } 606 607 /* 608 * Conversion of logical to physical block numbers for the journal 609 * 610 * On external journals the journal blocks are identity-mapped, so 611 * this is a no-op. If needed, we can use j_blk_offset - everything is 612 * ready. 
613 */ 614 int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, 615 unsigned long long *retp) 616 { 617 int err = 0; 618 unsigned long long ret; 619 620 if (journal->j_inode) { 621 ret = bmap(journal->j_inode, blocknr); 622 if (ret) 623 *retp = ret; 624 else { 625 printk(KERN_ALERT "%s: journal block not found " 626 "at offset %lu on %s\n", 627 __func__, blocknr, journal->j_devname); 628 err = -EIO; 629 __journal_abort_soft(journal, err); 630 } 631 } else { 632 *retp = blocknr; /* +journal->j_blk_offset */ 633 } 634 return err; 635 } 636 637 /* 638 * We play buffer_head aliasing tricks to write data/metadata blocks to 639 * the journal without copying their contents, but for journal 640 * descriptor blocks we do need to generate bona fide buffers. 641 * 642 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying 643 * the buffer's contents they really should run flush_dcache_page(bh->b_page). 644 * But we don't bother doing that, so there will be coherency problems with 645 * mmaps of blockdevs which hold live JBD-controlled filesystems. 
646 */ 647 struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 648 { 649 struct buffer_head *bh; 650 unsigned long long blocknr; 651 int err; 652 653 err = jbd2_journal_next_log_block(journal, &blocknr); 654 655 if (err) 656 return NULL; 657 658 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 659 if (!bh) 660 return NULL; 661 lock_buffer(bh); 662 memset(bh->b_data, 0, journal->j_blocksize); 663 set_buffer_uptodate(bh); 664 unlock_buffer(bh); 665 BUFFER_TRACE(bh, "return this buffer"); 666 return jbd2_journal_add_journal_head(bh); 667 } 668 669 struct jbd2_stats_proc_session { 670 journal_t *journal; 671 struct transaction_stats_s *stats; 672 int start; 673 int max; 674 }; 675 676 static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, 677 struct transaction_stats_s *ts, 678 int first) 679 { 680 if (ts == s->stats + s->max) 681 ts = s->stats; 682 if (!first && ts == s->stats + s->start) 683 return NULL; 684 while (ts->ts_type == 0) { 685 ts++; 686 if (ts == s->stats + s->max) 687 ts = s->stats; 688 if (ts == s->stats + s->start) 689 return NULL; 690 } 691 return ts; 692 693 } 694 695 static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) 696 { 697 struct jbd2_stats_proc_session *s = seq->private; 698 struct transaction_stats_s *ts; 699 int l = *pos; 700 701 if (l == 0) 702 return SEQ_START_TOKEN; 703 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); 704 if (!ts) 705 return NULL; 706 l--; 707 while (l) { 708 ts = jbd2_history_skip_empty(s, ++ts, 0); 709 if (!ts) 710 break; 711 l--; 712 } 713 return ts; 714 } 715 716 static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) 717 { 718 struct jbd2_stats_proc_session *s = seq->private; 719 struct transaction_stats_s *ts = v; 720 721 ++*pos; 722 if (v == SEQ_START_TOKEN) 723 return jbd2_history_skip_empty(s, s->stats + s->start, 1); 724 else 725 return jbd2_history_skip_empty(s, ++ts, 0); 726 } 727 728 static int 
jbd2_seq_history_show(struct seq_file *seq, void *v) 729 { 730 struct transaction_stats_s *ts = v; 731 if (v == SEQ_START_TOKEN) { 732 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " 733 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", 734 "wait", "run", "lock", "flush", "log", "hndls", 735 "block", "inlog", "ctime", "write", "drop", 736 "close"); 737 return 0; 738 } 739 if (ts->ts_type == JBD2_STATS_RUN) 740 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " 741 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, 742 jiffies_to_msecs(ts->u.run.rs_wait), 743 jiffies_to_msecs(ts->u.run.rs_running), 744 jiffies_to_msecs(ts->u.run.rs_locked), 745 jiffies_to_msecs(ts->u.run.rs_flushing), 746 jiffies_to_msecs(ts->u.run.rs_logging), 747 ts->u.run.rs_handle_count, 748 ts->u.run.rs_blocks, 749 ts->u.run.rs_blocks_logged); 750 else if (ts->ts_type == JBD2_STATS_CHECKPOINT) 751 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", 752 "C", ts->ts_tid, " ", 753 jiffies_to_msecs(ts->u.chp.cs_chp_time), 754 ts->u.chp.cs_written, ts->u.chp.cs_dropped, 755 ts->u.chp.cs_forced_to_close); 756 else 757 J_ASSERT(0); 758 return 0; 759 } 760 761 static void jbd2_seq_history_stop(struct seq_file *seq, void *v) 762 { 763 } 764 765 static struct seq_operations jbd2_seq_history_ops = { 766 .start = jbd2_seq_history_start, 767 .next = jbd2_seq_history_next, 768 .stop = jbd2_seq_history_stop, 769 .show = jbd2_seq_history_show, 770 }; 771 772 static int jbd2_seq_history_open(struct inode *inode, struct file *file) 773 { 774 journal_t *journal = PDE(inode)->data; 775 struct jbd2_stats_proc_session *s; 776 int rc, size; 777 778 s = kmalloc(sizeof(*s), GFP_KERNEL); 779 if (s == NULL) 780 return -ENOMEM; 781 size = sizeof(struct transaction_stats_s) * journal->j_history_max; 782 s->stats = kmalloc(size, GFP_KERNEL); 783 if (s->stats == NULL) { 784 kfree(s); 785 return -ENOMEM; 786 } 787 spin_lock(&journal->j_history_lock); 788 memcpy(s->stats, journal->j_history, size); 789 s->max = 
journal->j_history_max; 790 s->start = journal->j_history_cur % s->max; 791 spin_unlock(&journal->j_history_lock); 792 793 rc = seq_open(file, &jbd2_seq_history_ops); 794 if (rc == 0) { 795 struct seq_file *m = file->private_data; 796 m->private = s; 797 } else { 798 kfree(s->stats); 799 kfree(s); 800 } 801 return rc; 802 803 } 804 805 static int jbd2_seq_history_release(struct inode *inode, struct file *file) 806 { 807 struct seq_file *seq = file->private_data; 808 struct jbd2_stats_proc_session *s = seq->private; 809 810 kfree(s->stats); 811 kfree(s); 812 return seq_release(inode, file); 813 } 814 815 static struct file_operations jbd2_seq_history_fops = { 816 .owner = THIS_MODULE, 817 .open = jbd2_seq_history_open, 818 .read = seq_read, 819 .llseek = seq_lseek, 820 .release = jbd2_seq_history_release, 821 }; 822 823 static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 824 { 825 return *pos ? NULL : SEQ_START_TOKEN; 826 } 827 828 static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) 829 { 830 return NULL; 831 } 832 833 static int jbd2_seq_info_show(struct seq_file *seq, void *v) 834 { 835 struct jbd2_stats_proc_session *s = seq->private; 836 837 if (v != SEQ_START_TOKEN) 838 return 0; 839 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 840 s->stats->ts_tid, 841 s->journal->j_max_transaction_buffers); 842 if (s->stats->ts_tid == 0) 843 return 0; 844 seq_printf(seq, "average: \n %ums waiting for transaction\n", 845 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 846 seq_printf(seq, " %ums running transaction\n", 847 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 848 seq_printf(seq, " %ums transaction was being locked\n", 849 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 850 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 851 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 852 seq_printf(seq, " %ums logging transaction\n", 853 
jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 854 seq_printf(seq, " %lluus average transaction commit time\n", 855 div_u64(s->journal->j_average_commit_time, 1000)); 856 seq_printf(seq, " %lu handles per transaction\n", 857 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 858 seq_printf(seq, " %lu blocks per transaction\n", 859 s->stats->u.run.rs_blocks / s->stats->ts_tid); 860 seq_printf(seq, " %lu logged blocks per transaction\n", 861 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 862 return 0; 863 } 864 865 static void jbd2_seq_info_stop(struct seq_file *seq, void *v) 866 { 867 } 868 869 static struct seq_operations jbd2_seq_info_ops = { 870 .start = jbd2_seq_info_start, 871 .next = jbd2_seq_info_next, 872 .stop = jbd2_seq_info_stop, 873 .show = jbd2_seq_info_show, 874 }; 875 876 static int jbd2_seq_info_open(struct inode *inode, struct file *file) 877 { 878 journal_t *journal = PDE(inode)->data; 879 struct jbd2_stats_proc_session *s; 880 int rc, size; 881 882 s = kmalloc(sizeof(*s), GFP_KERNEL); 883 if (s == NULL) 884 return -ENOMEM; 885 size = sizeof(struct transaction_stats_s); 886 s->stats = kmalloc(size, GFP_KERNEL); 887 if (s->stats == NULL) { 888 kfree(s); 889 return -ENOMEM; 890 } 891 spin_lock(&journal->j_history_lock); 892 memcpy(s->stats, &journal->j_stats, size); 893 s->journal = journal; 894 spin_unlock(&journal->j_history_lock); 895 896 rc = seq_open(file, &jbd2_seq_info_ops); 897 if (rc == 0) { 898 struct seq_file *m = file->private_data; 899 m->private = s; 900 } else { 901 kfree(s->stats); 902 kfree(s); 903 } 904 return rc; 905 906 } 907 908 static int jbd2_seq_info_release(struct inode *inode, struct file *file) 909 { 910 struct seq_file *seq = file->private_data; 911 struct jbd2_stats_proc_session *s = seq->private; 912 kfree(s->stats); 913 kfree(s); 914 return seq_release(inode, file); 915 } 916 917 static struct file_operations jbd2_seq_info_fops = { 918 .owner = THIS_MODULE, 919 .open = jbd2_seq_info_open, 920 
.read = seq_read, 921 .llseek = seq_lseek, 922 .release = jbd2_seq_info_release, 923 }; 924 925 static struct proc_dir_entry *proc_jbd2_stats; 926 927 static void jbd2_stats_proc_init(journal_t *journal) 928 { 929 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 930 if (journal->j_proc_entry) { 931 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 932 &jbd2_seq_history_fops, journal); 933 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 934 &jbd2_seq_info_fops, journal); 935 } 936 } 937 938 static void jbd2_stats_proc_exit(journal_t *journal) 939 { 940 remove_proc_entry("info", journal->j_proc_entry); 941 remove_proc_entry("history", journal->j_proc_entry); 942 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 943 } 944 945 static void journal_init_stats(journal_t *journal) 946 { 947 int size; 948 949 if (!proc_jbd2_stats) 950 return; 951 952 journal->j_history_max = 100; 953 size = sizeof(struct transaction_stats_s) * journal->j_history_max; 954 journal->j_history = kzalloc(size, GFP_KERNEL); 955 if (!journal->j_history) { 956 journal->j_history_max = 0; 957 return; 958 } 959 spin_lock_init(&journal->j_history_lock); 960 } 961 962 /* 963 * Management for journal control blocks: functions to create and 964 * destroy journal_t structures, and to initialise and read existing 965 * journal blocks from disk. */ 966 967 /* First: create and setup a journal_t object in memory. We initialise 968 * very few fields yet: that has to wait until we have created the 969 * journal structures from from scratch, or loaded them from disk. 
*/

static journal_t * journal_init_common (void)
{
	journal_t *journal;

	journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL);
	if (journal == NULL)
		return NULL;

	/* Wait queues */
	init_waitqueue_head(&journal->j_wait_transaction_locked);
	init_waitqueue_head(&journal->j_wait_logspace);
	init_waitqueue_head(&journal->j_wait_done_commit);
	init_waitqueue_head(&journal->j_wait_checkpoint);
	init_waitqueue_head(&journal->j_wait_commit);
	init_waitqueue_head(&journal->j_wait_updates);

	/* Locks */
	mutex_init(&journal->j_barrier);
	mutex_init(&journal->j_checkpoint_mutex);
	spin_lock_init(&journal->j_revoke_lock);
	spin_lock_init(&journal->j_list_lock);
	spin_lock_init(&journal->j_state_lock);

	/* Default commit timing */
	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
	journal->j_min_batch_time = 0;
	journal->j_max_batch_time = 15000; /* 15ms */

	/* The journal is marked for error until we succeed with recovery! */
	journal->j_flags = JBD2_ABORT;

	/* Set up a default-sized revoke table for the new mount. */
	if (jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH)) {
		kfree(journal);
		return NULL;
	}

	journal_init_stats(journal);

	return journal;
}

/* jbd2_journal_init_dev and jbd2_journal_init_inode:
 *
 * Create a journal structure assigned some fixed set of disk blocks to
 * the journal.  We don't actually touch those disk blocks yet, but we
 * need to set up all of the mapping information to tell the journaling
 * system where the journal blocks are.
 *
 */

/**
 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
 *  @bdev: Block device on which to create the journal
 *  @fs_dev: Device which hold journalled filesystem for this journal.
 *  @start: Block nr Start of journal.
1027 * @len: Length of the journal in blocks. 1028 * @blocksize: blocksize of journalling device 1029 * 1030 * Returns: a newly created journal_t * 1031 * 1032 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous 1033 * range of blocks on an arbitrary block device. 1034 * 1035 */ 1036 journal_t * jbd2_journal_init_dev(struct block_device *bdev, 1037 struct block_device *fs_dev, 1038 unsigned long long start, int len, int blocksize) 1039 { 1040 journal_t *journal = journal_init_common(); 1041 struct buffer_head *bh; 1042 char *p; 1043 int n; 1044 1045 if (!journal) 1046 return NULL; 1047 1048 /* journal descriptor can store up to n blocks -bzzz */ 1049 journal->j_blocksize = blocksize; 1050 jbd2_stats_proc_init(journal); 1051 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1052 journal->j_wbufsize = n; 1053 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1054 if (!journal->j_wbuf) { 1055 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1056 __func__); 1057 goto out_err; 1058 } 1059 journal->j_dev = bdev; 1060 journal->j_fs_dev = fs_dev; 1061 journal->j_blk_offset = start; 1062 journal->j_maxlen = len; 1063 bdevname(journal->j_dev, journal->j_devname); 1064 p = journal->j_devname; 1065 while ((p = strchr(p, '/'))) 1066 *p = '!'; 1067 1068 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1069 if (!bh) { 1070 printk(KERN_ERR 1071 "%s: Cannot get buffer for journal superblock\n", 1072 __func__); 1073 goto out_err; 1074 } 1075 journal->j_sb_buffer = bh; 1076 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1077 1078 return journal; 1079 out_err: 1080 jbd2_stats_proc_exit(journal); 1081 kfree(journal); 1082 return NULL; 1083 } 1084 1085 /** 1086 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. 1087 * @inode: An inode to create the journal in 1088 * 1089 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as 1090 * the journal. 
The inode must exist already, must support bmap() and 1091 * must have all data blocks preallocated. 1092 */ 1093 journal_t * jbd2_journal_init_inode (struct inode *inode) 1094 { 1095 struct buffer_head *bh; 1096 journal_t *journal = journal_init_common(); 1097 char *p; 1098 int err; 1099 int n; 1100 unsigned long long blocknr; 1101 1102 if (!journal) 1103 return NULL; 1104 1105 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1106 journal->j_inode = inode; 1107 bdevname(journal->j_dev, journal->j_devname); 1108 p = journal->j_devname; 1109 while ((p = strchr(p, '/'))) 1110 *p = '!'; 1111 p = journal->j_devname + strlen(journal->j_devname); 1112 sprintf(p, ":%lu", journal->j_inode->i_ino); 1113 jbd_debug(1, 1114 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1115 journal, inode->i_sb->s_id, inode->i_ino, 1116 (long long) inode->i_size, 1117 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); 1118 1119 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; 1120 journal->j_blocksize = inode->i_sb->s_blocksize; 1121 jbd2_stats_proc_init(journal); 1122 1123 /* journal descriptor can store up to n blocks -bzzz */ 1124 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1125 journal->j_wbufsize = n; 1126 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1127 if (!journal->j_wbuf) { 1128 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1129 __func__); 1130 goto out_err; 1131 } 1132 1133 err = jbd2_journal_bmap(journal, 0, &blocknr); 1134 /* If that failed, give up */ 1135 if (err) { 1136 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1137 __func__); 1138 goto out_err; 1139 } 1140 1141 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1142 if (!bh) { 1143 printk(KERN_ERR 1144 "%s: Cannot get buffer for journal superblock\n", 1145 __func__); 1146 goto out_err; 1147 } 1148 journal->j_sb_buffer = bh; 1149 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1150 
1151 return journal; 1152 out_err: 1153 jbd2_stats_proc_exit(journal); 1154 kfree(journal); 1155 return NULL; 1156 } 1157 1158 /* 1159 * If the journal init or create aborts, we need to mark the journal 1160 * superblock as being NULL to prevent the journal destroy from writing 1161 * back a bogus superblock. 1162 */ 1163 static void journal_fail_superblock (journal_t *journal) 1164 { 1165 struct buffer_head *bh = journal->j_sb_buffer; 1166 brelse(bh); 1167 journal->j_sb_buffer = NULL; 1168 } 1169 1170 /* 1171 * Given a journal_t structure, initialise the various fields for 1172 * startup of a new journaling session. We use this both when creating 1173 * a journal, and after recovering an old journal to reset it for 1174 * subsequent use. 1175 */ 1176 1177 static int journal_reset(journal_t *journal) 1178 { 1179 journal_superblock_t *sb = journal->j_superblock; 1180 unsigned long long first, last; 1181 1182 first = be32_to_cpu(sb->s_first); 1183 last = be32_to_cpu(sb->s_maxlen); 1184 1185 journal->j_first = first; 1186 journal->j_last = last; 1187 1188 journal->j_head = first; 1189 journal->j_tail = first; 1190 journal->j_free = last - first; 1191 1192 journal->j_tail_sequence = journal->j_transaction_sequence; 1193 journal->j_commit_sequence = journal->j_transaction_sequence - 1; 1194 journal->j_commit_request = journal->j_commit_sequence; 1195 1196 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 1197 1198 /* Add the dynamic fields and write it to disk. */ 1199 jbd2_journal_update_superblock(journal, 1); 1200 return jbd2_journal_start_thread(journal); 1201 } 1202 1203 /** 1204 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1205 * @journal: The journal to update. 1206 * @wait: Set to '0' if you don't want to wait for IO completion. 1207 * 1208 * Update a journal's dynamic superblock fields and write it to disk, 1209 * optionally waiting for the IO to complete. 
1210 */ 1211 void jbd2_journal_update_superblock(journal_t *journal, int wait) 1212 { 1213 journal_superblock_t *sb = journal->j_superblock; 1214 struct buffer_head *bh = journal->j_sb_buffer; 1215 1216 /* 1217 * As a special case, if the on-disk copy is already marked as needing 1218 * no recovery (s_start == 0) and there are no outstanding transactions 1219 * in the filesystem, then we can safely defer the superblock update 1220 * until the next commit by setting JBD2_FLUSHED. This avoids 1221 * attempting a write to a potential-readonly device. 1222 */ 1223 if (sb->s_start == 0 && journal->j_tail_sequence == 1224 journal->j_transaction_sequence) { 1225 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 1226 "(start %ld, seq %d, errno %d)\n", 1227 journal->j_tail, journal->j_tail_sequence, 1228 journal->j_errno); 1229 goto out; 1230 } 1231 1232 if (buffer_write_io_error(bh)) { 1233 /* 1234 * Oh, dear. A previous attempt to write the journal 1235 * superblock failed. This could happen because the 1236 * USB device was yanked out. Or it could happen to 1237 * be a transient write error and maybe the block will 1238 * be remapped. Nothing we can do but to retry the 1239 * write and hope for the best. 
1240 */ 1241 printk(KERN_ERR "JBD2: previous I/O error detected " 1242 "for journal superblock update for %s.\n", 1243 journal->j_devname); 1244 clear_buffer_write_io_error(bh); 1245 set_buffer_uptodate(bh); 1246 } 1247 1248 spin_lock(&journal->j_state_lock); 1249 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1250 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1251 1252 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1253 sb->s_start = cpu_to_be32(journal->j_tail); 1254 sb->s_errno = cpu_to_be32(journal->j_errno); 1255 spin_unlock(&journal->j_state_lock); 1256 1257 BUFFER_TRACE(bh, "marking dirty"); 1258 mark_buffer_dirty(bh); 1259 if (wait) { 1260 sync_dirty_buffer(bh); 1261 if (buffer_write_io_error(bh)) { 1262 printk(KERN_ERR "JBD2: I/O error detected " 1263 "when updating journal superblock for %s.\n", 1264 journal->j_devname); 1265 clear_buffer_write_io_error(bh); 1266 set_buffer_uptodate(bh); 1267 } 1268 } else 1269 ll_rw_block(SWRITE, 1, &bh); 1270 1271 out: 1272 /* If we have just flushed the log (by marking s_start==0), then 1273 * any future commit will have to be careful to update the 1274 * superblock again to re-record the true start of the log. */ 1275 1276 spin_lock(&journal->j_state_lock); 1277 if (sb->s_start) 1278 journal->j_flags &= ~JBD2_FLUSHED; 1279 else 1280 journal->j_flags |= JBD2_FLUSHED; 1281 spin_unlock(&journal->j_state_lock); 1282 } 1283 1284 /* 1285 * Read the superblock for a given journal, performing initial 1286 * validation of the format. 
1287 */ 1288 1289 static int journal_get_superblock(journal_t *journal) 1290 { 1291 struct buffer_head *bh; 1292 journal_superblock_t *sb; 1293 int err = -EIO; 1294 1295 bh = journal->j_sb_buffer; 1296 1297 J_ASSERT(bh != NULL); 1298 if (!buffer_uptodate(bh)) { 1299 ll_rw_block(READ, 1, &bh); 1300 wait_on_buffer(bh); 1301 if (!buffer_uptodate(bh)) { 1302 printk (KERN_ERR 1303 "JBD: IO error reading journal superblock\n"); 1304 goto out; 1305 } 1306 } 1307 1308 sb = journal->j_superblock; 1309 1310 err = -EINVAL; 1311 1312 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || 1313 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1314 printk(KERN_WARNING "JBD: no valid journal superblock found\n"); 1315 goto out; 1316 } 1317 1318 switch(be32_to_cpu(sb->s_header.h_blocktype)) { 1319 case JBD2_SUPERBLOCK_V1: 1320 journal->j_format_version = 1; 1321 break; 1322 case JBD2_SUPERBLOCK_V2: 1323 journal->j_format_version = 2; 1324 break; 1325 default: 1326 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); 1327 goto out; 1328 } 1329 1330 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) 1331 journal->j_maxlen = be32_to_cpu(sb->s_maxlen); 1332 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { 1333 printk (KERN_WARNING "JBD: journal file too short\n"); 1334 goto out; 1335 } 1336 1337 return 0; 1338 1339 out: 1340 journal_fail_superblock(journal); 1341 return err; 1342 } 1343 1344 /* 1345 * Load the on-disk journal superblock and read the key fields into the 1346 * journal_t. 
1347 */ 1348 1349 static int load_superblock(journal_t *journal) 1350 { 1351 int err; 1352 journal_superblock_t *sb; 1353 1354 err = journal_get_superblock(journal); 1355 if (err) 1356 return err; 1357 1358 sb = journal->j_superblock; 1359 1360 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); 1361 journal->j_tail = be32_to_cpu(sb->s_start); 1362 journal->j_first = be32_to_cpu(sb->s_first); 1363 journal->j_last = be32_to_cpu(sb->s_maxlen); 1364 journal->j_errno = be32_to_cpu(sb->s_errno); 1365 1366 return 0; 1367 } 1368 1369 1370 /** 1371 * int jbd2_journal_load() - Read journal from disk. 1372 * @journal: Journal to act on. 1373 * 1374 * Given a journal_t structure which tells us which disk blocks contain 1375 * a journal, read the journal from disk to initialise the in-memory 1376 * structures. 1377 */ 1378 int jbd2_journal_load(journal_t *journal) 1379 { 1380 int err; 1381 journal_superblock_t *sb; 1382 1383 err = load_superblock(journal); 1384 if (err) 1385 return err; 1386 1387 sb = journal->j_superblock; 1388 /* If this is a V2 superblock, then we have to check the 1389 * features flags on it. */ 1390 1391 if (journal->j_format_version >= 2) { 1392 if ((sb->s_feature_ro_compat & 1393 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || 1394 (sb->s_feature_incompat & 1395 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { 1396 printk (KERN_WARNING 1397 "JBD: Unrecognised features on journal\n"); 1398 return -EINVAL; 1399 } 1400 } 1401 1402 /* Let the recovery code check whether it needs to recover any 1403 * data from the journal. */ 1404 if (jbd2_journal_recover(journal)) 1405 goto recovery_error; 1406 1407 /* OK, we've finished with the dynamic journal bits: 1408 * reinitialise the dynamic contents of the superblock in memory 1409 * and reset them on disk. 
*/ 1410 if (journal_reset(journal)) 1411 goto recovery_error; 1412 1413 journal->j_flags &= ~JBD2_ABORT; 1414 journal->j_flags |= JBD2_LOADED; 1415 return 0; 1416 1417 recovery_error: 1418 printk (KERN_WARNING "JBD: recovery failed\n"); 1419 return -EIO; 1420 } 1421 1422 /** 1423 * void jbd2_journal_destroy() - Release a journal_t structure. 1424 * @journal: Journal to act on. 1425 * 1426 * Release a journal_t structure once it is no longer in use by the 1427 * journaled object. 1428 * Return <0 if we couldn't clean up the journal. 1429 */ 1430 int jbd2_journal_destroy(journal_t *journal) 1431 { 1432 int err = 0; 1433 1434 /* Wait for the commit thread to wake up and die. */ 1435 journal_kill_thread(journal); 1436 1437 /* Force a final log commit */ 1438 if (journal->j_running_transaction) 1439 jbd2_journal_commit_transaction(journal); 1440 1441 /* Force any old transactions to disk */ 1442 1443 /* Totally anal locking here... */ 1444 spin_lock(&journal->j_list_lock); 1445 while (journal->j_checkpoint_transactions != NULL) { 1446 spin_unlock(&journal->j_list_lock); 1447 mutex_lock(&journal->j_checkpoint_mutex); 1448 jbd2_log_do_checkpoint(journal); 1449 mutex_unlock(&journal->j_checkpoint_mutex); 1450 spin_lock(&journal->j_list_lock); 1451 } 1452 1453 J_ASSERT(journal->j_running_transaction == NULL); 1454 J_ASSERT(journal->j_committing_transaction == NULL); 1455 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1456 spin_unlock(&journal->j_list_lock); 1457 1458 if (journal->j_sb_buffer) { 1459 if (!is_journal_aborted(journal)) { 1460 /* We can now mark the journal as empty. 
*/ 1461 journal->j_tail = 0; 1462 journal->j_tail_sequence = 1463 ++journal->j_transaction_sequence; 1464 jbd2_journal_update_superblock(journal, 1); 1465 } else { 1466 err = -EIO; 1467 } 1468 brelse(journal->j_sb_buffer); 1469 } 1470 1471 if (journal->j_proc_entry) 1472 jbd2_stats_proc_exit(journal); 1473 if (journal->j_inode) 1474 iput(journal->j_inode); 1475 if (journal->j_revoke) 1476 jbd2_journal_destroy_revoke(journal); 1477 kfree(journal->j_wbuf); 1478 kfree(journal); 1479 1480 return err; 1481 } 1482 1483 1484 /** 1485 *int jbd2_journal_check_used_features () - Check if features specified are used. 1486 * @journal: Journal to check. 1487 * @compat: bitmask of compatible features 1488 * @ro: bitmask of features that force read-only mount 1489 * @incompat: bitmask of incompatible features 1490 * 1491 * Check whether the journal uses all of a given set of 1492 * features. Return true (non-zero) if it does. 1493 **/ 1494 1495 int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, 1496 unsigned long ro, unsigned long incompat) 1497 { 1498 journal_superblock_t *sb; 1499 1500 if (!compat && !ro && !incompat) 1501 return 1; 1502 if (journal->j_format_version == 1) 1503 return 0; 1504 1505 sb = journal->j_superblock; 1506 1507 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && 1508 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && 1509 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) 1510 return 1; 1511 1512 return 0; 1513 } 1514 1515 /** 1516 * int jbd2_journal_check_available_features() - Check feature set in journalling layer 1517 * @journal: Journal to check. 1518 * @compat: bitmask of compatible features 1519 * @ro: bitmask of features that force read-only mount 1520 * @incompat: bitmask of incompatible features 1521 * 1522 * Check whether the journaling code supports the use of 1523 * all of a given set of features on this journal. Return true 1524 * (non-zero) if it can. 
*/ 1525 1526 int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, 1527 unsigned long ro, unsigned long incompat) 1528 { 1529 journal_superblock_t *sb; 1530 1531 if (!compat && !ro && !incompat) 1532 return 1; 1533 1534 sb = journal->j_superblock; 1535 1536 /* We can support any known requested features iff the 1537 * superblock is in version 2. Otherwise we fail to support any 1538 * extended sb features. */ 1539 1540 if (journal->j_format_version != 2) 1541 return 0; 1542 1543 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && 1544 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && 1545 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) 1546 return 1; 1547 1548 return 0; 1549 } 1550 1551 /** 1552 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock 1553 * @journal: Journal to act on. 1554 * @compat: bitmask of compatible features 1555 * @ro: bitmask of features that force read-only mount 1556 * @incompat: bitmask of incompatible features 1557 * 1558 * Mark a given journal feature as present on the 1559 * superblock. Returns true if the requested features could be set. 1560 * 1561 */ 1562 1563 int jbd2_journal_set_features (journal_t *journal, unsigned long compat, 1564 unsigned long ro, unsigned long incompat) 1565 { 1566 journal_superblock_t *sb; 1567 1568 if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) 1569 return 1; 1570 1571 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1572 return 0; 1573 1574 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", 1575 compat, ro, incompat); 1576 1577 sb = journal->j_superblock; 1578 1579 sb->s_feature_compat |= cpu_to_be32(compat); 1580 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1581 sb->s_feature_incompat |= cpu_to_be32(incompat); 1582 1583 return 1; 1584 } 1585 1586 /* 1587 * jbd2_journal_clear_features () - Clear a given journal feature in the 1588 * superblock 1589 * @journal: Journal to act on. 
1590 * @compat: bitmask of compatible features 1591 * @ro: bitmask of features that force read-only mount 1592 * @incompat: bitmask of incompatible features 1593 * 1594 * Clear a given journal feature as present on the 1595 * superblock. 1596 */ 1597 void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, 1598 unsigned long ro, unsigned long incompat) 1599 { 1600 journal_superblock_t *sb; 1601 1602 jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", 1603 compat, ro, incompat); 1604 1605 sb = journal->j_superblock; 1606 1607 sb->s_feature_compat &= ~cpu_to_be32(compat); 1608 sb->s_feature_ro_compat &= ~cpu_to_be32(ro); 1609 sb->s_feature_incompat &= ~cpu_to_be32(incompat); 1610 } 1611 EXPORT_SYMBOL(jbd2_journal_clear_features); 1612 1613 /** 1614 * int jbd2_journal_update_format () - Update on-disk journal structure. 1615 * @journal: Journal to act on. 1616 * 1617 * Given an initialised but unloaded journal struct, poke about in the 1618 * on-disk structure to update it to the most recent supported version. 
1619 */ 1620 int jbd2_journal_update_format (journal_t *journal) 1621 { 1622 journal_superblock_t *sb; 1623 int err; 1624 1625 err = journal_get_superblock(journal); 1626 if (err) 1627 return err; 1628 1629 sb = journal->j_superblock; 1630 1631 switch (be32_to_cpu(sb->s_header.h_blocktype)) { 1632 case JBD2_SUPERBLOCK_V2: 1633 return 0; 1634 case JBD2_SUPERBLOCK_V1: 1635 return journal_convert_superblock_v1(journal, sb); 1636 default: 1637 break; 1638 } 1639 return -EINVAL; 1640 } 1641 1642 static int journal_convert_superblock_v1(journal_t *journal, 1643 journal_superblock_t *sb) 1644 { 1645 int offset, blocksize; 1646 struct buffer_head *bh; 1647 1648 printk(KERN_WARNING 1649 "JBD: Converting superblock from version 1 to 2.\n"); 1650 1651 /* Pre-initialise new fields to zero */ 1652 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); 1653 blocksize = be32_to_cpu(sb->s_blocksize); 1654 memset(&sb->s_feature_compat, 0, blocksize-offset); 1655 1656 sb->s_nr_users = cpu_to_be32(1); 1657 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); 1658 journal->j_format_version = 2; 1659 1660 bh = journal->j_sb_buffer; 1661 BUFFER_TRACE(bh, "marking dirty"); 1662 mark_buffer_dirty(bh); 1663 sync_dirty_buffer(bh); 1664 return 0; 1665 } 1666 1667 1668 /** 1669 * int jbd2_journal_flush () - Flush journal 1670 * @journal: Journal to act on. 1671 * 1672 * Flush all data for a given journal to disk and empty the journal. 1673 * Filesystems can use this when remounting readonly to ensure that 1674 * recovery does not need to happen on remount. 1675 */ 1676 1677 int jbd2_journal_flush(journal_t *journal) 1678 { 1679 int err = 0; 1680 transaction_t *transaction = NULL; 1681 unsigned long old_tail; 1682 1683 spin_lock(&journal->j_state_lock); 1684 1685 /* Force everything buffered to the log... 
*/ 1686 if (journal->j_running_transaction) { 1687 transaction = journal->j_running_transaction; 1688 __jbd2_log_start_commit(journal, transaction->t_tid); 1689 } else if (journal->j_committing_transaction) 1690 transaction = journal->j_committing_transaction; 1691 1692 /* Wait for the log commit to complete... */ 1693 if (transaction) { 1694 tid_t tid = transaction->t_tid; 1695 1696 spin_unlock(&journal->j_state_lock); 1697 jbd2_log_wait_commit(journal, tid); 1698 } else { 1699 spin_unlock(&journal->j_state_lock); 1700 } 1701 1702 /* ...and flush everything in the log out to disk. */ 1703 spin_lock(&journal->j_list_lock); 1704 while (!err && journal->j_checkpoint_transactions != NULL) { 1705 spin_unlock(&journal->j_list_lock); 1706 mutex_lock(&journal->j_checkpoint_mutex); 1707 err = jbd2_log_do_checkpoint(journal); 1708 mutex_unlock(&journal->j_checkpoint_mutex); 1709 spin_lock(&journal->j_list_lock); 1710 } 1711 spin_unlock(&journal->j_list_lock); 1712 1713 if (is_journal_aborted(journal)) 1714 return -EIO; 1715 1716 jbd2_cleanup_journal_tail(journal); 1717 1718 /* Finally, mark the journal as really needing no recovery. 1719 * This sets s_start==0 in the underlying superblock, which is 1720 * the magic code for a fully-recovered superblock. Any future 1721 * commits of data to the journal will restore the current 1722 * s_start value. 
*/ 1723 spin_lock(&journal->j_state_lock); 1724 old_tail = journal->j_tail; 1725 journal->j_tail = 0; 1726 spin_unlock(&journal->j_state_lock); 1727 jbd2_journal_update_superblock(journal, 1); 1728 spin_lock(&journal->j_state_lock); 1729 journal->j_tail = old_tail; 1730 1731 J_ASSERT(!journal->j_running_transaction); 1732 J_ASSERT(!journal->j_committing_transaction); 1733 J_ASSERT(!journal->j_checkpoint_transactions); 1734 J_ASSERT(journal->j_head == journal->j_tail); 1735 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1736 spin_unlock(&journal->j_state_lock); 1737 return 0; 1738 } 1739 1740 /** 1741 * int jbd2_journal_wipe() - Wipe journal contents 1742 * @journal: Journal to act on. 1743 * @write: flag (see below) 1744 * 1745 * Wipe out all of the contents of a journal, safely. This will produce 1746 * a warning if the journal contains any valid recovery information. 1747 * Must be called between journal_init_*() and jbd2_journal_load(). 1748 * 1749 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise 1750 * we merely suppress recovery. 1751 */ 1752 1753 int jbd2_journal_wipe(journal_t *journal, int write) 1754 { 1755 journal_superblock_t *sb; 1756 int err = 0; 1757 1758 J_ASSERT (!(journal->j_flags & JBD2_LOADED)); 1759 1760 err = load_superblock(journal); 1761 if (err) 1762 return err; 1763 1764 sb = journal->j_superblock; 1765 1766 if (!journal->j_tail) 1767 goto no_recovery; 1768 1769 printk (KERN_WARNING "JBD: %s recovery information on journal\n", 1770 write ? "Clearing" : "Ignoring"); 1771 1772 err = jbd2_journal_skip_recovery(journal); 1773 if (write) 1774 jbd2_journal_update_superblock(journal, 1); 1775 1776 no_recovery: 1777 return err; 1778 } 1779 1780 /* 1781 * Journal abort has very specific semantics, which we describe 1782 * for journal abort. 1783 * 1784 * Two internal functions, which provide abort to the jbd layer 1785 * itself are here. 
1786 */ 1787 1788 /* 1789 * Quick version for internal journal use (doesn't lock the journal). 1790 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, 1791 * and don't attempt to make any other journal updates. 1792 */ 1793 void __jbd2_journal_abort_hard(journal_t *journal) 1794 { 1795 transaction_t *transaction; 1796 1797 if (journal->j_flags & JBD2_ABORT) 1798 return; 1799 1800 printk(KERN_ERR "Aborting journal on device %s.\n", 1801 journal->j_devname); 1802 1803 spin_lock(&journal->j_state_lock); 1804 journal->j_flags |= JBD2_ABORT; 1805 transaction = journal->j_running_transaction; 1806 if (transaction) 1807 __jbd2_log_start_commit(journal, transaction->t_tid); 1808 spin_unlock(&journal->j_state_lock); 1809 } 1810 1811 /* Soft abort: record the abort error status in the journal superblock, 1812 * but don't do any other IO. */ 1813 static void __journal_abort_soft (journal_t *journal, int errno) 1814 { 1815 if (journal->j_flags & JBD2_ABORT) 1816 return; 1817 1818 if (!journal->j_errno) 1819 journal->j_errno = errno; 1820 1821 __jbd2_journal_abort_hard(journal); 1822 1823 if (errno) 1824 jbd2_journal_update_superblock(journal, 1); 1825 } 1826 1827 /** 1828 * void jbd2_journal_abort () - Shutdown the journal immediately. 1829 * @journal: the journal to shutdown. 1830 * @errno: an error number to record in the journal indicating 1831 * the reason for the shutdown. 1832 * 1833 * Perform a complete, immediate shutdown of the ENTIRE 1834 * journal (not of a single transaction). This operation cannot be 1835 * undone without closing and reopening the journal. 1836 * 1837 * The jbd2_journal_abort function is intended to support higher level error 1838 * recovery mechanisms such as the ext2/ext3 remount-readonly error 1839 * mode. 1840 * 1841 * Journal abort has very specific semantics. 
Any existing dirty, 1842 * unjournaled buffers in the main filesystem will still be written to 1843 * disk by bdflush, but the journaling mechanism will be suspended 1844 * immediately and no further transaction commits will be honoured. 1845 * 1846 * Any dirty, journaled buffers will be written back to disk without 1847 * hitting the journal. Atomicity cannot be guaranteed on an aborted 1848 * filesystem, but we _do_ attempt to leave as much data as possible 1849 * behind for fsck to use for cleanup. 1850 * 1851 * Any attempt to get a new transaction handle on a journal which is in 1852 * ABORT state will just result in an -EROFS error return. A 1853 * jbd2_journal_stop on an existing handle will return -EIO if we have 1854 * entered abort state during the update. 1855 * 1856 * Recursive transactions are not disturbed by journal abort until the 1857 * final jbd2_journal_stop, which will receive the -EIO error. 1858 * 1859 * Finally, the jbd2_journal_abort call allows the caller to supply an errno 1860 * which will be recorded (if possible) in the journal superblock. This 1861 * allows a client to record failure conditions in the middle of a 1862 * transaction without having to complete the transaction to record the 1863 * failure to disk. ext3_error, for example, now uses this 1864 * functionality. 1865 * 1866 * Errors which originate from within the journaling layer will NOT 1867 * supply an errno; a null errno implies that absolutely no further 1868 * writes are done to the journal (unless there are any already in 1869 * progress). 1870 * 1871 */ 1872 1873 void jbd2_journal_abort(journal_t *journal, int errno) 1874 { 1875 __journal_abort_soft(journal, errno); 1876 } 1877 1878 /** 1879 * int jbd2_journal_errno () - returns the journal's error state. 1880 * @journal: journal to examine. 
1881 * 1882 * This is the errno number set with jbd2_journal_abort(), the last 1883 * time the journal was mounted - if the journal was stopped 1884 * without calling abort this will be 0. 1885 * 1886 * If the journal has been aborted on this mount time -EROFS will 1887 * be returned. 1888 */ 1889 int jbd2_journal_errno(journal_t *journal) 1890 { 1891 int err; 1892 1893 spin_lock(&journal->j_state_lock); 1894 if (journal->j_flags & JBD2_ABORT) 1895 err = -EROFS; 1896 else 1897 err = journal->j_errno; 1898 spin_unlock(&journal->j_state_lock); 1899 return err; 1900 } 1901 1902 /** 1903 * int jbd2_journal_clear_err () - clears the journal's error state 1904 * @journal: journal to act on. 1905 * 1906 * An error must be cleared or acked to take a FS out of readonly 1907 * mode. 1908 */ 1909 int jbd2_journal_clear_err(journal_t *journal) 1910 { 1911 int err = 0; 1912 1913 spin_lock(&journal->j_state_lock); 1914 if (journal->j_flags & JBD2_ABORT) 1915 err = -EROFS; 1916 else 1917 journal->j_errno = 0; 1918 spin_unlock(&journal->j_state_lock); 1919 return err; 1920 } 1921 1922 /** 1923 * void jbd2_journal_ack_err() - Ack journal err. 1924 * @journal: journal to act on. 1925 * 1926 * An error must be cleared or acked to take a FS out of readonly 1927 * mode. 1928 */ 1929 void jbd2_journal_ack_err(journal_t *journal) 1930 { 1931 spin_lock(&journal->j_state_lock); 1932 if (journal->j_errno) 1933 journal->j_flags |= JBD2_ACK_ERR; 1934 spin_unlock(&journal->j_state_lock); 1935 } 1936 1937 int jbd2_journal_blocks_per_page(struct inode *inode) 1938 { 1939 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1940 } 1941 1942 /* 1943 * helper functions to deal with 32 or 64bit block numbers. 
1944 */ 1945 size_t journal_tag_bytes(journal_t *journal) 1946 { 1947 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 1948 return JBD2_TAG_SIZE64; 1949 else 1950 return JBD2_TAG_SIZE32; 1951 } 1952 1953 /* 1954 * Journal_head storage management 1955 */ 1956 static struct kmem_cache *jbd2_journal_head_cache; 1957 #ifdef CONFIG_JBD2_DEBUG 1958 static atomic_t nr_journal_heads = ATOMIC_INIT(0); 1959 #endif 1960 1961 static int journal_init_jbd2_journal_head_cache(void) 1962 { 1963 int retval; 1964 1965 J_ASSERT(jbd2_journal_head_cache == NULL); 1966 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", 1967 sizeof(struct journal_head), 1968 0, /* offset */ 1969 SLAB_TEMPORARY, /* flags */ 1970 NULL); /* ctor */ 1971 retval = 0; 1972 if (!jbd2_journal_head_cache) { 1973 retval = -ENOMEM; 1974 printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); 1975 } 1976 return retval; 1977 } 1978 1979 static void jbd2_journal_destroy_jbd2_journal_head_cache(void) 1980 { 1981 if (jbd2_journal_head_cache) { 1982 kmem_cache_destroy(jbd2_journal_head_cache); 1983 jbd2_journal_head_cache = NULL; 1984 } 1985 } 1986 1987 /* 1988 * journal_head splicing and dicing 1989 */ 1990 static struct journal_head *journal_alloc_journal_head(void) 1991 { 1992 struct journal_head *ret; 1993 static unsigned long last_warning; 1994 1995 #ifdef CONFIG_JBD2_DEBUG 1996 atomic_inc(&nr_journal_heads); 1997 #endif 1998 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1999 if (!ret) { 2000 jbd_debug(1, "out of memory for journal_head\n"); 2001 if (time_after(jiffies, last_warning + 5*HZ)) { 2002 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", 2003 __func__); 2004 last_warning = jiffies; 2005 } 2006 while (!ret) { 2007 yield(); 2008 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2009 } 2010 } 2011 return ret; 2012 } 2013 2014 static void journal_free_journal_head(struct journal_head *jh) 2015 { 2016 #ifdef CONFIG_JBD2_DEBUG 2017 
atomic_dec(&nr_journal_heads); 2018 memset(jh, JBD2_POISON_FREE, sizeof(*jh)); 2019 #endif 2020 kmem_cache_free(jbd2_journal_head_cache, jh); 2021 } 2022 2023 /* 2024 * A journal_head is attached to a buffer_head whenever JBD has an 2025 * interest in the buffer. 2026 * 2027 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit 2028 * is set. This bit is tested in core kernel code where we need to take 2029 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable 2030 * there. 2031 * 2032 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. 2033 * 2034 * When a buffer has its BH_JBD bit set it is immune from being released by 2035 * core kernel code, mainly via ->b_count. 2036 * 2037 * A journal_head may be detached from its buffer_head when the journal_head's 2038 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. 2039 * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the 2040 * journal_head can be dropped if needed. 2041 * 2042 * Various places in the kernel want to attach a journal_head to a buffer_head 2043 * _before_ attaching the journal_head to a transaction. To protect the 2044 * journal_head in this situation, jbd2_journal_add_journal_head elevates the 2045 * journal_head's b_jcount refcount by one. The caller must call 2046 * jbd2_journal_put_journal_head() to undo this. 2047 * 2048 * So the typical usage would be: 2049 * 2050 * (Attach a journal_head if needed. Increments b_jcount) 2051 * struct journal_head *jh = jbd2_journal_add_journal_head(bh); 2052 * ... 2053 * jh->b_transaction = xxx; 2054 * jbd2_journal_put_journal_head(jh); 2055 * 2056 * Now, the journal_head's b_jcount is zero, but it is safe from being released 2057 * because it has a non-zero b_transaction. 2058 */ 2059 2060 /* 2061 * Give a buffer_head a journal_head. 2062 * 2063 * Doesn't need the journal lock. 2064 * May sleep. 
2065 */ 2066 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) 2067 { 2068 struct journal_head *jh; 2069 struct journal_head *new_jh = NULL; 2070 2071 repeat: 2072 if (!buffer_jbd(bh)) { 2073 new_jh = journal_alloc_journal_head(); 2074 memset(new_jh, 0, sizeof(*new_jh)); 2075 } 2076 2077 jbd_lock_bh_journal_head(bh); 2078 if (buffer_jbd(bh)) { 2079 jh = bh2jh(bh); 2080 } else { 2081 J_ASSERT_BH(bh, 2082 (atomic_read(&bh->b_count) > 0) || 2083 (bh->b_page && bh->b_page->mapping)); 2084 2085 if (!new_jh) { 2086 jbd_unlock_bh_journal_head(bh); 2087 goto repeat; 2088 } 2089 2090 jh = new_jh; 2091 new_jh = NULL; /* We consumed it */ 2092 set_buffer_jbd(bh); 2093 bh->b_private = jh; 2094 jh->b_bh = bh; 2095 get_bh(bh); 2096 BUFFER_TRACE(bh, "added journal_head"); 2097 } 2098 jh->b_jcount++; 2099 jbd_unlock_bh_journal_head(bh); 2100 if (new_jh) 2101 journal_free_journal_head(new_jh); 2102 return bh->b_private; 2103 } 2104 2105 /* 2106 * Grab a ref against this buffer_head's journal_head. 
If it ended up not 2107 * having a journal_head, return NULL 2108 */ 2109 struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) 2110 { 2111 struct journal_head *jh = NULL; 2112 2113 jbd_lock_bh_journal_head(bh); 2114 if (buffer_jbd(bh)) { 2115 jh = bh2jh(bh); 2116 jh->b_jcount++; 2117 } 2118 jbd_unlock_bh_journal_head(bh); 2119 return jh; 2120 } 2121 2122 static void __journal_remove_journal_head(struct buffer_head *bh) 2123 { 2124 struct journal_head *jh = bh2jh(bh); 2125 2126 J_ASSERT_JH(jh, jh->b_jcount >= 0); 2127 2128 get_bh(bh); 2129 if (jh->b_jcount == 0) { 2130 if (jh->b_transaction == NULL && 2131 jh->b_next_transaction == NULL && 2132 jh->b_cp_transaction == NULL) { 2133 J_ASSERT_JH(jh, jh->b_jlist == BJ_None); 2134 J_ASSERT_BH(bh, buffer_jbd(bh)); 2135 J_ASSERT_BH(bh, jh2bh(jh) == bh); 2136 BUFFER_TRACE(bh, "remove journal_head"); 2137 if (jh->b_frozen_data) { 2138 printk(KERN_WARNING "%s: freeing " 2139 "b_frozen_data\n", 2140 __func__); 2141 jbd2_free(jh->b_frozen_data, bh->b_size); 2142 } 2143 if (jh->b_committed_data) { 2144 printk(KERN_WARNING "%s: freeing " 2145 "b_committed_data\n", 2146 __func__); 2147 jbd2_free(jh->b_committed_data, bh->b_size); 2148 } 2149 bh->b_private = NULL; 2150 jh->b_bh = NULL; /* debug, really */ 2151 clear_buffer_jbd(bh); 2152 __brelse(bh); 2153 journal_free_journal_head(jh); 2154 } else { 2155 BUFFER_TRACE(bh, "journal_head was locked"); 2156 } 2157 } 2158 } 2159 2160 /* 2161 * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction 2162 * and has a zero b_jcount then remove and release its journal_head. If we did 2163 * see that the buffer is not used by any transaction we also "logically" 2164 * decrement ->b_count. 2165 * 2166 * We in fact take an additional increment on ->b_count as a convenience, 2167 * because the caller usually wants to do additional things with the bh 2168 * after calling here. 
2169 * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some 2170 * time. Once the caller has run __brelse(), the buffer is eligible for 2171 * reaping by try_to_free_buffers(). 2172 */ 2173 void jbd2_journal_remove_journal_head(struct buffer_head *bh) 2174 { 2175 jbd_lock_bh_journal_head(bh); 2176 __journal_remove_journal_head(bh); 2177 jbd_unlock_bh_journal_head(bh); 2178 } 2179 2180 /* 2181 * Drop a reference on the passed journal_head. If it fell to zero then try to 2182 * release the journal_head from the buffer_head. 2183 */ 2184 void jbd2_journal_put_journal_head(struct journal_head *jh) 2185 { 2186 struct buffer_head *bh = jh2bh(jh); 2187 2188 jbd_lock_bh_journal_head(bh); 2189 J_ASSERT_JH(jh, jh->b_jcount > 0); 2190 --jh->b_jcount; 2191 if (!jh->b_jcount && !jh->b_transaction) { 2192 __journal_remove_journal_head(bh); 2193 __brelse(bh); 2194 } 2195 jbd_unlock_bh_journal_head(bh); 2196 } 2197 2198 /* 2199 * Initialize jbd inode head 2200 */ 2201 void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) 2202 { 2203 jinode->i_transaction = NULL; 2204 jinode->i_next_transaction = NULL; 2205 jinode->i_vfs_inode = inode; 2206 jinode->i_flags = 0; 2207 INIT_LIST_HEAD(&jinode->i_list); 2208 } 2209 2210 /* 2211 * Function to be called before we start removing inode from memory (i.e., 2212 * clear_inode() is a fine place to be called from). It removes inode from 2213 * transaction's lists. 
2214 */ 2215 void jbd2_journal_release_jbd_inode(journal_t *journal, 2216 struct jbd2_inode *jinode) 2217 { 2218 int writeout = 0; 2219 2220 if (!journal) 2221 return; 2222 restart: 2223 spin_lock(&journal->j_list_lock); 2224 /* Is commit writing out inode - we have to wait */ 2225 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2226 wait_queue_head_t *wq; 2227 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2228 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2229 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 2230 spin_unlock(&journal->j_list_lock); 2231 schedule(); 2232 finish_wait(wq, &wait.wait); 2233 goto restart; 2234 } 2235 2236 /* Do we need to wait for data writeback? */ 2237 if (journal->j_committing_transaction == jinode->i_transaction) 2238 writeout = 1; 2239 if (jinode->i_transaction) { 2240 list_del(&jinode->i_list); 2241 jinode->i_transaction = NULL; 2242 } 2243 spin_unlock(&journal->j_list_lock); 2244 } 2245 2246 /* 2247 * debugfs tunables 2248 */ 2249 #ifdef CONFIG_JBD2_DEBUG 2250 u8 jbd2_journal_enable_debug __read_mostly; 2251 EXPORT_SYMBOL(jbd2_journal_enable_debug); 2252 2253 #define JBD2_DEBUG_NAME "jbd2-debug" 2254 2255 static struct dentry *jbd2_debugfs_dir; 2256 static struct dentry *jbd2_debug; 2257 2258 static void __init jbd2_create_debugfs_entry(void) 2259 { 2260 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); 2261 if (jbd2_debugfs_dir) 2262 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, 2263 jbd2_debugfs_dir, 2264 &jbd2_journal_enable_debug); 2265 } 2266 2267 static void __exit jbd2_remove_debugfs_entry(void) 2268 { 2269 debugfs_remove(jbd2_debug); 2270 debugfs_remove(jbd2_debugfs_dir); 2271 } 2272 2273 #else 2274 2275 static void __init jbd2_create_debugfs_entry(void) 2276 { 2277 } 2278 2279 static void __exit jbd2_remove_debugfs_entry(void) 2280 { 2281 } 2282 2283 #endif 2284 2285 #ifdef CONFIG_PROC_FS 2286 2287 #define JBD2_STATS_PROC_NAME "fs/jbd2" 2288 2289 static void __init 
jbd2_create_jbd_stats_proc_entry(void) 2290 { 2291 proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); 2292 } 2293 2294 static void __exit jbd2_remove_jbd_stats_proc_entry(void) 2295 { 2296 if (proc_jbd2_stats) 2297 remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); 2298 } 2299 2300 #else 2301 2302 #define jbd2_create_jbd_stats_proc_entry() do {} while (0) 2303 #define jbd2_remove_jbd_stats_proc_entry() do {} while (0) 2304 2305 #endif 2306 2307 struct kmem_cache *jbd2_handle_cache; 2308 2309 static int __init journal_init_handle_cache(void) 2310 { 2311 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2312 sizeof(handle_t), 2313 0, /* offset */ 2314 SLAB_TEMPORARY, /* flags */ 2315 NULL); /* ctor */ 2316 if (jbd2_handle_cache == NULL) { 2317 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2318 return -ENOMEM; 2319 } 2320 return 0; 2321 } 2322 2323 static void jbd2_journal_destroy_handle_cache(void) 2324 { 2325 if (jbd2_handle_cache) 2326 kmem_cache_destroy(jbd2_handle_cache); 2327 } 2328 2329 /* 2330 * Module startup and shutdown 2331 */ 2332 2333 static int __init journal_init_caches(void) 2334 { 2335 int ret; 2336 2337 ret = jbd2_journal_init_revoke_caches(); 2338 if (ret == 0) 2339 ret = journal_init_jbd2_journal_head_cache(); 2340 if (ret == 0) 2341 ret = journal_init_handle_cache(); 2342 return ret; 2343 } 2344 2345 static void jbd2_journal_destroy_caches(void) 2346 { 2347 jbd2_journal_destroy_revoke_caches(); 2348 jbd2_journal_destroy_jbd2_journal_head_cache(); 2349 jbd2_journal_destroy_handle_cache(); 2350 } 2351 2352 static int __init journal_init(void) 2353 { 2354 int ret; 2355 2356 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); 2357 2358 ret = journal_init_caches(); 2359 if (ret == 0) { 2360 jbd2_create_debugfs_entry(); 2361 jbd2_create_jbd_stats_proc_entry(); 2362 } else { 2363 jbd2_journal_destroy_caches(); 2364 } 2365 return ret; 2366 } 2367 2368 static void __exit journal_exit(void) 2369 { 2370 #ifdef 
CONFIG_JBD2_DEBUG 2371 int n = atomic_read(&nr_journal_heads); 2372 if (n) 2373 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2374 #endif 2375 jbd2_remove_debugfs_entry(); 2376 jbd2_remove_jbd_stats_proc_entry(); 2377 jbd2_journal_destroy_caches(); 2378 } 2379 2380 MODULE_LICENSE("GPL"); 2381 module_init(journal_init); 2382 module_exit(journal_exit); 2383 2384