1 /* 2 * linux/fs/jbd2/transaction.c 3 * 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 5 * 6 * Copyright 1998 Red Hat corp --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Generic filesystem transaction handling code; part of the ext2fs 13 * journaling system. 14 * 15 * This file manages transactions (compound commits managed by the 16 * journaling code) and handles (individual atomic operations by the 17 * filesystem). 18 */ 19 20 #include <linux/time.h> 21 #include <linux/fs.h> 22 #include <linux/jbd2.h> 23 #include <linux/errno.h> 24 #include <linux/slab.h> 25 #include <linux/timer.h> 26 #include <linux/mm.h> 27 #include <linux/highmem.h> 28 #include <linux/hrtimer.h> 29 #include <linux/backing-dev.h> 30 #include <linux/bug.h> 31 #include <linux/module.h> 32 33 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 34 static void __jbd2_journal_unfile_buffer(struct journal_head *jh); 35 36 static struct kmem_cache *transaction_cache; 37 int __init jbd2_journal_init_transaction_cache(void) 38 { 39 J_ASSERT(!transaction_cache); 40 transaction_cache = kmem_cache_create("jbd2_transaction_s", 41 sizeof(transaction_t), 42 0, 43 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, 44 NULL); 45 if (transaction_cache) 46 return 0; 47 return -ENOMEM; 48 } 49 50 void jbd2_journal_destroy_transaction_cache(void) 51 { 52 if (transaction_cache) { 53 kmem_cache_destroy(transaction_cache); 54 transaction_cache = NULL; 55 } 56 } 57 58 void jbd2_journal_free_transaction(transaction_t *transaction) 59 { 60 if (unlikely(ZERO_OR_NULL_PTR(transaction))) 61 return; 62 kmem_cache_free(transaction_cache, transaction); 63 } 64 65 /* 66 * jbd2_get_transaction: obtain a new transaction_t object. 67 * 68 * Simply allocate and initialise a new transaction. Create it in 69 * RUNNING state and add it to the current journal (which should not 70 * have an existing running transaction: we only make a new transaction 71 * once we have started to commit the old one). 72 * 73 * Preconditions: 74 * The journal MUST be locked. We don't perform atomic mallocs on the 75 * new transaction and we can't block without protecting against other 76 * processes trying to touch the journal while it is in transition. 77 * 78 */ 79 80 static transaction_t * 81 jbd2_get_transaction(journal_t *journal, transaction_t *transaction) 82 { 83 transaction->t_journal = journal; 84 transaction->t_state = T_RUNNING; 85 transaction->t_start_time = ktime_get(); 86 transaction->t_tid = journal->j_transaction_sequence++; 87 transaction->t_expires = jiffies + journal->j_commit_interval; 88 spin_lock_init(&transaction->t_handle_lock); 89 atomic_set(&transaction->t_updates, 0); 90 atomic_set(&transaction->t_outstanding_credits, 0); 91 atomic_set(&transaction->t_handle_count, 0); 92 INIT_LIST_HEAD(&transaction->t_inode_list); 93 INIT_LIST_HEAD(&transaction->t_private_list); 94 95 /* Set up the commit timer for the new transaction. */ 96 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); 97 add_timer(&journal->j_commit_timer); 98 99 J_ASSERT(journal->j_running_transaction == NULL); 100 journal->j_running_transaction = transaction; 101 transaction->t_max_wait = 0; 102 transaction->t_start = jiffies; 103 104 return transaction; 105 } 106 107 /* 108 * Handle management. 109 * 110 * A handle_t is an object which represents a single atomic update to a 111 * filesystem, and which tracks all of the modifications which form part 112 * of that one update. 113 */ 114 115 /* 116 * Update transaction's maximum wait time, if debugging is enabled. 117 * 118 * In order for t_max_wait to be reliable, it must be protected by a 119 * lock. But doing so will mean that start_this_handle() can not be 120 * run in parallel on SMP systems, which limits our scalability. So 121 * unless debugging is enabled, we no longer update t_max_wait, which 122 * means that maximum wait time reported by the jbd2_run_stats 123 * tracepoint will always be zero. 124 */ 125 static inline void update_t_max_wait(transaction_t *transaction, 126 unsigned long ts) 127 { 128 #ifdef CONFIG_JBD2_DEBUG 129 if (jbd2_journal_enable_debug && 130 time_after(transaction->t_start, ts)) { 131 ts = jbd2_time_diff(ts, transaction->t_start); 132 spin_lock(&transaction->t_handle_lock); 133 if (ts > transaction->t_max_wait) 134 transaction->t_max_wait = ts; 135 spin_unlock(&transaction->t_handle_lock); 136 } 137 #endif 138 } 139 140 /* 141 * start_this_handle: Given a handle, deal with any locking or stalling 142 * needed to make sure that there is enough journal space for the handle 143 * to begin. Attach the handle to a transaction and set up the 144 * transaction's buffer credits. 145 */ 146 147 static int start_this_handle(journal_t *journal, handle_t *handle, 148 gfp_t gfp_mask) 149 { 150 transaction_t *transaction, *new_transaction = NULL; 151 tid_t tid; 152 int needed, need_to_start; 153 int nblocks = handle->h_buffer_credits; 154 unsigned long ts = jiffies; 155 156 if (nblocks > journal->j_max_transaction_buffers) { 157 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 158 current->comm, nblocks, 159 journal->j_max_transaction_buffers); 160 return -ENOSPC; 161 } 162 163 alloc_transaction: 164 if (!journal->j_running_transaction) { 165 new_transaction = kmem_cache_zalloc(transaction_cache, 166 gfp_mask); 167 if (!new_transaction) { 168 /* 169 * If __GFP_FS is not present, then we may be 170 * being called from inside the fs writeback 171 * layer, so we MUST NOT fail. Since 172 * __GFP_NOFAIL is going away, we will arrange 173 * to retry the allocation ourselves. 174 */ 175 if ((gfp_mask & __GFP_FS) == 0) { 176 congestion_wait(BLK_RW_ASYNC, HZ/50); 177 goto alloc_transaction; 178 } 179 return -ENOMEM; 180 } 181 } 182 183 jbd_debug(3, "New handle %p going live.\n", handle); 184 185 /* 186 * We need to hold j_state_lock until t_updates has been incremented, 187 * for proper journal barrier handling 188 */ 189 repeat: 190 read_lock(&journal->j_state_lock); 191 BUG_ON(journal->j_flags & JBD2_UNMOUNT); 192 if (is_journal_aborted(journal) || 193 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 194 read_unlock(&journal->j_state_lock); 195 jbd2_journal_free_transaction(new_transaction); 196 return -EROFS; 197 } 198 199 /* Wait on the journal's transaction barrier if necessary */ 200 if (journal->j_barrier_count) { 201 read_unlock(&journal->j_state_lock); 202 wait_event(journal->j_wait_transaction_locked, 203 journal->j_barrier_count == 0); 204 goto repeat; 205 } 206 207 if (!journal->j_running_transaction) { 208 read_unlock(&journal->j_state_lock); 209 if (!new_transaction) 210 goto alloc_transaction; 211 write_lock(&journal->j_state_lock); 212 if (!journal->j_running_transaction && 213 !journal->j_barrier_count) { 214 jbd2_get_transaction(journal, new_transaction); 215 new_transaction = NULL; 216 } 217 write_unlock(&journal->j_state_lock); 218 goto repeat; 219 } 220 221 transaction = journal->j_running_transaction; 222 223 /* 224 * If the current transaction is locked down for commit, wait for the 225 * lock to be released. 226 */ 227 if (transaction->t_state == T_LOCKED) { 228 DEFINE_WAIT(wait); 229 230 prepare_to_wait(&journal->j_wait_transaction_locked, 231 &wait, TASK_UNINTERRUPTIBLE); 232 read_unlock(&journal->j_state_lock); 233 schedule(); 234 finish_wait(&journal->j_wait_transaction_locked, &wait); 235 goto repeat; 236 } 237 238 /* 239 * If there is not enough space left in the log to write all potential 240 * buffers requested by this operation, we need to stall pending a log 241 * checkpoint to free some more log space. 242 */ 243 needed = atomic_add_return(nblocks, 244 &transaction->t_outstanding_credits); 245 246 if (needed > journal->j_max_transaction_buffers) { 247 /* 248 * If the current transaction is already too large, then start 249 * to commit it: we can then go back and attach this handle to 250 * a new transaction. 251 */ 252 DEFINE_WAIT(wait); 253 254 jbd_debug(2, "Handle %p starting new commit...\n", handle); 255 atomic_sub(nblocks, &transaction->t_outstanding_credits); 256 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 257 TASK_UNINTERRUPTIBLE); 258 tid = transaction->t_tid; 259 need_to_start = !tid_geq(journal->j_commit_request, tid); 260 read_unlock(&journal->j_state_lock); 261 if (need_to_start) 262 jbd2_log_start_commit(journal, tid); 263 schedule(); 264 finish_wait(&journal->j_wait_transaction_locked, &wait); 265 goto repeat; 266 } 267 268 /* 269 * The commit code assumes that it can get enough log space 270 * without forcing a checkpoint. This is *critical* for 271 * correctness: a checkpoint of a buffer which is also 272 * associated with a committing transaction creates a deadlock, 273 * so commit simply cannot force through checkpoints. 274 * 275 * We must therefore ensure the necessary space in the journal 276 * *before* starting to dirty potentially checkpointed buffers 277 * in the new transaction. 278 * 279 * The worst part is, any transaction currently committing can 280 * reduce the free space arbitrarily. Be careful to account for 281 * those buffers when checkpointing. 282 */ 283 284 /* 285 * @@@ AKPM: This seems rather over-defensive. We're giving commit 286 * a _lot_ of headroom: 1/4 of the journal plus the size of 287 * the committing transaction. Really, we only need to give it 288 * committing_transaction->t_outstanding_credits plus "enough" for 289 * the log control blocks. 290 * Also, this test is inconsistent with the matching one in 291 * jbd2_journal_extend(). 292 */ 293 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 294 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); 295 atomic_sub(nblocks, &transaction->t_outstanding_credits); 296 read_unlock(&journal->j_state_lock); 297 write_lock(&journal->j_state_lock); 298 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) 299 __jbd2_log_wait_for_space(journal); 300 write_unlock(&journal->j_state_lock); 301 goto repeat; 302 } 303 304 /* OK, account for the buffers that this operation expects to 305 * use and add the handle to the running transaction. 306 */ 307 update_t_max_wait(transaction, ts); 308 handle->h_transaction = transaction; 309 atomic_inc(&transaction->t_updates); 310 atomic_inc(&transaction->t_handle_count); 311 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 312 handle, nblocks, 313 atomic_read(&transaction->t_outstanding_credits), 314 __jbd2_log_space_left(journal)); 315 read_unlock(&journal->j_state_lock); 316 317 lock_map_acquire(&handle->h_lockdep_map); 318 jbd2_journal_free_transaction(new_transaction); 319 return 0; 320 } 321 322 static struct lock_class_key jbd2_handle_key; 323 324 /* Allocate a new handle. This should probably be in a slab... */ 325 static handle_t *new_handle(int nblocks) 326 { 327 handle_t *handle = jbd2_alloc_handle(GFP_NOFS); 328 if (!handle) 329 return NULL; 330 memset(handle, 0, sizeof(*handle)); 331 handle->h_buffer_credits = nblocks; 332 handle->h_ref = 1; 333 334 lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", 335 &jbd2_handle_key, 0); 336 337 return handle; 338 } 339 340 /** 341 * handle_t *jbd2_journal_start() - Obtain a new handle. 342 * @journal: Journal to start transaction on. 343 * @nblocks: number of block buffer we might modify 344 * 345 * We make sure that the transaction can guarantee at least nblocks of 346 * modified buffers in the log. We block until the log can guarantee 347 * that much space. 348 * 349 * This function is visible to journal users (like ext3fs), so is not 350 * called with the journal already locked. 351 * 352 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 353 * on failure. 354 */ 355 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) 356 { 357 handle_t *handle = journal_current_handle(); 358 int err; 359 360 if (!journal) 361 return ERR_PTR(-EROFS); 362 363 if (handle) { 364 J_ASSERT(handle->h_transaction->t_journal == journal); 365 handle->h_ref++; 366 return handle; 367 } 368 369 handle = new_handle(nblocks); 370 if (!handle) 371 return ERR_PTR(-ENOMEM); 372 373 current->journal_info = handle; 374 375 err = start_this_handle(journal, handle, gfp_mask); 376 if (err < 0) { 377 jbd2_free_handle(handle); 378 current->journal_info = NULL; 379 handle = ERR_PTR(err); 380 } 381 return handle; 382 } 383 EXPORT_SYMBOL(jbd2__journal_start); 384 385 386 handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 387 { 388 return jbd2__journal_start(journal, nblocks, GFP_NOFS); 389 } 390 EXPORT_SYMBOL(jbd2_journal_start); 391 392 393 /** 394 * int jbd2_journal_extend() - extend buffer credits. 395 * @handle: handle to 'extend' 396 * @nblocks: nr blocks to try to extend by. 397 * 398 * Some transactions, such as large extends and truncates, can be done 399 * atomically all at once or in several stages. The operation requests 400 * a credit for a number of buffer modications in advance, but can 401 * extend its credit if it needs more. 402 * 403 * jbd2_journal_extend tries to give the running handle more buffer credits. 404 * It does not guarantee that allocation - this is a best-effort only. 405 * The calling process MUST be able to deal cleanly with a failure to 406 * extend here. 407 * 408 * Return 0 on success, non-zero on failure. 409 * 410 * return code < 0 implies an error 411 * return code > 0 implies normal transaction-full status. 412 */ 413 int jbd2_journal_extend(handle_t *handle, int nblocks) 414 { 415 transaction_t *transaction = handle->h_transaction; 416 journal_t *journal = transaction->t_journal; 417 int result; 418 int wanted; 419 420 result = -EIO; 421 if (is_handle_aborted(handle)) 422 goto out; 423 424 result = 1; 425 426 read_lock(&journal->j_state_lock); 427 428 /* Don't extend a locked-down transaction! */ 429 if (handle->h_transaction->t_state != T_RUNNING) { 430 jbd_debug(3, "denied handle %p %d blocks: " 431 "transaction not running\n", handle, nblocks); 432 goto error_out; 433 } 434 435 spin_lock(&transaction->t_handle_lock); 436 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; 437 438 if (wanted > journal->j_max_transaction_buffers) { 439 jbd_debug(3, "denied handle %p %d blocks: " 440 "transaction too large\n", handle, nblocks); 441 goto unlock; 442 } 443 444 if (wanted > __jbd2_log_space_left(journal)) { 445 jbd_debug(3, "denied handle %p %d blocks: " 446 "insufficient log space\n", handle, nblocks); 447 goto unlock; 448 } 449 450 handle->h_buffer_credits += nblocks; 451 atomic_add(nblocks, &transaction->t_outstanding_credits); 452 result = 0; 453 454 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 455 unlock: 456 spin_unlock(&transaction->t_handle_lock); 457 error_out: 458 read_unlock(&journal->j_state_lock); 459 out: 460 return result; 461 } 462 463 464 /** 465 * int jbd2_journal_restart() - restart a handle . 466 * @handle: handle to restart 467 * @nblocks: nr credits requested 468 * 469 * Restart a handle for a multi-transaction filesystem 470 * operation. 471 * 472 * If the jbd2_journal_extend() call above fails to grant new buffer credits 473 * to a running handle, a call to jbd2_journal_restart will commit the 474 * handle's transaction so far and reattach the handle to a new 475 * transaction capabable of guaranteeing the requested number of 476 * credits. 477 */ 478 int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) 479 { 480 transaction_t *transaction = handle->h_transaction; 481 journal_t *journal = transaction->t_journal; 482 tid_t tid; 483 int need_to_start, ret; 484 485 /* If we've had an abort of any type, don't even think about 486 * actually doing the restart! */ 487 if (is_handle_aborted(handle)) 488 return 0; 489 490 /* 491 * First unlink the handle from its current transaction, and start the 492 * commit on that. 493 */ 494 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 495 J_ASSERT(journal_current_handle() == handle); 496 497 read_lock(&journal->j_state_lock); 498 spin_lock(&transaction->t_handle_lock); 499 atomic_sub(handle->h_buffer_credits, 500 &transaction->t_outstanding_credits); 501 if (atomic_dec_and_test(&transaction->t_updates)) 502 wake_up(&journal->j_wait_updates); 503 spin_unlock(&transaction->t_handle_lock); 504 505 jbd_debug(2, "restarting handle %p\n", handle); 506 tid = transaction->t_tid; 507 need_to_start = !tid_geq(journal->j_commit_request, tid); 508 read_unlock(&journal->j_state_lock); 509 if (need_to_start) 510 jbd2_log_start_commit(journal, tid); 511 512 lock_map_release(&handle->h_lockdep_map); 513 handle->h_buffer_credits = nblocks; 514 ret = start_this_handle(journal, handle, gfp_mask); 515 return ret; 516 } 517 EXPORT_SYMBOL(jbd2__journal_restart); 518 519 520 int jbd2_journal_restart(handle_t *handle, int nblocks) 521 { 522 return jbd2__journal_restart(handle, nblocks, GFP_NOFS); 523 } 524 EXPORT_SYMBOL(jbd2_journal_restart); 525 526 /** 527 * void jbd2_journal_lock_updates () - establish a transaction barrier. 528 * @journal: Journal to establish a barrier on. 529 * 530 * This locks out any further updates from being started, and blocks 531 * until all existing updates have completed, returning only once the 532 * journal is in a quiescent state with no updates running. 533 * 534 * The journal lock should not be held on entry. 535 */ 536 void jbd2_journal_lock_updates(journal_t *journal) 537 { 538 DEFINE_WAIT(wait); 539 540 write_lock(&journal->j_state_lock); 541 ++journal->j_barrier_count; 542 543 /* Wait until there are no running updates */ 544 while (1) { 545 transaction_t *transaction = journal->j_running_transaction; 546 547 if (!transaction) 548 break; 549 550 spin_lock(&transaction->t_handle_lock); 551 prepare_to_wait(&journal->j_wait_updates, &wait, 552 TASK_UNINTERRUPTIBLE); 553 if (!atomic_read(&transaction->t_updates)) { 554 spin_unlock(&transaction->t_handle_lock); 555 finish_wait(&journal->j_wait_updates, &wait); 556 break; 557 } 558 spin_unlock(&transaction->t_handle_lock); 559 write_unlock(&journal->j_state_lock); 560 schedule(); 561 finish_wait(&journal->j_wait_updates, &wait); 562 write_lock(&journal->j_state_lock); 563 } 564 write_unlock(&journal->j_state_lock); 565 566 /* 567 * We have now established a barrier against other normal updates, but 568 * we also need to barrier against other jbd2_journal_lock_updates() calls 569 * to make sure that we serialise special journal-locked operations 570 * too. 571 */ 572 mutex_lock(&journal->j_barrier); 573 } 574 575 /** 576 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier 577 * @journal: Journal to release the barrier on. 578 * 579 * Release a transaction barrier obtained with jbd2_journal_lock_updates(). 580 * 581 * Should be called without the journal lock held. 582 */ 583 void jbd2_journal_unlock_updates (journal_t *journal) 584 { 585 J_ASSERT(journal->j_barrier_count != 0); 586 587 mutex_unlock(&journal->j_barrier); 588 write_lock(&journal->j_state_lock); 589 --journal->j_barrier_count; 590 write_unlock(&journal->j_state_lock); 591 wake_up(&journal->j_wait_transaction_locked); 592 } 593 594 static void warn_dirty_buffer(struct buffer_head *bh) 595 { 596 char b[BDEVNAME_SIZE]; 597 598 printk(KERN_WARNING 599 "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 600 "There's a risk of filesystem corruption in case of system " 601 "crash.\n", 602 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 603 } 604 605 /* 606 * If the buffer is already part of the current transaction, then there 607 * is nothing we need to do. If it is already part of a prior 608 * transaction which we are still committing to disk, then we need to 609 * make sure that we do not overwrite the old copy: we do copy-out to 610 * preserve the copy going to disk. We also account the buffer against 611 * the handle's metadata buffer credits (unless the buffer is already 612 * part of the transaction, that is). 613 * 614 */ 615 static int 616 do_get_write_access(handle_t *handle, struct journal_head *jh, 617 int force_copy) 618 { 619 struct buffer_head *bh; 620 transaction_t *transaction; 621 journal_t *journal; 622 int error; 623 char *frozen_buffer = NULL; 624 int need_copy = 0; 625 626 if (is_handle_aborted(handle)) 627 return -EROFS; 628 629 transaction = handle->h_transaction; 630 journal = transaction->t_journal; 631 632 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); 633 634 JBUFFER_TRACE(jh, "entry"); 635 repeat: 636 bh = jh2bh(jh); 637 638 /* @@@ Need to check for errors here at some point. */ 639 640 lock_buffer(bh); 641 jbd_lock_bh_state(bh); 642 643 /* We now hold the buffer lock so it is safe to query the buffer 644 * state. Is the buffer dirty? 645 * 646 * If so, there are two possibilities. The buffer may be 647 * non-journaled, and undergoing a quite legitimate writeback. 648 * Otherwise, it is journaled, and we don't expect dirty buffers 649 * in that state (the buffers should be marked JBD_Dirty 650 * instead.) So either the IO is being done under our own 651 * control and this is a bug, or it's a third party IO such as 652 * dump(8) (which may leave the buffer scheduled for read --- 653 * ie. locked but not dirty) or tune2fs (which may actually have 654 * the buffer dirtied, ugh.) */ 655 656 if (buffer_dirty(bh)) { 657 /* 658 * First question: is this buffer already part of the current 659 * transaction or the existing committing transaction? 660 */ 661 if (jh->b_transaction) { 662 J_ASSERT_JH(jh, 663 jh->b_transaction == transaction || 664 jh->b_transaction == 665 journal->j_committing_transaction); 666 if (jh->b_next_transaction) 667 J_ASSERT_JH(jh, jh->b_next_transaction == 668 transaction); 669 warn_dirty_buffer(bh); 670 } 671 /* 672 * In any case we need to clean the dirty flag and we must 673 * do it under the buffer lock to be sure we don't race 674 * with running write-out. 675 */ 676 JBUFFER_TRACE(jh, "Journalling dirty buffer"); 677 clear_buffer_dirty(bh); 678 set_buffer_jbddirty(bh); 679 } 680 681 unlock_buffer(bh); 682 683 error = -EROFS; 684 if (is_handle_aborted(handle)) { 685 jbd_unlock_bh_state(bh); 686 goto out; 687 } 688 error = 0; 689 690 /* 691 * The buffer is already part of this transaction if b_transaction or 692 * b_next_transaction points to it 693 */ 694 if (jh->b_transaction == transaction || 695 jh->b_next_transaction == transaction) 696 goto done; 697 698 /* 699 * this is the first time this transaction is touching this buffer, 700 * reset the modified flag 701 */ 702 jh->b_modified = 0; 703 704 /* 705 * If there is already a copy-out version of this buffer, then we don't 706 * need to make another one 707 */ 708 if (jh->b_frozen_data) { 709 JBUFFER_TRACE(jh, "has frozen data"); 710 J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 711 jh->b_next_transaction = transaction; 712 goto done; 713 } 714 715 /* Is there data here we need to preserve? */ 716 717 if (jh->b_transaction && jh->b_transaction != transaction) { 718 JBUFFER_TRACE(jh, "owned by older transaction"); 719 J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 720 J_ASSERT_JH(jh, jh->b_transaction == 721 journal->j_committing_transaction); 722 723 /* There is one case we have to be very careful about. 724 * If the committing transaction is currently writing 725 * this buffer out to disk and has NOT made a copy-out, 726 * then we cannot modify the buffer contents at all 727 * right now. The essence of copy-out is that it is the 728 * extra copy, not the primary copy, which gets 729 * journaled. If the primary copy is already going to 730 * disk then we cannot do copy-out here. */ 731 732 if (jh->b_jlist == BJ_Shadow) { 733 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); 734 wait_queue_head_t *wqh; 735 736 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); 737 738 JBUFFER_TRACE(jh, "on shadow: sleep"); 739 jbd_unlock_bh_state(bh); 740 /* commit wakes up all shadow buffers after IO */ 741 for ( ; ; ) { 742 prepare_to_wait(wqh, &wait.wait, 743 TASK_UNINTERRUPTIBLE); 744 if (jh->b_jlist != BJ_Shadow) 745 break; 746 schedule(); 747 } 748 finish_wait(wqh, &wait.wait); 749 goto repeat; 750 } 751 752 /* Only do the copy if the currently-owning transaction 753 * still needs it. If it is on the Forget list, the 754 * committing transaction is past that stage. The 755 * buffer had better remain locked during the kmalloc, 756 * but that should be true --- we hold the journal lock 757 * still and the buffer is already on the BUF_JOURNAL 758 * list so won't be flushed. 759 * 760 * Subtle point, though: if this is a get_undo_access, 761 * then we will be relying on the frozen_data to contain 762 * the new value of the committed_data record after the 763 * transaction, so we HAVE to force the frozen_data copy 764 * in that case. */ 765 766 if (jh->b_jlist != BJ_Forget || force_copy) { 767 JBUFFER_TRACE(jh, "generate frozen data"); 768 if (!frozen_buffer) { 769 JBUFFER_TRACE(jh, "allocate memory for buffer"); 770 jbd_unlock_bh_state(bh); 771 frozen_buffer = 772 jbd2_alloc(jh2bh(jh)->b_size, 773 GFP_NOFS); 774 if (!frozen_buffer) { 775 printk(KERN_EMERG 776 "%s: OOM for frozen_buffer\n", 777 __func__); 778 JBUFFER_TRACE(jh, "oom!"); 779 error = -ENOMEM; 780 jbd_lock_bh_state(bh); 781 goto done; 782 } 783 goto repeat; 784 } 785 jh->b_frozen_data = frozen_buffer; 786 frozen_buffer = NULL; 787 need_copy = 1; 788 } 789 jh->b_next_transaction = transaction; 790 } 791 792 793 /* 794 * Finally, if the buffer is not journaled right now, we need to make 795 * sure it doesn't get written to disk before the caller actually 796 * commits the new data 797 */ 798 if (!jh->b_transaction) { 799 JBUFFER_TRACE(jh, "no transaction"); 800 J_ASSERT_JH(jh, !jh->b_next_transaction); 801 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 802 spin_lock(&journal->j_list_lock); 803 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 804 spin_unlock(&journal->j_list_lock); 805 } 806 807 done: 808 if (need_copy) { 809 struct page *page; 810 int offset; 811 char *source; 812 813 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), 814 "Possible IO failure.\n"); 815 page = jh2bh(jh)->b_page; 816 offset = offset_in_page(jh2bh(jh)->b_data); 817 source = kmap_atomic(page); 818 /* Fire data frozen trigger just before we copy the data */ 819 jbd2_buffer_frozen_trigger(jh, source + offset, 820 jh->b_triggers); 821 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 822 kunmap_atomic(source); 823 824 /* 825 * Now that the frozen data is saved off, we need to store 826 * any matching triggers. 827 */ 828 jh->b_frozen_triggers = jh->b_triggers; 829 } 830 jbd_unlock_bh_state(bh); 831 832 /* 833 * If we are about to journal a buffer, then any revoke pending on it is 834 * no longer valid 835 */ 836 jbd2_journal_cancel_revoke(handle, jh); 837 838 out: 839 if (unlikely(frozen_buffer)) /* It's usually NULL */ 840 jbd2_free(frozen_buffer, bh->b_size); 841 842 JBUFFER_TRACE(jh, "exit"); 843 return error; 844 } 845 846 /** 847 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 848 * @handle: transaction to add buffer modifications to 849 * @bh: bh to be used for metadata writes 850 * 851 * Returns an error code or 0 on success. 852 * 853 * In full data journalling mode the buffer may be of type BJ_AsyncData, 854 * because we're write()ing a buffer which is also part of a shared mapping. 855 */ 856 857 int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) 858 { 859 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 860 int rc; 861 862 /* We do not want to get caught playing with fields which the 863 * log thread also manipulates. Make sure that the buffer 864 * completes any outstanding IO before proceeding. */ 865 rc = do_get_write_access(handle, jh, 0); 866 jbd2_journal_put_journal_head(jh); 867 return rc; 868 } 869 870 871 /* 872 * When the user wants to journal a newly created buffer_head 873 * (ie. getblk() returned a new buffer and we are going to populate it 874 * manually rather than reading off disk), then we need to keep the 875 * buffer_head locked until it has been completely filled with new 876 * data. In this case, we should be able to make the assertion that 877 * the bh is not already part of an existing transaction. 878 * 879 * The buffer should already be locked by the caller by this point. 880 * There is no lock ranking violation: it was a newly created, 881 * unlocked buffer beforehand. */ 882 883 /** 884 * int jbd2_journal_get_create_access () - notify intent to use newly created bh 885 * @handle: transaction to new buffer to 886 * @bh: new buffer. 887 * 888 * Call this if you create a new bh. 889 */ 890 int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) 891 { 892 transaction_t *transaction = handle->h_transaction; 893 journal_t *journal = transaction->t_journal; 894 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 895 int err; 896 897 jbd_debug(5, "journal_head %p\n", jh); 898 err = -EROFS; 899 if (is_handle_aborted(handle)) 900 goto out; 901 err = 0; 902 903 JBUFFER_TRACE(jh, "entry"); 904 /* 905 * The buffer may already belong to this transaction due to pre-zeroing 906 * in the filesystem's new_block code. It may also be on the previous, 907 * committing transaction's lists, but it HAS to be in Forget state in 908 * that case: the transaction must have deleted the buffer for it to be 909 * reused here. 910 */ 911 jbd_lock_bh_state(bh); 912 spin_lock(&journal->j_list_lock); 913 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 914 jh->b_transaction == NULL || 915 (jh->b_transaction == journal->j_committing_transaction && 916 jh->b_jlist == BJ_Forget))); 917 918 J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 919 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 920 921 if (jh->b_transaction == NULL) { 922 /* 923 * Previous jbd2_journal_forget() could have left the buffer 924 * with jbddirty bit set because it was being committed. When 925 * the commit finished, we've filed the buffer for 926 * checkpointing and marked it dirty. Now we are reallocating 927 * the buffer so the transaction freeing it must have 928 * committed and so it's safe to clear the dirty bit. 929 */ 930 clear_buffer_dirty(jh2bh(jh)); 931 /* first access by this transaction */ 932 jh->b_modified = 0; 933 934 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 935 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 936 } else if (jh->b_transaction == journal->j_committing_transaction) { 937 /* first access by this transaction */ 938 jh->b_modified = 0; 939 940 JBUFFER_TRACE(jh, "set next transaction"); 941 jh->b_next_transaction = transaction; 942 } 943 spin_unlock(&journal->j_list_lock); 944 jbd_unlock_bh_state(bh); 945 946 /* 947 * akpm: I added this. ext3_alloc_branch can pick up new indirect 948 * blocks which contain freed but then revoked metadata. We need 949 * to cancel the revoke in case we end up freeing it yet again 950 * and the reallocating as data - this would cause a second revoke, 951 * which hits an assertion error. 952 */ 953 JBUFFER_TRACE(jh, "cancelling revoke"); 954 jbd2_journal_cancel_revoke(handle, jh); 955 out: 956 jbd2_journal_put_journal_head(jh); 957 return err; 958 } 959 960 /** 961 * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with 962 * non-rewindable consequences 963 * @handle: transaction 964 * @bh: buffer to undo 965 * 966 * Sometimes there is a need to distinguish between metadata which has 967 * been committed to disk and that which has not. The ext3fs code uses 968 * this for freeing and allocating space, we have to make sure that we 969 * do not reuse freed space until the deallocation has been committed, 970 * since if we overwrote that space we would make the delete 971 * un-rewindable in case of a crash. 972 * 973 * To deal with that, jbd2_journal_get_undo_access requests write access to a 974 * buffer for parts of non-rewindable operations such as delete 975 * operations on the bitmaps. The journaling code must keep a copy of 976 * the buffer's contents prior to the undo_access call until such time 977 * as we know that the buffer has definitely been committed to disk. 978 * 979 * We never need to know which transaction the committed data is part 980 * of, buffers touched here are guaranteed to be dirtied later and so 981 * will be committed to a new transaction in due course, at which point 982 * we can discard the old committed data pointer. 983 * 984 * Returns error number or 0 on success. 985 */ 986 int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) 987 { 988 int err; 989 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 990 char *committed_data = NULL; 991 992 JBUFFER_TRACE(jh, "entry"); 993 994 /* 995 * Do this first --- it can drop the journal lock, so we want to 996 * make sure that obtaining the committed_data is done 997 * atomically wrt. completion of any outstanding commits. 998 */ 999 err = do_get_write_access(handle, jh, 1); 1000 if (err) 1001 goto out; 1002 1003 repeat: 1004 if (!jh->b_committed_data) { 1005 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1006 if (!committed_data) { 1007 printk(KERN_EMERG "%s: No memory for committed data\n", 1008 __func__); 1009 err = -ENOMEM; 1010 goto out; 1011 } 1012 } 1013 1014 jbd_lock_bh_state(bh); 1015 if (!jh->b_committed_data) { 1016 /* Copy out the current buffer contents into the 1017 * preserved, committed copy. */ 1018 JBUFFER_TRACE(jh, "generate b_committed data"); 1019 if (!committed_data) { 1020 jbd_unlock_bh_state(bh); 1021 goto repeat; 1022 } 1023 1024 jh->b_committed_data = committed_data; 1025 committed_data = NULL; 1026 memcpy(jh->b_committed_data, bh->b_data, bh->b_size); 1027 } 1028 jbd_unlock_bh_state(bh); 1029 out: 1030 jbd2_journal_put_journal_head(jh); 1031 if (unlikely(committed_data)) 1032 jbd2_free(committed_data, bh->b_size); 1033 return err; 1034 } 1035 1036 /** 1037 * void jbd2_journal_set_triggers() - Add triggers for commit writeout 1038 * @bh: buffer to trigger on 1039 * @type: struct jbd2_buffer_trigger_type containing the trigger(s). 1040 * 1041 * Set any triggers on this journal_head. This is always safe, because 1042 * triggers for a committing buffer will be saved off, and triggers for 1043 * a running transaction will match the buffer in that transaction. 1044 * 1045 * Call with NULL to clear the triggers. 1046 */ 1047 void jbd2_journal_set_triggers(struct buffer_head *bh, 1048 struct jbd2_buffer_trigger_type *type) 1049 { 1050 struct journal_head *jh = bh2jh(bh); 1051 1052 jh->b_triggers = type; 1053 } 1054 1055 void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, 1056 struct jbd2_buffer_trigger_type *triggers) 1057 { 1058 struct buffer_head *bh = jh2bh(jh); 1059 1060 if (!triggers || !triggers->t_frozen) 1061 return; 1062 1063 triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); 1064 } 1065 1066 void jbd2_buffer_abort_trigger(struct journal_head *jh, 1067 struct jbd2_buffer_trigger_type *triggers) 1068 { 1069 if (!triggers || !triggers->t_abort) 1070 return; 1071 1072 triggers->t_abort(triggers, jh2bh(jh)); 1073 } 1074 1075 1076 1077 /** 1078 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 1079 * @handle: transaction to add buffer to. 1080 * @bh: buffer to mark 1081 * 1082 * mark dirty metadata which needs to be journaled as part of the current 1083 * transaction. 1084 * 1085 * The buffer must have previously had jbd2_journal_get_write_access() 1086 * called so that it has a valid journal_head attached to the buffer 1087 * head. 1088 * 1089 * The buffer is placed on the transaction's metadata list and is marked 1090 * as belonging to the transaction. 1091 * 1092 * Returns error number or 0 on success. 1093 * 1094 * Special care needs to be taken if the buffer already belongs to the 1095 * current committing transaction (in which case we should have frozen 1096 * data present for that commit). In that case, we don't relink the 1097 * buffer: that only gets done when the old transaction finally 1098 * completes its commit. 1099 */ 1100 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1101 { 1102 transaction_t *transaction = handle->h_transaction; 1103 journal_t *journal = transaction->t_journal; 1104 struct journal_head *jh = bh2jh(bh); 1105 int ret = 0; 1106 1107 jbd_debug(5, "journal_head %p\n", jh); 1108 JBUFFER_TRACE(jh, "entry"); 1109 if (is_handle_aborted(handle)) 1110 goto out; 1111 if (!buffer_jbd(bh)) { 1112 ret = -EUCLEAN; 1113 goto out; 1114 } 1115 1116 jbd_lock_bh_state(bh); 1117 1118 if (jh->b_modified == 0) { 1119 /* 1120 * This buffer's got modified and becoming part 1121 * of the transaction. This needs to be done 1122 * once a transaction -bzzz 1123 */ 1124 jh->b_modified = 1; 1125 J_ASSERT_JH(jh, handle->h_buffer_credits > 0); 1126 handle->h_buffer_credits--; 1127 } 1128 1129 /* 1130 * fastpath, to avoid expensive locking. If this buffer is already 1131 * on the running transaction's metadata list there is nothing to do. 1132 * Nobody can take it off again because there is a handle open. 1133 * I _think_ we're OK here with SMP barriers - a mistaken decision will 1134 * result in this test being false, so we go in and take the locks. 1135 */ 1136 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { 1137 JBUFFER_TRACE(jh, "fastpath"); 1138 if (unlikely(jh->b_transaction != 1139 journal->j_running_transaction)) { 1140 printk(KERN_EMERG "JBD: %s: " 1141 "jh->b_transaction (%llu, %p, %u) != " 1142 "journal->j_running_transaction (%p, %u)", 1143 journal->j_devname, 1144 (unsigned long long) bh->b_blocknr, 1145 jh->b_transaction, 1146 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1147 journal->j_running_transaction, 1148 journal->j_running_transaction ? 1149 journal->j_running_transaction->t_tid : 0); 1150 ret = -EINVAL; 1151 } 1152 goto out_unlock_bh; 1153 } 1154 1155 set_buffer_jbddirty(bh); 1156 1157 /* 1158 * Metadata already on the current transaction list doesn't 1159 * need to be filed. Metadata on another transaction's list must 1160 * be committing, and will be refiled once the commit completes: 1161 * leave it alone for now. 1162 */ 1163 if (jh->b_transaction != transaction) { 1164 JBUFFER_TRACE(jh, "already on other transaction"); 1165 if (unlikely(jh->b_transaction != 1166 journal->j_committing_transaction)) { 1167 printk(KERN_EMERG "JBD: %s: " 1168 "jh->b_transaction (%llu, %p, %u) != " 1169 "journal->j_committing_transaction (%p, %u)", 1170 journal->j_devname, 1171 (unsigned long long) bh->b_blocknr, 1172 jh->b_transaction, 1173 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1174 journal->j_committing_transaction, 1175 journal->j_committing_transaction ? 1176 journal->j_committing_transaction->t_tid : 0); 1177 ret = -EINVAL; 1178 } 1179 if (unlikely(jh->b_next_transaction != transaction)) { 1180 printk(KERN_EMERG "JBD: %s: " 1181 "jh->b_next_transaction (%llu, %p, %u) != " 1182 "transaction (%p, %u)", 1183 journal->j_devname, 1184 (unsigned long long) bh->b_blocknr, 1185 jh->b_next_transaction, 1186 jh->b_next_transaction ? 1187 jh->b_next_transaction->t_tid : 0, 1188 transaction, transaction->t_tid); 1189 ret = -EINVAL; 1190 } 1191 /* And this case is illegal: we can't reuse another 1192 * transaction's data buffer, ever. */ 1193 goto out_unlock_bh; 1194 } 1195 1196 /* That test should have eliminated the following case: */ 1197 J_ASSERT_JH(jh, jh->b_frozen_data == NULL); 1198 1199 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1200 spin_lock(&journal->j_list_lock); 1201 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); 1202 spin_unlock(&journal->j_list_lock); 1203 out_unlock_bh: 1204 jbd_unlock_bh_state(bh); 1205 out: 1206 JBUFFER_TRACE(jh, "exit"); 1207 WARN_ON(ret); /* All errors are bugs, so dump the stack */ 1208 return ret; 1209 } 1210 1211 /** 1212 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 1213 * @handle: transaction handle 1214 * @bh: bh to 'forget' 1215 * 1216 * We can only do the bforget if there are no commits pending against the 1217 * buffer. If the buffer is dirty in the current running transaction we 1218 * can safely unlink it. 1219 * 1220 * bh may not be a journalled buffer at all - it may be a non-JBD 1221 * buffer which came off the hashtable. Check for this. 1222 * 1223 * Decrements bh->b_count by one. 1224 * 1225 * Allow this call even if the handle has aborted --- it may be part of 1226 * the caller's cleanup after an abort. 1227 */ 1228 int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) 1229 { 1230 transaction_t *transaction = handle->h_transaction; 1231 journal_t *journal = transaction->t_journal; 1232 struct journal_head *jh; 1233 int drop_reserve = 0; 1234 int err = 0; 1235 int was_modified = 0; 1236 1237 BUFFER_TRACE(bh, "entry"); 1238 1239 jbd_lock_bh_state(bh); 1240 spin_lock(&journal->j_list_lock); 1241 1242 if (!buffer_jbd(bh)) 1243 goto not_jbd; 1244 jh = bh2jh(bh); 1245 1246 /* Critical error: attempting to delete a bitmap buffer, maybe? 1247 * Don't do any jbd operations, and return an error. */ 1248 if (!J_EXPECT_JH(jh, !jh->b_committed_data, 1249 "inconsistent data on disk")) { 1250 err = -EIO; 1251 goto not_jbd; 1252 } 1253 1254 /* keep track of whether or not this transaction modified us */ 1255 was_modified = jh->b_modified; 1256 1257 /* 1258 * The buffer's going from the transaction, we must drop 1259 * all references -bzzz 1260 */ 1261 jh->b_modified = 0; 1262 1263 if (jh->b_transaction == handle->h_transaction) { 1264 J_ASSERT_JH(jh, !jh->b_frozen_data); 1265 1266 /* If we are forgetting a buffer which is already part 1267 * of this transaction, then we can just drop it from 1268 * the transaction immediately. */ 1269 clear_buffer_dirty(bh); 1270 clear_buffer_jbddirty(bh); 1271 1272 JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); 1273 1274 /* 1275 * we only want to drop a reference if this transaction 1276 * modified the buffer 1277 */ 1278 if (was_modified) 1279 drop_reserve = 1; 1280 1281 /* 1282 * We are no longer going to journal this buffer. 1283 * However, the commit of this transaction is still 1284 * important to the buffer: the delete that we are now 1285 * processing might obsolete an old log entry, so by 1286 * committing, we can satisfy the buffer's checkpoint. 1287 * 1288 * So, if we have a checkpoint on the buffer, we should 1289 * now refile the buffer on our BJ_Forget list so that 1290 * we know to remove the checkpoint after we commit. 1291 */ 1292 1293 if (jh->b_cp_transaction) { 1294 __jbd2_journal_temp_unlink_buffer(jh); 1295 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1296 } else { 1297 __jbd2_journal_unfile_buffer(jh); 1298 if (!buffer_jbd(bh)) { 1299 spin_unlock(&journal->j_list_lock); 1300 jbd_unlock_bh_state(bh); 1301 __bforget(bh); 1302 goto drop; 1303 } 1304 } 1305 } else if (jh->b_transaction) { 1306 J_ASSERT_JH(jh, (jh->b_transaction == 1307 journal->j_committing_transaction)); 1308 /* However, if the buffer is still owned by a prior 1309 * (committing) transaction, we can't drop it yet... */ 1310 JBUFFER_TRACE(jh, "belongs to older transaction"); 1311 /* ... but we CAN drop it from the new transaction if we 1312 * have also modified it since the original commit. */ 1313 1314 if (jh->b_next_transaction) { 1315 J_ASSERT(jh->b_next_transaction == transaction); 1316 jh->b_next_transaction = NULL; 1317 1318 /* 1319 * only drop a reference if this transaction modified 1320 * the buffer 1321 */ 1322 if (was_modified) 1323 drop_reserve = 1; 1324 } 1325 } 1326 1327 not_jbd: 1328 spin_unlock(&journal->j_list_lock); 1329 jbd_unlock_bh_state(bh); 1330 __brelse(bh); 1331 drop: 1332 if (drop_reserve) { 1333 /* no need to reserve log space for this block -bzzz */ 1334 handle->h_buffer_credits++; 1335 } 1336 return err; 1337 } 1338 1339 /** 1340 * int jbd2_journal_stop() - complete a transaction 1341 * @handle: tranaction to complete. 1342 * 1343 * All done for a particular handle. 1344 * 1345 * There is not much action needed here. We just return any remaining 1346 * buffer credits to the transaction and remove the handle. The only 1347 * complication is that we need to start a commit operation if the 1348 * filesystem is marked for synchronous update. 1349 * 1350 * jbd2_journal_stop itself will not usually return an error, but it may 1351 * do so in unusual circumstances. In particular, expect it to 1352 * return -EIO if a jbd2_journal_abort has been executed since the 1353 * transaction began. 1354 */ 1355 int jbd2_journal_stop(handle_t *handle) 1356 { 1357 transaction_t *transaction = handle->h_transaction; 1358 journal_t *journal = transaction->t_journal; 1359 int err, wait_for_commit = 0; 1360 tid_t tid; 1361 pid_t pid; 1362 1363 J_ASSERT(journal_current_handle() == handle); 1364 1365 if (is_handle_aborted(handle)) 1366 err = -EIO; 1367 else { 1368 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 1369 err = 0; 1370 } 1371 1372 if (--handle->h_ref > 0) { 1373 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, 1374 handle->h_ref); 1375 return err; 1376 } 1377 1378 jbd_debug(4, "Handle %p going down\n", handle); 1379 1380 /* 1381 * Implement synchronous transaction batching. If the handle 1382 * was synchronous, don't force a commit immediately. Let's 1383 * yield and let another thread piggyback onto this 1384 * transaction. Keep doing that while new threads continue to 1385 * arrive. It doesn't cost much - we're about to run a commit 1386 * and sleep on IO anyway. Speeds up many-threaded, many-dir 1387 * operations by 30x or more... 1388 * 1389 * We try and optimize the sleep time against what the 1390 * underlying disk can do, instead of having a static sleep 1391 * time. This is useful for the case where our storage is so 1392 * fast that it is more optimal to go ahead and force a flush 1393 * and wait for the transaction to be committed than it is to 1394 * wait for an arbitrary amount of time for new writers to 1395 * join the transaction. We achieve this by measuring how 1396 * long it takes to commit a transaction, and compare it with 1397 * how long this transaction has been running, and if run time 1398 * < commit time then we sleep for the delta and commit. This 1399 * greatly helps super fast disks that would see slowdowns as 1400 * more threads started doing fsyncs. 1401 * 1402 * But don't do this if this process was the most recent one 1403 * to perform a synchronous write. We do this to detect the 1404 * case where a single process is doing a stream of sync 1405 * writes. No point in waiting for joiners in that case. 1406 */ 1407 pid = current->pid; 1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1409 u64 commit_time, trans_time; 1410 1411 journal->j_last_sync_writer = pid; 1412 1413 read_lock(&journal->j_state_lock); 1414 commit_time = journal->j_average_commit_time; 1415 read_unlock(&journal->j_state_lock); 1416 1417 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1418 transaction->t_start_time)); 1419 1420 commit_time = max_t(u64, commit_time, 1421 1000*journal->j_min_batch_time); 1422 commit_time = min_t(u64, commit_time, 1423 1000*journal->j_max_batch_time); 1424 1425 if (trans_time < commit_time) { 1426 ktime_t expires = ktime_add_ns(ktime_get(), 1427 commit_time); 1428 set_current_state(TASK_UNINTERRUPTIBLE); 1429 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); 1430 } 1431 } 1432 1433 if (handle->h_sync) 1434 transaction->t_synchronous_commit = 1; 1435 current->journal_info = NULL; 1436 atomic_sub(handle->h_buffer_credits, 1437 &transaction->t_outstanding_credits); 1438 1439 /* 1440 * If the handle is marked SYNC, we need to set another commit 1441 * going! We also want to force a commit if the current 1442 * transaction is occupying too much of the log, or if the 1443 * transaction is too old now. 1444 */ 1445 if (handle->h_sync || 1446 (atomic_read(&transaction->t_outstanding_credits) > 1447 journal->j_max_transaction_buffers) || 1448 time_after_eq(jiffies, transaction->t_expires)) { 1449 /* Do this even for aborted journals: an abort still 1450 * completes the commit thread, it just doesn't write 1451 * anything to disk. */ 1452 1453 jbd_debug(2, "transaction too old, requesting commit for " 1454 "handle %p\n", handle); 1455 /* This is non-blocking */ 1456 jbd2_log_start_commit(journal, transaction->t_tid); 1457 1458 /* 1459 * Special case: JBD2_SYNC synchronous updates require us 1460 * to wait for the commit to complete. 1461 */ 1462 if (handle->h_sync && !(current->flags & PF_MEMALLOC)) 1463 wait_for_commit = 1; 1464 } 1465 1466 /* 1467 * Once we drop t_updates, if it goes to zero the transaction 1468 * could start committing on us and eventually disappear. So 1469 * once we do this, we must not dereference transaction 1470 * pointer again. 1471 */ 1472 tid = transaction->t_tid; 1473 if (atomic_dec_and_test(&transaction->t_updates)) { 1474 wake_up(&journal->j_wait_updates); 1475 if (journal->j_barrier_count) 1476 wake_up(&journal->j_wait_transaction_locked); 1477 } 1478 1479 if (wait_for_commit) 1480 err = jbd2_log_wait_commit(journal, tid); 1481 1482 lock_map_release(&handle->h_lockdep_map); 1483 1484 jbd2_free_handle(handle); 1485 return err; 1486 } 1487 1488 /** 1489 * int jbd2_journal_force_commit() - force any uncommitted transactions 1490 * @journal: journal to force 1491 * 1492 * For synchronous operations: force any uncommitted transactions 1493 * to disk. May seem kludgy, but it reuses all the handle batching 1494 * code in a very simple manner. 1495 */ 1496 int jbd2_journal_force_commit(journal_t *journal) 1497 { 1498 handle_t *handle; 1499 int ret; 1500 1501 handle = jbd2_journal_start(journal, 1); 1502 if (IS_ERR(handle)) { 1503 ret = PTR_ERR(handle); 1504 } else { 1505 handle->h_sync = 1; 1506 ret = jbd2_journal_stop(handle); 1507 } 1508 return ret; 1509 } 1510 1511 /* 1512 * 1513 * List management code snippets: various functions for manipulating the 1514 * transaction buffer lists. 1515 * 1516 */ 1517 1518 /* 1519 * Append a buffer to a transaction list, given the transaction's list head 1520 * pointer. 1521 * 1522 * j_list_lock is held. 1523 * 1524 * jbd_lock_bh_state(jh2bh(jh)) is held. 1525 */ 1526 1527 static inline void 1528 __blist_add_buffer(struct journal_head **list, struct journal_head *jh) 1529 { 1530 if (!*list) { 1531 jh->b_tnext = jh->b_tprev = jh; 1532 *list = jh; 1533 } else { 1534 /* Insert at the tail of the list to preserve order */ 1535 struct journal_head *first = *list, *last = first->b_tprev; 1536 jh->b_tprev = last; 1537 jh->b_tnext = first; 1538 last->b_tnext = first->b_tprev = jh; 1539 } 1540 } 1541 1542 /* 1543 * Remove a buffer from a transaction list, given the transaction's list 1544 * head pointer. 1545 * 1546 * Called with j_list_lock held, and the journal may not be locked. 1547 * 1548 * jbd_lock_bh_state(jh2bh(jh)) is held. 1549 */ 1550 1551 static inline void 1552 __blist_del_buffer(struct journal_head **list, struct journal_head *jh) 1553 { 1554 if (*list == jh) { 1555 *list = jh->b_tnext; 1556 if (*list == jh) 1557 *list = NULL; 1558 } 1559 jh->b_tprev->b_tnext = jh->b_tnext; 1560 jh->b_tnext->b_tprev = jh->b_tprev; 1561 } 1562 1563 /* 1564 * Remove a buffer from the appropriate transaction list. 1565 * 1566 * Note that this function can *change* the value of 1567 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, 1568 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one 1569 * of these pointers, it could go bad. Generally the caller needs to re-read 1570 * the pointer from the transaction_t. 1571 * 1572 * Called under j_list_lock. 1573 */ 1574 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 1575 { 1576 struct journal_head **list = NULL; 1577 transaction_t *transaction; 1578 struct buffer_head *bh = jh2bh(jh); 1579 1580 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1581 transaction = jh->b_transaction; 1582 if (transaction) 1583 assert_spin_locked(&transaction->t_journal->j_list_lock); 1584 1585 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 1586 if (jh->b_jlist != BJ_None) 1587 J_ASSERT_JH(jh, transaction != NULL); 1588 1589 switch (jh->b_jlist) { 1590 case BJ_None: 1591 return; 1592 case BJ_Metadata: 1593 transaction->t_nr_buffers--; 1594 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); 1595 list = &transaction->t_buffers; 1596 break; 1597 case BJ_Forget: 1598 list = &transaction->t_forget; 1599 break; 1600 case BJ_IO: 1601 list = &transaction->t_iobuf_list; 1602 break; 1603 case BJ_Shadow: 1604 list = &transaction->t_shadow_list; 1605 break; 1606 case BJ_LogCtl: 1607 list = &transaction->t_log_list; 1608 break; 1609 case BJ_Reserved: 1610 list = &transaction->t_reserved_list; 1611 break; 1612 } 1613 1614 __blist_del_buffer(list, jh); 1615 jh->b_jlist = BJ_None; 1616 if (test_clear_buffer_jbddirty(bh)) 1617 mark_buffer_dirty(bh); /* Expose it to the VM */ 1618 } 1619 1620 /* 1621 * Remove buffer from all transactions. 1622 * 1623 * Called with bh_state lock and j_list_lock 1624 * 1625 * jh and bh may be already freed when this function returns. 1626 */ 1627 static void __jbd2_journal_unfile_buffer(struct journal_head *jh) 1628 { 1629 __jbd2_journal_temp_unlink_buffer(jh); 1630 jh->b_transaction = NULL; 1631 jbd2_journal_put_journal_head(jh); 1632 } 1633 1634 void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) 1635 { 1636 struct buffer_head *bh = jh2bh(jh); 1637 1638 /* Get reference so that buffer cannot be freed before we unlock it */ 1639 get_bh(bh); 1640 jbd_lock_bh_state(bh); 1641 spin_lock(&journal->j_list_lock); 1642 __jbd2_journal_unfile_buffer(jh); 1643 spin_unlock(&journal->j_list_lock); 1644 jbd_unlock_bh_state(bh); 1645 __brelse(bh); 1646 } 1647 1648 /* 1649 * Called from jbd2_journal_try_to_free_buffers(). 1650 * 1651 * Called under jbd_lock_bh_state(bh) 1652 */ 1653 static void 1654 __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) 1655 { 1656 struct journal_head *jh; 1657 1658 jh = bh2jh(bh); 1659 1660 if (buffer_locked(bh) || buffer_dirty(bh)) 1661 goto out; 1662 1663 if (jh->b_next_transaction != NULL) 1664 goto out; 1665 1666 spin_lock(&journal->j_list_lock); 1667 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1668 /* written-back checkpointed metadata buffer */ 1669 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1670 __jbd2_journal_remove_checkpoint(jh); 1671 } 1672 spin_unlock(&journal->j_list_lock); 1673 out: 1674 return; 1675 } 1676 1677 /** 1678 * int jbd2_journal_try_to_free_buffers() - try to free page buffers. 1679 * @journal: journal for operation 1680 * @page: to try and free 1681 * @gfp_mask: we use the mask to detect how hard should we try to release 1682 * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to 1683 * release the buffers. 1684 * 1685 * 1686 * For all the buffers on this page, 1687 * if they are fully written out ordered data, move them onto BUF_CLEAN 1688 * so try_to_free_buffers() can reap them. 1689 * 1690 * This function returns non-zero if we wish try_to_free_buffers() 1691 * to be called. We do this if the page is releasable by try_to_free_buffers(). 1692 * We also do it if the page has locked or dirty buffers and the caller wants 1693 * us to perform sync or async writeout. 1694 * 1695 * This complicates JBD locking somewhat. We aren't protected by the 1696 * BKL here. We wish to remove the buffer from its committing or 1697 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer. 1698 * 1699 * This may *change* the value of transaction_t->t_datalist, so anyone 1700 * who looks at t_datalist needs to lock against this function. 1701 * 1702 * Even worse, someone may be doing a jbd2_journal_dirty_data on this 1703 * buffer. So we need to lock against that. jbd2_journal_dirty_data() 1704 * will come out of the lock with the buffer dirty, which makes it 1705 * ineligible for release here. 1706 * 1707 * Who else is affected by this? hmm... Really the only contender 1708 * is do_get_write_access() - it could be looking at the buffer while 1709 * journal_try_to_free_buffer() is changing its state. But that 1710 * cannot happen because we never reallocate freed data as metadata 1711 * while the data is part of a transaction. Yes? 1712 * 1713 * Return 0 on failure, 1 on success 1714 */ 1715 int jbd2_journal_try_to_free_buffers(journal_t *journal, 1716 struct page *page, gfp_t gfp_mask) 1717 { 1718 struct buffer_head *head; 1719 struct buffer_head *bh; 1720 int ret = 0; 1721 1722 J_ASSERT(PageLocked(page)); 1723 1724 head = page_buffers(page); 1725 bh = head; 1726 do { 1727 struct journal_head *jh; 1728 1729 /* 1730 * We take our own ref against the journal_head here to avoid 1731 * having to add tons of locking around each instance of 1732 * jbd2_journal_put_journal_head(). 1733 */ 1734 jh = jbd2_journal_grab_journal_head(bh); 1735 if (!jh) 1736 continue; 1737 1738 jbd_lock_bh_state(bh); 1739 __journal_try_to_free_buffer(journal, bh); 1740 jbd2_journal_put_journal_head(jh); 1741 jbd_unlock_bh_state(bh); 1742 if (buffer_jbd(bh)) 1743 goto busy; 1744 } while ((bh = bh->b_this_page) != head); 1745 1746 ret = try_to_free_buffers(page); 1747 1748 busy: 1749 return ret; 1750 } 1751 1752 /* 1753 * This buffer is no longer needed. If it is on an older transaction's 1754 * checkpoint list we need to record it on this transaction's forget list 1755 * to pin this buffer (and hence its checkpointing transaction) down until 1756 * this transaction commits. If the buffer isn't on a checkpoint list, we 1757 * release it. 1758 * Returns non-zero if JBD no longer has an interest in the buffer. 1759 * 1760 * Called under j_list_lock. 1761 * 1762 * Called under jbd_lock_bh_state(bh). 1763 */ 1764 static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) 1765 { 1766 int may_free = 1; 1767 struct buffer_head *bh = jh2bh(jh); 1768 1769 if (jh->b_cp_transaction) { 1770 JBUFFER_TRACE(jh, "on running+cp transaction"); 1771 __jbd2_journal_temp_unlink_buffer(jh); 1772 /* 1773 * We don't want to write the buffer anymore, clear the 1774 * bit so that we don't confuse checks in 1775 * __journal_file_buffer 1776 */ 1777 clear_buffer_dirty(bh); 1778 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1779 may_free = 0; 1780 } else { 1781 JBUFFER_TRACE(jh, "on running transaction"); 1782 __jbd2_journal_unfile_buffer(jh); 1783 } 1784 return may_free; 1785 } 1786 1787 /* 1788 * jbd2_journal_invalidatepage 1789 * 1790 * This code is tricky. It has a number of cases to deal with. 1791 * 1792 * There are two invariants which this code relies on: 1793 * 1794 * i_size must be updated on disk before we start calling invalidatepage on the 1795 * data. 1796 * 1797 * This is done in ext3 by defining an ext3_setattr method which 1798 * updates i_size before truncate gets going. By maintaining this 1799 * invariant, we can be sure that it is safe to throw away any buffers 1800 * attached to the current transaction: once the transaction commits, 1801 * we know that the data will not be needed. 1802 * 1803 * Note however that we can *not* throw away data belonging to the 1804 * previous, committing transaction! 1805 * 1806 * Any disk blocks which *are* part of the previous, committing 1807 * transaction (and which therefore cannot be discarded immediately) are 1808 * not going to be reused in the new running transaction 1809 * 1810 * The bitmap committed_data images guarantee this: any block which is 1811 * allocated in one transaction and removed in the next will be marked 1812 * as in-use in the committed_data bitmap, so cannot be reused until 1813 * the next transaction to delete the block commits. This means that 1814 * leaving committing buffers dirty is quite safe: the disk blocks 1815 * cannot be reallocated to a different file and so buffer aliasing is 1816 * not possible. 1817 * 1818 * 1819 * The above applies mainly to ordered data mode. In writeback mode we 1820 * don't make guarantees about the order in which data hits disk --- in 1821 * particular we don't guarantee that new dirty data is flushed before 1822 * transaction commit --- so it is always safe just to discard data 1823 * immediately in that mode. --sct 1824 */ 1825 1826 /* 1827 * The journal_unmap_buffer helper function returns zero if the buffer 1828 * concerned remains pinned as an anonymous buffer belonging to an older 1829 * transaction. 1830 * 1831 * We're outside-transaction here. Either or both of j_running_transaction 1832 * and j_committing_transaction may be NULL. 1833 */ 1834 static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, 1835 int partial_page) 1836 { 1837 transaction_t *transaction; 1838 struct journal_head *jh; 1839 int may_free = 1; 1840 1841 BUFFER_TRACE(bh, "entry"); 1842 1843 /* 1844 * It is safe to proceed here without the j_list_lock because the 1845 * buffers cannot be stolen by try_to_free_buffers as long as we are 1846 * holding the page lock. --sct 1847 */ 1848 1849 if (!buffer_jbd(bh)) 1850 goto zap_buffer_unlocked; 1851 1852 /* OK, we have data buffer in journaled mode */ 1853 write_lock(&journal->j_state_lock); 1854 jbd_lock_bh_state(bh); 1855 spin_lock(&journal->j_list_lock); 1856 1857 jh = jbd2_journal_grab_journal_head(bh); 1858 if (!jh) 1859 goto zap_buffer_no_jh; 1860 1861 /* 1862 * We cannot remove the buffer from checkpoint lists until the 1863 * transaction adding inode to orphan list (let's call it T) 1864 * is committed. Otherwise if the transaction changing the 1865 * buffer would be cleaned from the journal before T is 1866 * committed, a crash will cause that the correct contents of 1867 * the buffer will be lost. On the other hand we have to 1868 * clear the buffer dirty bit at latest at the moment when the 1869 * transaction marking the buffer as freed in the filesystem 1870 * structures is committed because from that moment on the 1871 * block can be reallocated and used by a different page. 1872 * Since the block hasn't been freed yet but the inode has 1873 * already been added to orphan list, it is safe for us to add 1874 * the buffer to BJ_Forget list of the newest transaction. 1875 * 1876 * Also we have to clear buffer_mapped flag of a truncated buffer 1877 * because the buffer_head may be attached to the page straddling 1878 * i_size (can happen only when blocksize < pagesize) and thus the 1879 * buffer_head can be reused when the file is extended again. So we end 1880 * up keeping around invalidated buffers attached to transactions' 1881 * BJ_Forget list just to stop checkpointing code from cleaning up 1882 * the transaction this buffer was modified in. 1883 */ 1884 transaction = jh->b_transaction; 1885 if (transaction == NULL) { 1886 /* First case: not on any transaction. If it 1887 * has no checkpoint link, then we can zap it: 1888 * it's a writeback-mode buffer so we don't care 1889 * if it hits disk safely. */ 1890 if (!jh->b_cp_transaction) { 1891 JBUFFER_TRACE(jh, "not on any transaction: zap"); 1892 goto zap_buffer; 1893 } 1894 1895 if (!buffer_dirty(bh)) { 1896 /* bdflush has written it. We can drop it now */ 1897 goto zap_buffer; 1898 } 1899 1900 /* OK, it must be in the journal but still not 1901 * written fully to disk: it's metadata or 1902 * journaled data... */ 1903 1904 if (journal->j_running_transaction) { 1905 /* ... and once the current transaction has 1906 * committed, the buffer won't be needed any 1907 * longer. */ 1908 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); 1909 may_free = __dispose_buffer(jh, 1910 journal->j_running_transaction); 1911 goto zap_buffer; 1912 } else { 1913 /* There is no currently-running transaction. So the 1914 * orphan record which we wrote for this file must have 1915 * passed into commit. We must attach this buffer to 1916 * the committing transaction, if it exists. */ 1917 if (journal->j_committing_transaction) { 1918 JBUFFER_TRACE(jh, "give to committing trans"); 1919 may_free = __dispose_buffer(jh, 1920 journal->j_committing_transaction); 1921 goto zap_buffer; 1922 } else { 1923 /* The orphan record's transaction has 1924 * committed. We can cleanse this buffer */ 1925 clear_buffer_jbddirty(bh); 1926 goto zap_buffer; 1927 } 1928 } 1929 } else if (transaction == journal->j_committing_transaction) { 1930 JBUFFER_TRACE(jh, "on committing transaction"); 1931 /* 1932 * The buffer is committing, we simply cannot touch 1933 * it. If the page is straddling i_size we have to wait 1934 * for commit and try again. 1935 */ 1936 if (partial_page) { 1937 jbd2_journal_put_journal_head(jh); 1938 spin_unlock(&journal->j_list_lock); 1939 jbd_unlock_bh_state(bh); 1940 write_unlock(&journal->j_state_lock); 1941 return -EBUSY; 1942 } 1943 /* 1944 * OK, buffer won't be reachable after truncate. We just set 1945 * j_next_transaction to the running transaction (if there is 1946 * one) and mark buffer as freed so that commit code knows it 1947 * should clear dirty bits when it is done with the buffer. 1948 */ 1949 set_buffer_freed(bh); 1950 if (journal->j_running_transaction && buffer_jbddirty(bh)) 1951 jh->b_next_transaction = journal->j_running_transaction; 1952 jbd2_journal_put_journal_head(jh); 1953 spin_unlock(&journal->j_list_lock); 1954 jbd_unlock_bh_state(bh); 1955 write_unlock(&journal->j_state_lock); 1956 return 0; 1957 } else { 1958 /* Good, the buffer belongs to the running transaction. 1959 * We are writing our own transaction's data, not any 1960 * previous one's, so it is safe to throw it away 1961 * (remember that we expect the filesystem to have set 1962 * i_size already for this truncate so recovery will not 1963 * expose the disk blocks we are discarding here.) */ 1964 J_ASSERT_JH(jh, transaction == journal->j_running_transaction); 1965 JBUFFER_TRACE(jh, "on running transaction"); 1966 may_free = __dispose_buffer(jh, transaction); 1967 } 1968 1969 zap_buffer: 1970 /* 1971 * This is tricky. Although the buffer is truncated, it may be reused 1972 * if blocksize < pagesize and it is attached to the page straddling 1973 * EOF. Since the buffer might have been added to BJ_Forget list of the 1974 * running transaction, journal_get_write_access() won't clear 1975 * b_modified and credit accounting gets confused. So clear b_modified 1976 * here. 1977 */ 1978 jh->b_modified = 0; 1979 jbd2_journal_put_journal_head(jh); 1980 zap_buffer_no_jh: 1981 spin_unlock(&journal->j_list_lock); 1982 jbd_unlock_bh_state(bh); 1983 write_unlock(&journal->j_state_lock); 1984 zap_buffer_unlocked: 1985 clear_buffer_dirty(bh); 1986 J_ASSERT_BH(bh, !buffer_jbddirty(bh)); 1987 clear_buffer_mapped(bh); 1988 clear_buffer_req(bh); 1989 clear_buffer_new(bh); 1990 clear_buffer_delay(bh); 1991 clear_buffer_unwritten(bh); 1992 bh->b_bdev = NULL; 1993 return may_free; 1994 } 1995 1996 /** 1997 * void jbd2_journal_invalidatepage() 1998 * @journal: journal to use for flush... 1999 * @page: page to flush 2000 * @offset: length of page to invalidate. 2001 * 2002 * Reap page buffers containing data after offset in page. Can return -EBUSY 2003 * if buffers are part of the committing transaction and the page is straddling 2004 * i_size. Caller then has to wait for current commit and try again. 2005 */ 2006 int jbd2_journal_invalidatepage(journal_t *journal, 2007 struct page *page, 2008 unsigned long offset) 2009 { 2010 struct buffer_head *head, *bh, *next; 2011 unsigned int curr_off = 0; 2012 int may_free = 1; 2013 int ret = 0; 2014 2015 if (!PageLocked(page)) 2016 BUG(); 2017 if (!page_has_buffers(page)) 2018 return 0; 2019 2020 /* We will potentially be playing with lists other than just the 2021 * data lists (especially for journaled data mode), so be 2022 * cautious in our locking. */ 2023 2024 head = bh = page_buffers(page); 2025 do { 2026 unsigned int next_off = curr_off + bh->b_size; 2027 next = bh->b_this_page; 2028 2029 if (offset <= curr_off) { 2030 /* This block is wholly outside the truncation point */ 2031 lock_buffer(bh); 2032 ret = journal_unmap_buffer(journal, bh, offset > 0); 2033 unlock_buffer(bh); 2034 if (ret < 0) 2035 return ret; 2036 may_free &= ret; 2037 } 2038 curr_off = next_off; 2039 bh = next; 2040 2041 } while (bh != head); 2042 2043 if (!offset) { 2044 if (may_free && try_to_free_buffers(page)) 2045 J_ASSERT(!page_has_buffers(page)); 2046 } 2047 return 0; 2048 } 2049 2050 /* 2051 * File a buffer on the given transaction list. 2052 */ 2053 void __jbd2_journal_file_buffer(struct journal_head *jh, 2054 transaction_t *transaction, int jlist) 2055 { 2056 struct journal_head **list = NULL; 2057 int was_dirty = 0; 2058 struct buffer_head *bh = jh2bh(jh); 2059 2060 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2061 assert_spin_locked(&transaction->t_journal->j_list_lock); 2062 2063 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 2064 J_ASSERT_JH(jh, jh->b_transaction == transaction || 2065 jh->b_transaction == NULL); 2066 2067 if (jh->b_transaction && jh->b_jlist == jlist) 2068 return; 2069 2070 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2071 jlist == BJ_Shadow || jlist == BJ_Forget) { 2072 /* 2073 * For metadata buffers, we track dirty bit in buffer_jbddirty 2074 * instead of buffer_dirty. We should not see a dirty bit set 2075 * here because we clear it in do_get_write_access but e.g. 2076 * tune2fs can modify the sb and set the dirty bit at any time 2077 * so we try to gracefully handle that. 2078 */ 2079 if (buffer_dirty(bh)) 2080 warn_dirty_buffer(bh); 2081 if (test_clear_buffer_dirty(bh) || 2082 test_clear_buffer_jbddirty(bh)) 2083 was_dirty = 1; 2084 } 2085 2086 if (jh->b_transaction) 2087 __jbd2_journal_temp_unlink_buffer(jh); 2088 else 2089 jbd2_journal_grab_journal_head(bh); 2090 jh->b_transaction = transaction; 2091 2092 switch (jlist) { 2093 case BJ_None: 2094 J_ASSERT_JH(jh, !jh->b_committed_data); 2095 J_ASSERT_JH(jh, !jh->b_frozen_data); 2096 return; 2097 case BJ_Metadata: 2098 transaction->t_nr_buffers++; 2099 list = &transaction->t_buffers; 2100 break; 2101 case BJ_Forget: 2102 list = &transaction->t_forget; 2103 break; 2104 case BJ_IO: 2105 list = &transaction->t_iobuf_list; 2106 break; 2107 case BJ_Shadow: 2108 list = &transaction->t_shadow_list; 2109 break; 2110 case BJ_LogCtl: 2111 list = &transaction->t_log_list; 2112 break; 2113 case BJ_Reserved: 2114 list = &transaction->t_reserved_list; 2115 break; 2116 } 2117 2118 __blist_add_buffer(list, jh); 2119 jh->b_jlist = jlist; 2120 2121 if (was_dirty) 2122 set_buffer_jbddirty(bh); 2123 } 2124 2125 void jbd2_journal_file_buffer(struct journal_head *jh, 2126 transaction_t *transaction, int jlist) 2127 { 2128 jbd_lock_bh_state(jh2bh(jh)); 2129 spin_lock(&transaction->t_journal->j_list_lock); 2130 __jbd2_journal_file_buffer(jh, transaction, jlist); 2131 spin_unlock(&transaction->t_journal->j_list_lock); 2132 jbd_unlock_bh_state(jh2bh(jh)); 2133 } 2134 2135 /* 2136 * Remove a buffer from its current buffer list in preparation for 2137 * dropping it from its current transaction entirely. If the buffer has 2138 * already started to be used by a subsequent transaction, refile the 2139 * buffer on that transaction's metadata list. 2140 * 2141 * Called under j_list_lock 2142 * Called under jbd_lock_bh_state(jh2bh(jh)) 2143 * 2144 * jh and bh may be already free when this function returns 2145 */ 2146 void __jbd2_journal_refile_buffer(struct journal_head *jh) 2147 { 2148 int was_dirty, jlist; 2149 struct buffer_head *bh = jh2bh(jh); 2150 2151 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2152 if (jh->b_transaction) 2153 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); 2154 2155 /* If the buffer is now unused, just drop it. */ 2156 if (jh->b_next_transaction == NULL) { 2157 __jbd2_journal_unfile_buffer(jh); 2158 return; 2159 } 2160 2161 /* 2162 * It has been modified by a later transaction: add it to the new 2163 * transaction's metadata list. 2164 */ 2165 2166 was_dirty = test_clear_buffer_jbddirty(bh); 2167 __jbd2_journal_temp_unlink_buffer(jh); 2168 /* 2169 * We set b_transaction here because b_next_transaction will inherit 2170 * our jh reference and thus __jbd2_journal_file_buffer() must not 2171 * take a new one. 2172 */ 2173 jh->b_transaction = jh->b_next_transaction; 2174 jh->b_next_transaction = NULL; 2175 if (buffer_freed(bh)) 2176 jlist = BJ_Forget; 2177 else if (jh->b_modified) 2178 jlist = BJ_Metadata; 2179 else 2180 jlist = BJ_Reserved; 2181 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); 2182 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2183 2184 if (was_dirty) 2185 set_buffer_jbddirty(bh); 2186 } 2187 2188 /* 2189 * __jbd2_journal_refile_buffer() with necessary locking added. We take our 2190 * bh reference so that we can safely unlock bh. 2191 * 2192 * The jh and bh may be freed by this call. 2193 */ 2194 void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) 2195 { 2196 struct buffer_head *bh = jh2bh(jh); 2197 2198 /* Get reference so that buffer cannot be freed before we unlock it */ 2199 get_bh(bh); 2200 jbd_lock_bh_state(bh); 2201 spin_lock(&journal->j_list_lock); 2202 __jbd2_journal_refile_buffer(jh); 2203 jbd_unlock_bh_state(bh); 2204 spin_unlock(&journal->j_list_lock); 2205 __brelse(bh); 2206 } 2207 2208 /* 2209 * File inode in the inode list of the handle's transaction 2210 */ 2211 int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2212 { 2213 transaction_t *transaction = handle->h_transaction; 2214 journal_t *journal = transaction->t_journal; 2215 2216 if (is_handle_aborted(handle)) 2217 return -EIO; 2218 2219 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, 2220 transaction->t_tid); 2221 2222 /* 2223 * First check whether inode isn't already on the transaction's 2224 * lists without taking the lock. Note that this check is safe 2225 * without the lock as we cannot race with somebody removing inode 2226 * from the transaction. The reason is that we remove inode from the 2227 * transaction only in journal_release_jbd_inode() and when we commit 2228 * the transaction. We are guarded from the first case by holding 2229 * a reference to the inode. We are safe against the second case 2230 * because if jinode->i_transaction == transaction, commit code 2231 * cannot touch the transaction because we hold reference to it, 2232 * and if jinode->i_next_transaction == transaction, commit code 2233 * will only file the inode where we want it. 2234 */ 2235 if (jinode->i_transaction == transaction || 2236 jinode->i_next_transaction == transaction) 2237 return 0; 2238 2239 spin_lock(&journal->j_list_lock); 2240 2241 if (jinode->i_transaction == transaction || 2242 jinode->i_next_transaction == transaction) 2243 goto done; 2244 2245 /* 2246 * We only ever set this variable to 1 so the test is safe. Since 2247 * t_need_data_flush is likely to be set, we do the test to save some 2248 * cacheline bouncing 2249 */ 2250 if (!transaction->t_need_data_flush) 2251 transaction->t_need_data_flush = 1; 2252 /* On some different transaction's list - should be 2253 * the committing one */ 2254 if (jinode->i_transaction) { 2255 J_ASSERT(jinode->i_next_transaction == NULL); 2256 J_ASSERT(jinode->i_transaction == 2257 journal->j_committing_transaction); 2258 jinode->i_next_transaction = transaction; 2259 goto done; 2260 } 2261 /* Not on any transaction list... */ 2262 J_ASSERT(!jinode->i_next_transaction); 2263 jinode->i_transaction = transaction; 2264 list_add(&jinode->i_list, &transaction->t_inode_list); 2265 done: 2266 spin_unlock(&journal->j_list_lock); 2267 2268 return 0; 2269 } 2270 2271 /* 2272 * File truncate and transaction commit interact with each other in a 2273 * non-trivial way. If a transaction writing data block A is 2274 * committing, we cannot discard the data by truncate until we have 2275 * written them. Otherwise if we crashed after the transaction with 2276 * write has committed but before the transaction with truncate has 2277 * committed, we could see stale data in block A. This function is a 2278 * helper to solve this problem. It starts writeout of the truncated 2279 * part in case it is in the committing transaction. 2280 * 2281 * Filesystem code must call this function when inode is journaled in 2282 * ordered mode before truncation happens and after the inode has been 2283 * placed on orphan list with the new inode size. The second condition 2284 * avoids the race that someone writes new data and we start 2285 * committing the transaction after this function has been called but 2286 * before a transaction for truncate is started (and furthermore it 2287 * allows us to optimize the case where the addition to orphan list 2288 * happens in the same transaction as write --- we don't have to write 2289 * any data in such case). 2290 */ 2291 int jbd2_journal_begin_ordered_truncate(journal_t *journal, 2292 struct jbd2_inode *jinode, 2293 loff_t new_size) 2294 { 2295 transaction_t *inode_trans, *commit_trans; 2296 int ret = 0; 2297 2298 /* This is a quick check to avoid locking if not necessary */ 2299 if (!jinode->i_transaction) 2300 goto out; 2301 /* Locks are here just to force reading of recent values, it is 2302 * enough that the transaction was not committing before we started 2303 * a transaction adding the inode to orphan list */ 2304 read_lock(&journal->j_state_lock); 2305 commit_trans = journal->j_committing_transaction; 2306 read_unlock(&journal->j_state_lock); 2307 spin_lock(&journal->j_list_lock); 2308 inode_trans = jinode->i_transaction; 2309 spin_unlock(&journal->j_list_lock); 2310 if (inode_trans == commit_trans) { 2311 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, 2312 new_size, LLONG_MAX); 2313 if (ret) 2314 jbd2_journal_abort(journal, ret); 2315 } 2316 out: 2317 return ret; 2318 } 2319