1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * journal.c 5 * 6 * Defines functions of journalling api 7 * 8 * Copyright (C) 2003, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/kthread.h> 31 32 #define MLOG_MASK_PREFIX ML_JOURNAL 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "dlmglue.h" 39 #include "extent_map.h" 40 #include "heartbeat.h" 41 #include "inode.h" 42 #include "journal.h" 43 #include "localalloc.h" 44 #include "namei.h" 45 #include "slot_map.h" 46 #include "super.h" 47 #include "vote.h" 48 #include "sysfile.h" 49 50 #include "buffer_head_io.h" 51 52 spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; 53 54 static int ocfs2_force_read_journal(struct inode *inode); 55 static int ocfs2_recover_node(struct ocfs2_super *osb, 56 int node_num); 57 static int __ocfs2_recovery_thread(void *arg); 58 static int ocfs2_commit_cache(struct ocfs2_super *osb); 59 static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 60 static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, 61 struct ocfs2_journal_handle *handle); 62 static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle); 63 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 64 int dirty); 65 static int ocfs2_trylock_journal(struct ocfs2_super *osb, 66 int slot_num); 67 static int ocfs2_recover_orphans(struct ocfs2_super *osb, 68 int slot); 69 static int ocfs2_commit_thread(void *arg); 70 71 static int ocfs2_commit_cache(struct ocfs2_super *osb) 72 { 73 int status = 0; 74 unsigned int flushed; 75 unsigned long old_id; 76 struct ocfs2_journal *journal = NULL; 77 78 mlog_entry_void(); 79 80 journal = osb->journal; 81 82 /* Flush all pending commits and checkpoint the journal. */ 83 down_write(&journal->j_trans_barrier); 84 85 if (atomic_read(&journal->j_num_trans) == 0) { 86 up_write(&journal->j_trans_barrier); 87 mlog(0, "No transactions for me to flush!\n"); 88 goto finally; 89 } 90 91 journal_lock_updates(journal->j_journal); 92 status = journal_flush(journal->j_journal); 93 journal_unlock_updates(journal->j_journal); 94 if (status < 0) { 95 up_write(&journal->j_trans_barrier); 96 mlog_errno(status); 97 goto finally; 98 } 99 100 old_id = ocfs2_inc_trans_id(journal); 101 102 flushed = atomic_read(&journal->j_num_trans); 103 atomic_set(&journal->j_num_trans, 0); 104 up_write(&journal->j_trans_barrier); 105 106 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 107 journal->j_trans_id, flushed); 108 109 ocfs2_kick_vote_thread(osb); 110 wake_up(&journal->j_checkpointed); 111 finally: 112 mlog_exit(status); 113 return status; 114 } 115 116 struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) 117 { 118 struct ocfs2_journal_handle *retval = NULL; 119 120 retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); 121 if (!retval) { 122 mlog(ML_ERROR, "Failed to allocate memory for journal " 123 "handle!\n"); 124 return NULL; 125 } 126 127 retval->max_buffs = 0; 128 retval->num_locks = 0; 129 retval->k_handle = NULL; 130 131 INIT_LIST_HEAD(&retval->locks); 132 INIT_LIST_HEAD(&retval->inode_list); 133 retval->journal = osb->journal; 134 135 return retval; 136 } 137 138 /* pass it NULL and it will allocate a new handle object for you. If 139 * you pass it a handle however, it may still return error, in which 140 * case it has free'd the passed handle for you. */ 141 struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, 142 struct ocfs2_journal_handle *handle, 143 int max_buffs) 144 { 145 int ret; 146 journal_t *journal = osb->journal->j_journal; 147 148 mlog_entry("(max_buffs = %d)\n", max_buffs); 149 150 if (!osb || !osb->journal->j_journal) 151 BUG(); 152 153 if (ocfs2_is_hard_readonly(osb)) { 154 ret = -EROFS; 155 goto done_free; 156 } 157 158 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 159 BUG_ON(max_buffs <= 0); 160 161 /* JBD might support this, but our journalling code doesn't yet. */ 162 if (journal_current_handle()) { 163 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 164 BUG(); 165 } 166 167 if (!handle) 168 handle = ocfs2_alloc_handle(osb); 169 if (!handle) { 170 ret = -ENOMEM; 171 mlog(ML_ERROR, "Failed to allocate memory for journal " 172 "handle!\n"); 173 goto done_free; 174 } 175 176 handle->max_buffs = max_buffs; 177 178 down_read(&osb->journal->j_trans_barrier); 179 180 /* actually start the transaction now */ 181 handle->k_handle = journal_start(journal, max_buffs); 182 if (IS_ERR(handle->k_handle)) { 183 up_read(&osb->journal->j_trans_barrier); 184 185 ret = PTR_ERR(handle->k_handle); 186 handle->k_handle = NULL; 187 mlog_errno(ret); 188 189 if (is_journal_aborted(journal)) { 190 ocfs2_abort(osb->sb, "Detected aborted journal"); 191 ret = -EROFS; 192 } 193 goto done_free; 194 } 195 196 atomic_inc(&(osb->journal->j_num_trans)); 197 handle->flags |= OCFS2_HANDLE_STARTED; 198 199 mlog_exit_ptr(handle); 200 return handle; 201 202 done_free: 203 if (handle) 204 ocfs2_commit_unstarted_handle(handle); /* will kfree handle */ 205 206 mlog_exit(ret); 207 return ERR_PTR(ret); 208 } 209 210 void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, 211 struct inode *inode) 212 { 213 BUG_ON(!handle); 214 BUG_ON(!inode); 215 216 atomic_inc(&inode->i_count); 217 218 /* we're obviously changing it... */ 219 mutex_lock(&inode->i_mutex); 220 221 /* sanity check */ 222 BUG_ON(OCFS2_I(inode)->ip_handle); 223 BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); 224 225 OCFS2_I(inode)->ip_handle = handle; 226 list_del(&(OCFS2_I(inode)->ip_handle_list)); 227 list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); 228 } 229 230 static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) 231 { 232 struct list_head *p, *n; 233 struct inode *inode; 234 struct ocfs2_inode_info *oi; 235 236 list_for_each_safe(p, n, &handle->inode_list) { 237 oi = list_entry(p, struct ocfs2_inode_info, 238 ip_handle_list); 239 inode = &oi->vfs_inode; 240 241 OCFS2_I(inode)->ip_handle = NULL; 242 list_del_init(&OCFS2_I(inode)->ip_handle_list); 243 244 mutex_unlock(&inode->i_mutex); 245 iput(inode); 246 } 247 } 248 249 /* This is trivial so we do it out of the main commit 250 * paths. Beware, it can be called from start_trans too! */ 251 static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle) 252 { 253 mlog_entry_void(); 254 255 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); 256 257 ocfs2_handle_unlock_inodes(handle); 258 /* You are allowed to add journal locks before the transaction 259 * has started. */ 260 ocfs2_handle_cleanup_locks(handle->journal, handle); 261 262 kfree(handle); 263 264 mlog_exit_void(); 265 } 266 267 void ocfs2_commit_trans(struct ocfs2_journal_handle *handle) 268 { 269 handle_t *jbd_handle; 270 int retval; 271 struct ocfs2_journal *journal = handle->journal; 272 273 mlog_entry_void(); 274 275 BUG_ON(!handle); 276 277 if (!(handle->flags & OCFS2_HANDLE_STARTED)) { 278 ocfs2_commit_unstarted_handle(handle); 279 mlog_exit_void(); 280 return; 281 } 282 283 /* release inode semaphores we took during this transaction */ 284 ocfs2_handle_unlock_inodes(handle); 285 286 /* ocfs2_extend_trans may have had to call journal_restart 287 * which will always commit the transaction, but may return 288 * error for any number of reasons. If this is the case, we 289 * clear k_handle as it's not valid any more. */ 290 if (handle->k_handle) { 291 jbd_handle = handle->k_handle; 292 293 if (handle->flags & OCFS2_HANDLE_SYNC) 294 jbd_handle->h_sync = 1; 295 else 296 jbd_handle->h_sync = 0; 297 298 /* actually stop the transaction. if we've set h_sync, 299 * it'll have been committed when we return */ 300 retval = journal_stop(jbd_handle); 301 if (retval < 0) { 302 mlog_errno(retval); 303 mlog(ML_ERROR, "Could not commit transaction\n"); 304 BUG(); 305 } 306 307 handle->k_handle = NULL; /* it's been free'd in journal_stop */ 308 } 309 310 ocfs2_handle_cleanup_locks(journal, handle); 311 312 up_read(&journal->j_trans_barrier); 313 314 kfree(handle); 315 mlog_exit_void(); 316 } 317 318 /* 319 * 'nblocks' is what you want to add to the current 320 * transaction. extend_trans will either extend the current handle by 321 * nblocks, or commit it and start a new one with nblocks credits. 322 * 323 * WARNING: This will not release any semaphores or disk locks taken 324 * during the transaction, so make sure they were taken *before* 325 * start_trans or we'll have ordering deadlocks. 326 * 327 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is 328 * good because transaction ids haven't yet been recorded on the 329 * cluster locks associated with this handle. 330 */ 331 int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, 332 int nblocks) 333 { 334 int status; 335 336 BUG_ON(!handle); 337 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); 338 BUG_ON(!nblocks); 339 340 mlog_entry_void(); 341 342 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 343 344 status = journal_extend(handle->k_handle, nblocks); 345 if (status < 0) { 346 mlog_errno(status); 347 goto bail; 348 } 349 350 if (status > 0) { 351 mlog(0, "journal_extend failed, trying journal_restart\n"); 352 status = journal_restart(handle->k_handle, nblocks); 353 if (status < 0) { 354 handle->k_handle = NULL; 355 mlog_errno(status); 356 goto bail; 357 } 358 handle->max_buffs = nblocks; 359 } else 360 handle->max_buffs += nblocks; 361 362 status = 0; 363 bail: 364 365 mlog_exit(status); 366 return status; 367 } 368 369 int ocfs2_journal_access(struct ocfs2_journal_handle *handle, 370 struct inode *inode, 371 struct buffer_head *bh, 372 int type) 373 { 374 int status; 375 376 BUG_ON(!inode); 377 BUG_ON(!handle); 378 BUG_ON(!bh); 379 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); 380 381 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", 382 (unsigned long long)bh->b_blocknr, type, 383 (type == OCFS2_JOURNAL_ACCESS_CREATE) ? 384 "OCFS2_JOURNAL_ACCESS_CREATE" : 385 "OCFS2_JOURNAL_ACCESS_WRITE", 386 bh->b_size); 387 388 /* we can safely remove this assertion after testing. */ 389 if (!buffer_uptodate(bh)) { 390 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); 391 mlog(ML_ERROR, "b_blocknr=%llu\n", 392 (unsigned long long)bh->b_blocknr); 393 BUG(); 394 } 395 396 /* Set the current transaction information on the inode so 397 * that the locking code knows whether it can drop it's locks 398 * on this inode or not. We're protected from the commit 399 * thread updating the current transaction id until 400 * ocfs2_commit_trans() because ocfs2_start_trans() took 401 * j_trans_barrier for us. */ 402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 403 404 down(&OCFS2_I(inode)->ip_io_sem); 405 switch (type) { 406 case OCFS2_JOURNAL_ACCESS_CREATE: 407 case OCFS2_JOURNAL_ACCESS_WRITE: 408 status = journal_get_write_access(handle->k_handle, bh); 409 break; 410 411 case OCFS2_JOURNAL_ACCESS_UNDO: 412 status = journal_get_undo_access(handle->k_handle, bh); 413 break; 414 415 default: 416 status = -EINVAL; 417 mlog(ML_ERROR, "Uknown access type!\n"); 418 } 419 up(&OCFS2_I(inode)->ip_io_sem); 420 421 if (status < 0) 422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 423 status, type); 424 425 mlog_exit(status); 426 return status; 427 } 428 429 int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, 430 struct buffer_head *bh) 431 { 432 int status; 433 434 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); 435 436 mlog_entry("(bh->b_blocknr=%llu)\n", 437 (unsigned long long)bh->b_blocknr); 438 439 status = journal_dirty_metadata(handle->k_handle, bh); 440 if (status < 0) 441 mlog(ML_ERROR, "Could not dirty metadata buffer. " 442 "(bh->b_blocknr=%llu)\n", 443 (unsigned long long)bh->b_blocknr); 444 445 mlog_exit(status); 446 return status; 447 } 448 449 int ocfs2_journal_dirty_data(handle_t *handle, 450 struct buffer_head *bh) 451 { 452 int err = journal_dirty_data(handle, bh); 453 if (err) 454 mlog_errno(err); 455 /* TODO: When we can handle it, abort the handle and go RO on 456 * error here. */ 457 458 return err; 459 } 460 461 /* We always assume you're adding a metadata lock at level 'ex' */ 462 int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, 463 struct inode *inode) 464 { 465 int status; 466 struct ocfs2_journal_lock *lock; 467 468 BUG_ON(!inode); 469 470 lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); 471 if (!lock) { 472 status = -ENOMEM; 473 mlog_errno(-ENOMEM); 474 goto bail; 475 } 476 477 if (!igrab(inode)) 478 BUG(); 479 lock->jl_inode = inode; 480 481 list_add_tail(&(lock->jl_lock_list), &(handle->locks)); 482 handle->num_locks++; 483 484 status = 0; 485 bail: 486 mlog_exit(status); 487 return status; 488 } 489 490 static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, 491 struct ocfs2_journal_handle *handle) 492 { 493 struct list_head *p, *n; 494 struct ocfs2_journal_lock *lock; 495 struct inode *inode; 496 497 list_for_each_safe(p, n, &(handle->locks)) { 498 lock = list_entry(p, struct ocfs2_journal_lock, 499 jl_lock_list); 500 list_del(&lock->jl_lock_list); 501 handle->num_locks--; 502 503 inode = lock->jl_inode; 504 ocfs2_meta_unlock(inode, 1); 505 if (atomic_read(&inode->i_count) == 1) 506 mlog(ML_ERROR, 507 "Inode %"MLFu64", I'm doing a last iput for!", 508 OCFS2_I(inode)->ip_blkno); 509 iput(inode); 510 kmem_cache_free(ocfs2_lock_cache, lock); 511 } 512 } 513 514 #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 515 516 void ocfs2_set_journal_params(struct ocfs2_super *osb) 517 { 518 journal_t *journal = osb->journal->j_journal; 519 520 spin_lock(&journal->j_state_lock); 521 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 522 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 523 journal->j_flags |= JFS_BARRIER; 524 else 525 journal->j_flags &= ~JFS_BARRIER; 526 spin_unlock(&journal->j_state_lock); 527 } 528 529 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 530 { 531 int status = -1; 532 struct inode *inode = NULL; /* the journal inode */ 533 journal_t *j_journal = NULL; 534 struct ocfs2_dinode *di = NULL; 535 struct buffer_head *bh = NULL; 536 struct ocfs2_super *osb; 537 int meta_lock = 0; 538 539 mlog_entry_void(); 540 541 BUG_ON(!journal); 542 543 osb = journal->j_osb; 544 545 /* already have the inode for our journal */ 546 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 547 osb->slot_num); 548 if (inode == NULL) { 549 status = -EACCES; 550 mlog_errno(status); 551 goto done; 552 } 553 if (is_bad_inode(inode)) { 554 mlog(ML_ERROR, "access error (bad inode)\n"); 555 iput(inode); 556 inode = NULL; 557 status = -EACCES; 558 goto done; 559 } 560 561 SET_INODE_JOURNAL(inode); 562 OCFS2_I(inode)->ip_open_count++; 563 564 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 565 if (status < 0) { 566 if (status != -ERESTARTSYS) 567 mlog(ML_ERROR, "Could not get lock on journal!\n"); 568 goto done; 569 } 570 571 meta_lock = 1; 572 di = (struct ocfs2_dinode *)bh->b_data; 573 574 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 575 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", 576 inode->i_size); 577 status = -EINVAL; 578 goto done; 579 } 580 581 mlog(0, "inode->i_size = %lld\n", inode->i_size); 582 mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); 583 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); 584 585 /* call the kernels journal init function now */ 586 j_journal = journal_init_inode(inode); 587 if (j_journal == NULL) { 588 mlog(ML_ERROR, "Linux journal layer error\n"); 589 status = -EINVAL; 590 goto done; 591 } 592 593 mlog(0, "Returned from journal_init_inode\n"); 594 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); 595 596 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 597 OCFS2_JOURNAL_DIRTY_FL); 598 599 journal->j_journal = j_journal; 600 journal->j_inode = inode; 601 journal->j_bh = bh; 602 603 ocfs2_set_journal_params(osb); 604 605 journal->j_state = OCFS2_JOURNAL_LOADED; 606 607 status = 0; 608 done: 609 if (status < 0) { 610 if (meta_lock) 611 ocfs2_meta_unlock(inode, 1); 612 if (bh != NULL) 613 brelse(bh); 614 if (inode) { 615 OCFS2_I(inode)->ip_open_count--; 616 iput(inode); 617 } 618 } 619 620 mlog_exit(status); 621 return status; 622 } 623 624 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 625 int dirty) 626 { 627 int status; 628 unsigned int flags; 629 struct ocfs2_journal *journal = osb->journal; 630 struct buffer_head *bh = journal->j_bh; 631 struct ocfs2_dinode *fe; 632 633 mlog_entry_void(); 634 635 fe = (struct ocfs2_dinode *)bh->b_data; 636 if (!OCFS2_IS_VALID_DINODE(fe)) { 637 /* This is called from startup/shutdown which will 638 * handle the errors in a specific manner, so no need 639 * to call ocfs2_error() here. */ 640 mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " 641 "signature: %.*s", fe->i_blkno, 7, fe->i_signature); 642 status = -EIO; 643 goto out; 644 } 645 646 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 647 if (dirty) 648 flags |= OCFS2_JOURNAL_DIRTY_FL; 649 else 650 flags &= ~OCFS2_JOURNAL_DIRTY_FL; 651 fe->id1.journal1.ij_flags = cpu_to_le32(flags); 652 653 status = ocfs2_write_block(osb, bh, journal->j_inode); 654 if (status < 0) 655 mlog_errno(status); 656 657 out: 658 mlog_exit(status); 659 return status; 660 } 661 662 /* 663 * If the journal has been kmalloc'd it needs to be freed after this 664 * call. 665 */ 666 void ocfs2_journal_shutdown(struct ocfs2_super *osb) 667 { 668 struct ocfs2_journal *journal = NULL; 669 int status = 0; 670 struct inode *inode = NULL; 671 int num_running_trans = 0; 672 673 mlog_entry_void(); 674 675 if (!osb) 676 BUG(); 677 678 journal = osb->journal; 679 if (!journal) 680 goto done; 681 682 inode = journal->j_inode; 683 684 if (journal->j_state != OCFS2_JOURNAL_LOADED) 685 goto done; 686 687 /* need to inc inode use count as journal_destroy will iput. */ 688 if (!igrab(inode)) 689 BUG(); 690 691 num_running_trans = atomic_read(&(osb->journal->j_num_trans)); 692 if (num_running_trans > 0) 693 mlog(0, "Shutting down journal: must wait on %d " 694 "running transactions!\n", 695 num_running_trans); 696 697 /* Do a commit_cache here. It will flush our journal, *and* 698 * release any locks that are still held. 699 * set the SHUTDOWN flag and release the trans lock. 700 * the commit thread will take the trans lock for us below. */ 701 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; 702 703 /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not 704 * drop the trans_lock (which we want to hold until we 705 * completely destroy the journal. */ 706 if (osb->commit_task) { 707 /* Wait for the commit thread */ 708 mlog(0, "Waiting for ocfs2commit to exit....\n"); 709 kthread_stop(osb->commit_task); 710 osb->commit_task = NULL; 711 } 712 713 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); 714 715 status = ocfs2_journal_toggle_dirty(osb, 0); 716 if (status < 0) 717 mlog_errno(status); 718 719 /* Shutdown the kernel journal system */ 720 journal_destroy(journal->j_journal); 721 722 OCFS2_I(inode)->ip_open_count--; 723 724 /* unlock our journal */ 725 ocfs2_meta_unlock(inode, 1); 726 727 brelse(journal->j_bh); 728 journal->j_bh = NULL; 729 730 journal->j_state = OCFS2_JOURNAL_FREE; 731 732 // up_write(&journal->j_trans_barrier); 733 done: 734 if (inode) 735 iput(inode); 736 mlog_exit_void(); 737 } 738 739 static void ocfs2_clear_journal_error(struct super_block *sb, 740 journal_t *journal, 741 int slot) 742 { 743 int olderr; 744 745 olderr = journal_errno(journal); 746 if (olderr) { 747 mlog(ML_ERROR, "File system error %d recorded in " 748 "journal %u.\n", olderr, slot); 749 mlog(ML_ERROR, "File system on device %s needs checking.\n", 750 sb->s_id); 751 752 journal_ack_err(journal); 753 journal_clear_err(journal); 754 } 755 } 756 757 int ocfs2_journal_load(struct ocfs2_journal *journal) 758 { 759 int status = 0; 760 struct ocfs2_super *osb; 761 762 mlog_entry_void(); 763 764 if (!journal) 765 BUG(); 766 767 osb = journal->j_osb; 768 769 status = journal_load(journal->j_journal); 770 if (status < 0) { 771 mlog(ML_ERROR, "Failed to load journal!\n"); 772 goto done; 773 } 774 775 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); 776 777 status = ocfs2_journal_toggle_dirty(osb, 1); 778 if (status < 0) { 779 mlog_errno(status); 780 goto done; 781 } 782 783 /* Launch the commit thread */ 784 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", 785 osb->osb_id); 786 if (IS_ERR(osb->commit_task)) { 787 status = PTR_ERR(osb->commit_task); 788 osb->commit_task = NULL; 789 mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", 790 status); 791 goto done; 792 } 793 794 done: 795 mlog_exit(status); 796 return status; 797 } 798 799 800 /* 'full' flag tells us whether we clear out all blocks or if we just 801 * mark the journal clean */ 802 int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) 803 { 804 int status; 805 806 mlog_entry_void(); 807 808 if (!journal) 809 BUG(); 810 811 status = journal_wipe(journal->j_journal, full); 812 if (status < 0) { 813 mlog_errno(status); 814 goto bail; 815 } 816 817 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); 818 if (status < 0) 819 mlog_errno(status); 820 821 bail: 822 mlog_exit(status); 823 return status; 824 } 825 826 /* 827 * JBD Might read a cached version of another nodes journal file. We 828 * don't want this as this file changes often and we get no 829 * notification on those changes. The only way to be sure that we've 830 * got the most up to date version of those blocks then is to force 831 * read them off disk. Just searching through the buffer cache won't 832 * work as there may be pages backing this file which are still marked 833 * up to date. We know things can't change on this file underneath us 834 * as we have the lock by now :) 835 */ 836 static int ocfs2_force_read_journal(struct inode *inode) 837 { 838 int status = 0; 839 int i, p_blocks; 840 u64 v_blkno, p_blkno; 841 #define CONCURRENT_JOURNAL_FILL 32 842 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 843 844 mlog_entry_void(); 845 846 BUG_ON(inode->i_blocks != 847 ocfs2_align_bytes_to_sectors(i_size_read(inode))); 848 849 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 850 851 mlog(0, "Force reading %lu blocks\n", 852 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); 853 854 v_blkno = 0; 855 while (v_blkno < 856 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { 857 858 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 859 1, &p_blkno, 860 &p_blocks); 861 if (status < 0) { 862 mlog_errno(status); 863 goto bail; 864 } 865 866 if (p_blocks > CONCURRENT_JOURNAL_FILL) 867 p_blocks = CONCURRENT_JOURNAL_FILL; 868 869 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), 870 p_blkno, p_blocks, bhs, 0, 871 inode); 872 if (status < 0) { 873 mlog_errno(status); 874 goto bail; 875 } 876 877 for(i = 0; i < p_blocks; i++) { 878 brelse(bhs[i]); 879 bhs[i] = NULL; 880 } 881 882 v_blkno += p_blocks; 883 } 884 885 bail: 886 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 887 if (bhs[i]) 888 brelse(bhs[i]); 889 mlog_exit(status); 890 return status; 891 } 892 893 struct ocfs2_la_recovery_item { 894 struct list_head lri_list; 895 int lri_slot; 896 struct ocfs2_dinode *lri_la_dinode; 897 struct ocfs2_dinode *lri_tl_dinode; 898 }; 899 900 /* Does the second half of the recovery process. By this point, the 901 * node is marked clean and can actually be considered recovered, 902 * hence it's no longer in the recovery map, but there's still some 903 * cleanup we can do which shouldn't happen within the recovery thread 904 * as locking in that context becomes very difficult if we are to take 905 * recovering nodes into account. 906 * 907 * NOTE: This function can and will sleep on recovery of other nodes 908 * during cluster locking, just like any other ocfs2 process. 909 */ 910 void ocfs2_complete_recovery(void *data) 911 { 912 int ret; 913 struct ocfs2_super *osb = data; 914 struct ocfs2_journal *journal = osb->journal; 915 struct ocfs2_dinode *la_dinode, *tl_dinode; 916 struct ocfs2_la_recovery_item *item; 917 struct list_head *p, *n; 918 LIST_HEAD(tmp_la_list); 919 920 mlog_entry_void(); 921 922 mlog(0, "completing recovery from keventd\n"); 923 924 spin_lock(&journal->j_lock); 925 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 926 spin_unlock(&journal->j_lock); 927 928 list_for_each_safe(p, n, &tmp_la_list) { 929 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); 930 list_del_init(&item->lri_list); 931 932 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 933 934 la_dinode = item->lri_la_dinode; 935 if (la_dinode) { 936 mlog(0, "Clean up local alloc %"MLFu64"\n", 937 la_dinode->i_blkno); 938 939 ret = ocfs2_complete_local_alloc_recovery(osb, 940 la_dinode); 941 if (ret < 0) 942 mlog_errno(ret); 943 944 kfree(la_dinode); 945 } 946 947 tl_dinode = item->lri_tl_dinode; 948 if (tl_dinode) { 949 mlog(0, "Clean up truncate log %"MLFu64"\n", 950 tl_dinode->i_blkno); 951 952 ret = ocfs2_complete_truncate_log_recovery(osb, 953 tl_dinode); 954 if (ret < 0) 955 mlog_errno(ret); 956 957 kfree(tl_dinode); 958 } 959 960 ret = ocfs2_recover_orphans(osb, item->lri_slot); 961 if (ret < 0) 962 mlog_errno(ret); 963 964 kfree(item); 965 } 966 967 mlog(0, "Recovery completion\n"); 968 mlog_exit_void(); 969 } 970 971 /* NOTE: This function always eats your references to la_dinode and 972 * tl_dinode, either manually on error, or by passing them to 973 * ocfs2_complete_recovery */ 974 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 975 int slot_num, 976 struct ocfs2_dinode *la_dinode, 977 struct ocfs2_dinode *tl_dinode) 978 { 979 struct ocfs2_la_recovery_item *item; 980 981 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); 982 if (!item) { 983 /* Though we wish to avoid it, we are in fact safe in 984 * skipping local alloc cleanup as fsck.ocfs2 is more 985 * than capable of reclaiming unused space. */ 986 if (la_dinode) 987 kfree(la_dinode); 988 989 if (tl_dinode) 990 kfree(tl_dinode); 991 992 mlog_errno(-ENOMEM); 993 return; 994 } 995 996 INIT_LIST_HEAD(&item->lri_list); 997 item->lri_la_dinode = la_dinode; 998 item->lri_slot = slot_num; 999 item->lri_tl_dinode = tl_dinode; 1000 1001 spin_lock(&journal->j_lock); 1002 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1003 queue_work(ocfs2_wq, &journal->j_recovery_work); 1004 spin_unlock(&journal->j_lock); 1005 } 1006 1007 /* Called by the mount code to queue recovery the last part of 1008 * recovery for it's own slot. */ 1009 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 1010 { 1011 struct ocfs2_journal *journal = osb->journal; 1012 1013 if (osb->dirty) { 1014 /* No need to queue up our truncate_log as regular 1015 * cleanup will catch that. */ 1016 ocfs2_queue_recovery_completion(journal, 1017 osb->slot_num, 1018 osb->local_alloc_copy, 1019 NULL); 1020 ocfs2_schedule_truncate_log_flush(osb, 0); 1021 1022 osb->local_alloc_copy = NULL; 1023 osb->dirty = 0; 1024 } 1025 } 1026 1027 static int __ocfs2_recovery_thread(void *arg) 1028 { 1029 int status, node_num; 1030 struct ocfs2_super *osb = arg; 1031 1032 mlog_entry_void(); 1033 1034 status = ocfs2_wait_on_mount(osb); 1035 if (status < 0) { 1036 goto bail; 1037 } 1038 1039 restart: 1040 status = ocfs2_super_lock(osb, 1); 1041 if (status < 0) { 1042 mlog_errno(status); 1043 goto bail; 1044 } 1045 1046 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1047 node_num = ocfs2_node_map_first_set_bit(osb, 1048 &osb->recovery_map); 1049 if (node_num == O2NM_INVALID_NODE_NUM) { 1050 mlog(0, "Out of nodes to recover.\n"); 1051 break; 1052 } 1053 1054 status = ocfs2_recover_node(osb, node_num); 1055 if (status < 0) { 1056 mlog(ML_ERROR, 1057 "Error %d recovering node %d on device (%u,%u)!\n", 1058 status, node_num, 1059 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1060 mlog(ML_ERROR, "Volume requires unmount.\n"); 1061 continue; 1062 } 1063 1064 ocfs2_recovery_map_clear(osb, node_num); 1065 } 1066 ocfs2_super_unlock(osb, 1); 1067 1068 /* We always run recovery on our own orphan dir - the dead 1069 * node(s) may have voted "no" on an inode delete earlier. A 1070 * revote is therefore required. */ 1071 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1072 NULL); 1073 1074 bail: 1075 down(&osb->recovery_lock); 1076 if (!status && 1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1078 up(&osb->recovery_lock); 1079 goto restart; 1080 } 1081 1082 osb->recovery_thread_task = NULL; 1083 mb(); /* sync with ocfs2_recovery_thread_running */ 1084 wake_up(&osb->recovery_event); 1085 1086 up(&osb->recovery_lock); 1087 1088 mlog_exit(status); 1089 /* no one is callint kthread_stop() for us so the kthread() api 1090 * requires that we call do_exit(). And it isn't exported, but 1091 * complete_and_exit() seems to be a minimal wrapper around it. */ 1092 complete_and_exit(NULL, status); 1093 return status; 1094 } 1095 1096 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 1097 { 1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1099 node_num, osb->node_num); 1100 1101 down(&osb->recovery_lock); 1102 if (osb->disable_recovery) 1103 goto out; 1104 1105 /* People waiting on recovery will wait on 1106 * the recovery map to empty. */ 1107 if (!ocfs2_recovery_map_set(osb, node_num)) 1108 mlog(0, "node %d already be in recovery.\n", node_num); 1109 1110 mlog(0, "starting recovery thread...\n"); 1111 1112 if (osb->recovery_thread_task) 1113 goto out; 1114 1115 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, 1116 "ocfs2rec-%d", osb->osb_id); 1117 if (IS_ERR(osb->recovery_thread_task)) { 1118 mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); 1119 osb->recovery_thread_task = NULL; 1120 } 1121 1122 out: 1123 up(&osb->recovery_lock); 1124 wake_up(&osb->recovery_event); 1125 1126 mlog_exit_void(); 1127 } 1128 1129 /* Does the actual journal replay and marks the journal inode as 1130 * clean. Will only replay if the journal inode is marked dirty. */ 1131 static int ocfs2_replay_journal(struct ocfs2_super *osb, 1132 int node_num, 1133 int slot_num) 1134 { 1135 int status; 1136 int got_lock = 0; 1137 unsigned int flags; 1138 struct inode *inode = NULL; 1139 struct ocfs2_dinode *fe; 1140 journal_t *journal = NULL; 1141 struct buffer_head *bh = NULL; 1142 1143 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 1144 slot_num); 1145 if (inode == NULL) { 1146 status = -EACCES; 1147 mlog_errno(status); 1148 goto done; 1149 } 1150 if (is_bad_inode(inode)) { 1151 status = -EACCES; 1152 iput(inode); 1153 inode = NULL; 1154 mlog_errno(status); 1155 goto done; 1156 } 1157 SET_INODE_JOURNAL(inode); 1158 1159 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, 1160 OCFS2_META_LOCK_RECOVERY); 1161 if (status < 0) { 1162 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 1163 if (status != -ERESTARTSYS) 1164 mlog(ML_ERROR, "Could not lock journal!\n"); 1165 goto done; 1166 } 1167 got_lock = 1; 1168 1169 fe = (struct ocfs2_dinode *) bh->b_data; 1170 1171 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 1172 1173 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { 1174 mlog(0, "No recovery required for node %d\n", node_num); 1175 goto done; 1176 } 1177 1178 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1179 node_num, slot_num, 1180 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1181 1182 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1183 1184 status = ocfs2_force_read_journal(inode); 1185 if (status < 0) { 1186 mlog_errno(status); 1187 goto done; 1188 } 1189 1190 mlog(0, "calling journal_init_inode\n"); 1191 journal = journal_init_inode(inode); 1192 if (journal == NULL) { 1193 mlog(ML_ERROR, "Linux journal layer error\n"); 1194 status = -EIO; 1195 goto done; 1196 } 1197 1198 status = journal_load(journal); 1199 if (status < 0) { 1200 mlog_errno(status); 1201 if (!igrab(inode)) 1202 BUG(); 1203 journal_destroy(journal); 1204 goto done; 1205 } 1206 1207 ocfs2_clear_journal_error(osb->sb, journal, slot_num); 1208 1209 /* wipe the journal */ 1210 mlog(0, "flushing the journal.\n"); 1211 journal_lock_updates(journal); 1212 status = journal_flush(journal); 1213 journal_unlock_updates(journal); 1214 if (status < 0) 1215 mlog_errno(status); 1216 1217 /* This will mark the node clean */ 1218 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 1219 flags &= ~OCFS2_JOURNAL_DIRTY_FL; 1220 fe->id1.journal1.ij_flags = cpu_to_le32(flags); 1221 1222 status = ocfs2_write_block(osb, bh, inode); 1223 if (status < 0) 1224 mlog_errno(status); 1225 1226 if (!igrab(inode)) 1227 BUG(); 1228 1229 journal_destroy(journal); 1230 1231 done: 1232 /* drop the lock on this nodes journal */ 1233 if (got_lock) 1234 ocfs2_meta_unlock(inode, 1); 1235 1236 if (inode) 1237 iput(inode); 1238 1239 if (bh) 1240 brelse(bh); 1241 1242 mlog_exit(status); 1243 return status; 1244 } 1245 1246 /* 1247 * Do the most important parts of node recovery: 1248 * - Replay it's journal 1249 * - Stamp a clean local allocator file 1250 * - Stamp a clean truncate log 1251 * - Mark the node clean 1252 * 1253 * If this function completes without error, a node in OCFS2 can be 1254 * said to have been safely recovered. As a result, failure during the 1255 * second part of a nodes recovery process (local alloc recovery) is 1256 * far less concerning. 1257 */ 1258 static int ocfs2_recover_node(struct ocfs2_super *osb, 1259 int node_num) 1260 { 1261 int status = 0; 1262 int slot_num; 1263 struct ocfs2_slot_info *si = osb->slot_info; 1264 struct ocfs2_dinode *la_copy = NULL; 1265 struct ocfs2_dinode *tl_copy = NULL; 1266 1267 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1268 node_num, osb->node_num); 1269 1270 mlog(0, "checking node %d\n", node_num); 1271 1272 /* Should not ever be called to recover ourselves -- in that 1273 * case we should've called ocfs2_journal_load instead. */ 1274 if (osb->node_num == node_num) 1275 BUG(); 1276 1277 slot_num = ocfs2_node_num_to_slot(si, node_num); 1278 if (slot_num == OCFS2_INVALID_SLOT) { 1279 status = 0; 1280 mlog(0, "no slot for this node, so no recovery required.\n"); 1281 goto done; 1282 } 1283 1284 mlog(0, "node %d was using slot %d\n", node_num, slot_num); 1285 1286 status = ocfs2_replay_journal(osb, node_num, slot_num); 1287 if (status < 0) { 1288 mlog_errno(status); 1289 goto done; 1290 } 1291 1292 /* Stamp a clean local alloc file AFTER recovering the journal... */ 1293 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); 1294 if (status < 0) { 1295 mlog_errno(status); 1296 goto done; 1297 } 1298 1299 /* An error from begin_truncate_log_recovery is not 1300 * serious enough to warrant halting the rest of 1301 * recovery. */ 1302 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); 1303 if (status < 0) 1304 mlog_errno(status); 1305 1306 /* Likewise, this would be a strange but ultimately not so 1307 * harmful place to get an error... */ 1308 ocfs2_clear_slot(si, slot_num); 1309 status = ocfs2_update_disk_slots(osb, si); 1310 if (status < 0) 1311 mlog_errno(status); 1312 1313 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1314 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1315 tl_copy); 1316 1317 status = 0; 1318 done: 1319 1320 mlog_exit(status); 1321 return status; 1322 } 1323 1324 /* Test node liveness by trylocking his journal. If we get the lock, 1325 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is 1326 * still alive (we couldn't get the lock) and < 0 on error. */ 1327 static int ocfs2_trylock_journal(struct ocfs2_super *osb, 1328 int slot_num) 1329 { 1330 int status, flags; 1331 struct inode *inode = NULL; 1332 1333 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 1334 slot_num); 1335 if (inode == NULL) { 1336 mlog(ML_ERROR, "access error\n"); 1337 status = -EACCES; 1338 goto bail; 1339 } 1340 if (is_bad_inode(inode)) { 1341 mlog(ML_ERROR, "access error (bad inode)\n"); 1342 iput(inode); 1343 inode = NULL; 1344 status = -EACCES; 1345 goto bail; 1346 } 1347 SET_INODE_JOURNAL(inode); 1348 1349 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1350 status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); 1351 if (status < 0) { 1352 if (status != -EAGAIN) 1353 mlog_errno(status); 1354 goto bail; 1355 } 1356 1357 ocfs2_meta_unlock(inode, 1); 1358 bail: 1359 if (inode) 1360 iput(inode); 1361 1362 return status; 1363 } 1364 1365 /* Call this underneath ocfs2_super_lock. It also assumes that the 1366 * slot info struct has been updated from disk. */ 1367 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) 1368 { 1369 int status, i, node_num; 1370 struct ocfs2_slot_info *si = osb->slot_info; 1371 1372 /* This is called with the super block cluster lock, so we 1373 * know that the slot map can't change underneath us. */ 1374 1375 spin_lock(&si->si_lock); 1376 for(i = 0; i < si->si_num_slots; i++) { 1377 if (i == osb->slot_num) 1378 continue; 1379 if (ocfs2_is_empty_slot(si, i)) 1380 continue; 1381 1382 node_num = si->si_global_node_nums[i]; 1383 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) 1384 continue; 1385 spin_unlock(&si->si_lock); 1386 1387 /* Ok, we have a slot occupied by another node which 1388 * is not in the recovery map. We trylock his journal 1389 * file here to test if he's alive. */ 1390 status = ocfs2_trylock_journal(osb, i); 1391 if (!status) { 1392 /* Since we're called from mount, we know that 1393 * the recovery thread can't race us on 1394 * setting / checking the recovery bits. */ 1395 ocfs2_recovery_thread(osb, node_num); 1396 } else if ((status < 0) && (status != -EAGAIN)) { 1397 mlog_errno(status); 1398 goto bail; 1399 } 1400 1401 spin_lock(&si->si_lock); 1402 } 1403 spin_unlock(&si->si_lock); 1404 1405 status = 0; 1406 bail: 1407 mlog_exit(status); 1408 return status; 1409 } 1410 1411 static int ocfs2_recover_orphans(struct ocfs2_super *osb, 1412 int slot) 1413 { 1414 int status = 0; 1415 int have_disk_lock = 0; 1416 struct inode *inode = NULL; 1417 struct inode *iter; 1418 struct inode *orphan_dir_inode = NULL; 1419 unsigned long offset, blk, local; 1420 struct buffer_head *bh = NULL; 1421 struct ocfs2_dir_entry *de; 1422 struct super_block *sb = osb->sb; 1423 struct ocfs2_inode_info *oi; 1424 1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); 1426 1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1428 ORPHAN_DIR_SYSTEM_INODE, 1429 slot); 1430 if (!orphan_dir_inode) { 1431 status = -ENOENT; 1432 mlog_errno(status); 1433 goto out; 1434 } 1435 1436 mutex_lock(&orphan_dir_inode->i_mutex); 1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); 1438 if (status < 0) { 1439 mutex_unlock(&orphan_dir_inode->i_mutex); 1440 mlog_errno(status); 1441 goto out; 1442 } 1443 have_disk_lock = 1; 1444 1445 offset = 0; 1446 iter = NULL; 1447 while(offset < i_size_read(orphan_dir_inode)) { 1448 blk = offset >> sb->s_blocksize_bits; 1449 1450 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); 1451 if (!bh) 1452 status = -EINVAL; 1453 if (status < 0) { 1454 mutex_unlock(&orphan_dir_inode->i_mutex); 1455 if (bh) 1456 brelse(bh); 1457 mlog_errno(status); 1458 goto out; 1459 } 1460 1461 local = 0; 1462 while(offset < i_size_read(orphan_dir_inode) 1463 && local < sb->s_blocksize) { 1464 de = (struct ocfs2_dir_entry *) (bh->b_data + local); 1465 1466 if (!ocfs2_check_dir_entry(orphan_dir_inode, 1467 de, bh, local)) { 1468 mutex_unlock(&orphan_dir_inode->i_mutex); 1469 status = -EINVAL; 1470 mlog_errno(status); 1471 brelse(bh); 1472 goto out; 1473 } 1474 1475 local += le16_to_cpu(de->rec_len); 1476 offset += le16_to_cpu(de->rec_len); 1477 1478 /* I guess we silently fail on no inode? */ 1479 if (!le64_to_cpu(de->inode)) 1480 continue; 1481 if (de->file_type > OCFS2_FT_MAX) { 1482 mlog(ML_ERROR, 1483 "block %llu contains invalid de: " 1484 "inode = %"MLFu64", rec_len = %u, " 1485 "name_len = %u, file_type = %u, " 1486 "name='%.*s'\n", 1487 (unsigned long long)bh->b_blocknr, 1488 le64_to_cpu(de->inode), 1489 le16_to_cpu(de->rec_len), 1490 de->name_len, 1491 de->file_type, 1492 de->name_len, 1493 de->name); 1494 continue; 1495 } 1496 if (de->name_len == 1 && !strncmp(".", de->name, 1)) 1497 continue; 1498 if (de->name_len == 2 && !strncmp("..", de->name, 2)) 1499 continue; 1500 1501 iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); 1502 if (IS_ERR(iter)) 1503 continue; 1504 1505 mlog(0, "queue orphan %"MLFu64"\n", 1506 OCFS2_I(iter)->ip_blkno); 1507 OCFS2_I(iter)->ip_next_orphan = inode; 1508 inode = iter; 1509 } 1510 brelse(bh); 1511 } 1512 mutex_unlock(&orphan_dir_inode->i_mutex); 1513 1514 ocfs2_meta_unlock(orphan_dir_inode, 0); 1515 have_disk_lock = 0; 1516 1517 iput(orphan_dir_inode); 1518 orphan_dir_inode = NULL; 1519 1520 while (inode) { 1521 oi = OCFS2_I(inode); 1522 mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); 1523 1524 iter = oi->ip_next_orphan; 1525 1526 spin_lock(&oi->ip_lock); 1527 /* Delete voting may have set these on the assumption 1528 * that the other node would wipe them successfully. 1529 * If they are still in the node's orphan dir, we need 1530 * to reset that state. */ 1531 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1532 1533 /* Set the proper information to get us going into 1534 * ocfs2_delete_inode. */ 1535 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1536 oi->ip_orphaned_slot = slot; 1537 spin_unlock(&oi->ip_lock); 1538 1539 iput(inode); 1540 1541 inode = iter; 1542 } 1543 1544 out: 1545 if (have_disk_lock) 1546 ocfs2_meta_unlock(orphan_dir_inode, 0); 1547 1548 if (orphan_dir_inode) 1549 iput(orphan_dir_inode); 1550 1551 return status; 1552 } 1553 1554 static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1555 { 1556 /* This check is good because ocfs2 will wait on our recovery 1557 * thread before changing it to something other than MOUNTED 1558 * or DISABLED. */ 1559 wait_event(osb->osb_mount_event, 1560 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1561 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1562 1563 /* If there's an error on mount, then we may never get to the 1564 * MOUNTED flag, but this is set right before 1565 * dismount_volume() so we can trust it. */ 1566 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { 1567 mlog(0, "mount error, exiting!\n"); 1568 return -EBUSY; 1569 } 1570 1571 return 0; 1572 } 1573 1574 static int ocfs2_commit_thread(void *arg) 1575 { 1576 int status; 1577 struct ocfs2_super *osb = arg; 1578 struct ocfs2_journal *journal = osb->journal; 1579 1580 /* we can trust j_num_trans here because _should_stop() is only set in 1581 * shutdown and nobody other than ourselves should be able to start 1582 * transactions. committing on shutdown might take a few iterations 1583 * as final transactions put deleted inodes on the list */ 1584 while (!(kthread_should_stop() && 1585 atomic_read(&journal->j_num_trans) == 0)) { 1586 1587 wait_event_interruptible_timeout(osb->checkpoint_event, 1588 atomic_read(&journal->j_num_trans) 1589 || kthread_should_stop(), 1590 OCFS2_CHECKPOINT_INTERVAL); 1591 1592 status = ocfs2_commit_cache(osb); 1593 if (status < 0) 1594 mlog_errno(status); 1595 1596 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ 1597 mlog(ML_KTHREAD, 1598 "commit_thread: %u transactions pending on " 1599 "shutdown\n", 1600 atomic_read(&journal->j_num_trans)); 1601 } 1602 } 1603 1604 return 0; 1605 } 1606 1607 /* Look for a dirty journal without taking any cluster locks. Used for 1608 * hard readonly access to determine whether the file system journals 1609 * require recovery. */ 1610 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) 1611 { 1612 int ret = 0; 1613 unsigned int slot; 1614 struct buffer_head *di_bh; 1615 struct ocfs2_dinode *di; 1616 struct inode *journal = NULL; 1617 1618 for(slot = 0; slot < osb->max_slots; slot++) { 1619 journal = ocfs2_get_system_file_inode(osb, 1620 JOURNAL_SYSTEM_INODE, 1621 slot); 1622 if (!journal || is_bad_inode(journal)) { 1623 ret = -EACCES; 1624 mlog_errno(ret); 1625 goto out; 1626 } 1627 1628 di_bh = NULL; 1629 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, 1630 0, journal); 1631 if (ret < 0) { 1632 mlog_errno(ret); 1633 goto out; 1634 } 1635 1636 di = (struct ocfs2_dinode *) di_bh->b_data; 1637 1638 if (le32_to_cpu(di->id1.journal1.ij_flags) & 1639 OCFS2_JOURNAL_DIRTY_FL) 1640 ret = -EROFS; 1641 1642 brelse(di_bh); 1643 if (ret) 1644 break; 1645 } 1646 1647 out: 1648 if (journal) 1649 iput(journal); 1650 1651 return ret; 1652 } 1653