/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * super.c
 *
 * load/unload driver, mount/dismount volumes
 *
 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/statfs.h>
#include <linux/moduleparam.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/parser.h>
#include <linux/crc32.h>
#include <linux/debugfs.h>

#include <cluster/nodemanager.h>

#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>

#include "ocfs2.h"

/* this should be the only file to include a version 1 header */
#include "ocfs1_fs_compat.h"

#include "alloc.h"
#include "dlmglue.h"
#include "export.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "namei.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
#include "vote.h"

#include "buffer_head_io.h"

/* Slab cache that ocfs2_alloc_inode()/ocfs2_destroy_inode() allocate
 * struct ocfs2_inode_info objects from. Created at module load. */
static struct kmem_cache *ocfs2_inode_cachep = NULL;

/* OCFS2 needs to schedule several different types of work which
 * require cluster locking, disk I/O, recovery waits, etc. Since these
 * types of work tend to be heavy we avoid using the kernel events
 * workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;

/* Top-level "ocfs2" debugfs directory; each mount creates its own
 * directory (named after the volume uuid string) underneath it. */
static struct dentry *ocfs2_debugfs_root = NULL;

MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");

/* Forward declarations for the file-local helpers defined below. */
static int ocfs2_parse_options(struct super_block *sb, char *options,
			       unsigned long *mount_opt, int is_remount);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
static int ocfs2_initialize_mem_caches(void);
static void ocfs2_free_mem_caches(void);
static void ocfs2_delete_osb(struct ocfs2_super *osb);

static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);

static int ocfs2_sync_fs(struct super_block *sb, int wait);

static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
static int ocfs2_check_volume(struct ocfs2_super *osb);
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
			       struct buffer_head *bh,
			       u32 sectsize);
static int ocfs2_initialize_super(struct super_block *sb,
				  struct buffer_head *bh,
				  int sector_size);
static int ocfs2_get_sector(struct super_block *sb,
			    struct buffer_head **bh,
			    int block,
			    int sect_size);
static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);

static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);

/* super_operations table installed as sb->s_op by
 * ocfs2_initialize_super(). */
static const struct super_operations ocfs2_sops = {
	.statfs		= ocfs2_statfs,
	.alloc_inode	= ocfs2_alloc_inode,
	.destroy_inode	= ocfs2_destroy_inode,
	.drop_inode	= ocfs2_drop_inode,
	.clear_inode	= ocfs2_clear_inode,
	.delete_inode	= ocfs2_delete_inode,
	.sync_fs	= ocfs2_sync_fs,
	.write_super	= ocfs2_write_super,
	.put_super	= ocfs2_put_super,
	.remount_fs	= ocfs2_remount,
};

/* Token ids for the mount options recognised by ocfs2_parse_options().
 * Opt_err marks the end of the tokens[] table. */
enum {
	Opt_barrier,
	Opt_err_panic,
	Opt_err_ro,
	Opt_intr,
	Opt_nointr,
	Opt_hb_none,
	Opt_hb_local,
	Opt_data_ordered,
	Opt_data_writeback,
	Opt_atime_quantum,
	Opt_err,
};

/* Pattern table handed to match_token(); "%u" patterns carry one
 * numeric argument. The {Opt_err, NULL} entry ends the table. */
static match_table_t tokens = {
	{Opt_barrier, "barrier=%u"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_intr, "intr"},
	{Opt_nointr, "nointr"},
	{Opt_hb_none, OCFS2_HB_NONE},
	{Opt_hb_local, OCFS2_HB_LOCAL},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_atime_quantum, "atime_quantum=%u"},
	{Opt_err, NULL}
};

/*
 * write_super and sync_fs ripped right out of ext3.
162 */ 163 static void ocfs2_write_super(struct super_block *sb) 164 { 165 if (mutex_trylock(&sb->s_lock) != 0) 166 BUG(); 167 sb->s_dirt = 0; 168 } 169 170 static int ocfs2_sync_fs(struct super_block *sb, int wait) 171 { 172 int status = 0; 173 tid_t target; 174 struct ocfs2_super *osb = OCFS2_SB(sb); 175 176 sb->s_dirt = 0; 177 178 if (ocfs2_is_hard_readonly(osb)) 179 return -EROFS; 180 181 if (wait) { 182 status = ocfs2_flush_truncate_log(osb); 183 if (status < 0) 184 mlog_errno(status); 185 } else { 186 ocfs2_schedule_truncate_log_flush(osb, 0); 187 } 188 189 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { 190 if (wait) 191 log_wait_commit(OCFS2_SB(sb)->journal->j_journal, 192 target); 193 } 194 return 0; 195 } 196 197 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) 198 { 199 struct inode *new = NULL; 200 int status = 0; 201 int i; 202 203 mlog_entry_void(); 204 205 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 206 if (IS_ERR(new)) { 207 status = PTR_ERR(new); 208 mlog_errno(status); 209 goto bail; 210 } 211 osb->root_inode = new; 212 213 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 214 if (IS_ERR(new)) { 215 status = PTR_ERR(new); 216 mlog_errno(status); 217 goto bail; 218 } 219 osb->sys_root_inode = new; 220 221 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; 222 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { 223 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 224 if (!new) { 225 ocfs2_release_system_inodes(osb); 226 status = -EINVAL; 227 mlog_errno(status); 228 /* FIXME: Should ERROR_RO_FS */ 229 mlog(ML_ERROR, "Unable to load system inode %d, " 230 "possibly corrupt fs?", i); 231 goto bail; 232 } 233 // the array now has one ref, so drop this one 234 iput(new); 235 } 236 237 bail: 238 mlog_exit(status); 239 return status; 240 } 241 242 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) 243 { 244 struct inode *new = NULL; 245 int status = 0; 246 int i; 247 
248 mlog_entry_void(); 249 250 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 251 i < NUM_SYSTEM_INODES; 252 i++) { 253 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 254 if (!new) { 255 ocfs2_release_system_inodes(osb); 256 status = -EINVAL; 257 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", 258 status, i, osb->slot_num); 259 goto bail; 260 } 261 /* the array now has one ref, so drop this one */ 262 iput(new); 263 } 264 265 bail: 266 mlog_exit(status); 267 return status; 268 } 269 270 static int ocfs2_release_system_inodes(struct ocfs2_super *osb) 271 { 272 int status = 0, i; 273 struct inode *inode; 274 275 mlog_entry_void(); 276 277 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 278 inode = osb->system_inodes[i]; 279 if (inode) { 280 iput(inode); 281 osb->system_inodes[i] = NULL; 282 } 283 } 284 285 inode = osb->sys_root_inode; 286 if (inode) { 287 iput(inode); 288 osb->sys_root_inode = NULL; 289 } 290 291 inode = osb->root_inode; 292 if (inode) { 293 iput(inode); 294 osb->root_inode = NULL; 295 } 296 297 mlog_exit(status); 298 return status; 299 } 300 301 /* We're allocating fs objects, use GFP_NOFS */ 302 static struct inode *ocfs2_alloc_inode(struct super_block *sb) 303 { 304 struct ocfs2_inode_info *oi; 305 306 oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS); 307 if (!oi) 308 return NULL; 309 310 return &oi->vfs_inode; 311 } 312 313 static void ocfs2_destroy_inode(struct inode *inode) 314 { 315 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 316 } 317 318 /* From xfs_super.c:xfs_max_file_offset 319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 */ 321 static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322 { 323 unsigned int pagefactor = 1; 324 unsigned int bitshift = BITS_PER_LONG - 1; 325 326 /* Figure out maximum filesize, on Linux this can depend on 327 * the filesystem blocksize (on 32 bit platforms). 328 * __block_prepare_write does this in an [unsigned] long... 
329 * page->index << (PAGE_CACHE_SHIFT - bbits) 330 * So, for page sized blocks (4K on 32 bit platforms), 331 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 332 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 333 * but for smaller blocksizes it is less (bbits = log2 bsize). 334 * Note1: get_block_t takes a long (implicit cast from above) 335 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch 336 * can optionally convert the [unsigned] long from above into 337 * an [unsigned] long long. 338 */ 339 340 #if BITS_PER_LONG == 32 341 # if defined(CONFIG_LBD) 342 BUILD_BUG_ON(sizeof(sector_t) != 8); 343 pagefactor = PAGE_CACHE_SIZE; 344 bitshift = BITS_PER_LONG; 345 # else 346 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); 347 # endif 348 #endif 349 350 return (((unsigned long long)pagefactor) << bitshift) - 1; 351 } 352 353 static int ocfs2_remount(struct super_block *sb, int *flags, char *data) 354 { 355 int incompat_features; 356 int ret = 0; 357 unsigned long parsed_options; 358 struct ocfs2_super *osb = OCFS2_SB(sb); 359 360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 361 ret = -EINVAL; 362 goto out; 363 } 364 365 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 366 (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { 367 ret = -EINVAL; 368 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 369 goto out; 370 } 371 372 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != 373 (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { 374 ret = -EINVAL; 375 mlog(ML_ERROR, "Cannot change data mode on remount\n"); 376 goto out; 377 } 378 379 /* We're going to/from readonly mode. */ 380 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 381 /* Lock here so the check of HARD_RO and the potential 382 * setting of SOFT_RO is atomic. 
*/ 383 spin_lock(&osb->osb_lock); 384 if (osb->osb_flags & OCFS2_OSB_HARD_RO) { 385 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); 386 ret = -EROFS; 387 goto unlock_osb; 388 } 389 390 if (*flags & MS_RDONLY) { 391 mlog(0, "Going to ro mode.\n"); 392 sb->s_flags |= MS_RDONLY; 393 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 394 } else { 395 mlog(0, "Making ro filesystem writeable.\n"); 396 397 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { 398 mlog(ML_ERROR, "Cannot remount RDWR " 399 "filesystem due to previous errors.\n"); 400 ret = -EROFS; 401 goto unlock_osb; 402 } 403 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); 404 if (incompat_features) { 405 mlog(ML_ERROR, "Cannot remount RDWR because " 406 "of unsupported optional features " 407 "(%x).\n", incompat_features); 408 ret = -EINVAL; 409 goto unlock_osb; 410 } 411 sb->s_flags &= ~MS_RDONLY; 412 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; 413 } 414 unlock_osb: 415 spin_unlock(&osb->osb_lock); 416 } 417 418 if (!ret) { 419 if (!ocfs2_is_hard_readonly(osb)) 420 ocfs2_set_journal_params(osb); 421 422 /* Only save off the new mount options in case of a successful 423 * remount. */ 424 osb->s_mount_opt = parsed_options; 425 } 426 out: 427 return ret; 428 } 429 430 static int ocfs2_sb_probe(struct super_block *sb, 431 struct buffer_head **bh, 432 int *sector_size) 433 { 434 int status = 0, tmpstat; 435 struct ocfs1_vol_disk_hdr *hdr; 436 struct ocfs2_dinode *di; 437 int blksize; 438 439 *bh = NULL; 440 441 /* may be > 512 */ 442 *sector_size = bdev_hardsect_size(sb->s_bdev); 443 if (*sector_size > OCFS2_MAX_BLOCKSIZE) { 444 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", 445 *sector_size, OCFS2_MAX_BLOCKSIZE); 446 status = -EINVAL; 447 goto bail; 448 } 449 450 /* Can this really happen? 
*/ 451 if (*sector_size < OCFS2_MIN_BLOCKSIZE) 452 *sector_size = OCFS2_MIN_BLOCKSIZE; 453 454 /* check block zero for old format */ 455 status = ocfs2_get_sector(sb, bh, 0, *sector_size); 456 if (status < 0) { 457 mlog_errno(status); 458 goto bail; 459 } 460 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; 461 if (hdr->major_version == OCFS1_MAJOR_VERSION) { 462 mlog(ML_ERROR, "incompatible version: %u.%u\n", 463 hdr->major_version, hdr->minor_version); 464 status = -EINVAL; 465 } 466 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, 467 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { 468 mlog(ML_ERROR, "incompatible volume signature: %8s\n", 469 hdr->signature); 470 status = -EINVAL; 471 } 472 brelse(*bh); 473 *bh = NULL; 474 if (status < 0) { 475 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " 476 "upgraded before mounting with ocfs v2\n"); 477 goto bail; 478 } 479 480 /* 481 * Now check at magic offset for 512, 1024, 2048, 4096 482 * blocksizes. 4096 is the maximum blocksize because it is 483 * the minimum clustersize. 
484 */ 485 status = -EINVAL; 486 for (blksize = *sector_size; 487 blksize <= OCFS2_MAX_BLOCKSIZE; 488 blksize <<= 1) { 489 tmpstat = ocfs2_get_sector(sb, bh, 490 OCFS2_SUPER_BLOCK_BLKNO, 491 blksize); 492 if (tmpstat < 0) { 493 status = tmpstat; 494 mlog_errno(status); 495 goto bail; 496 } 497 di = (struct ocfs2_dinode *) (*bh)->b_data; 498 status = ocfs2_verify_volume(di, *bh, blksize); 499 if (status >= 0) 500 goto bail; 501 brelse(*bh); 502 *bh = NULL; 503 if (status != -EAGAIN) 504 break; 505 } 506 507 bail: 508 return status; 509 } 510 511 static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 512 { 513 if (ocfs2_mount_local(osb)) { 514 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 515 mlog(ML_ERROR, "Cannot heartbeat on a locally " 516 "mounted device.\n"); 517 return -EINVAL; 518 } 519 } 520 521 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 522 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { 523 mlog(ML_ERROR, "Heartbeat has to be started to mount " 524 "a read-write clustered device.\n"); 525 return -EINVAL; 526 } 527 } 528 529 return 0; 530 } 531 532 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 533 { 534 struct dentry *root; 535 int status, sector_size; 536 unsigned long parsed_opt; 537 struct inode *inode = NULL; 538 struct ocfs2_super *osb = NULL; 539 struct buffer_head *bh = NULL; 540 char nodestr[8]; 541 542 mlog_entry("%p, %p, %i", sb, data, silent); 543 544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 545 status = -EINVAL; 546 goto read_super_error; 547 } 548 549 /* for now we only have one cluster/node, make sure we see it 550 * in the heartbeat universe */ 551 if (parsed_opt & OCFS2_MOUNT_HB_LOCAL) { 552 if (!o2hb_check_local_node_heartbeating()) { 553 status = -EINVAL; 554 goto read_super_error; 555 } 556 } 557 558 /* probe for superblock */ 559 status = ocfs2_sb_probe(sb, &bh, §or_size); 560 if (status < 0) { 561 mlog(ML_ERROR, "superblock probe failed!\n"); 562 goto 
read_super_error; 563 } 564 565 status = ocfs2_initialize_super(sb, bh, sector_size); 566 osb = OCFS2_SB(sb); 567 if (status < 0) { 568 mlog_errno(status); 569 goto read_super_error; 570 } 571 brelse(bh); 572 bh = NULL; 573 osb->s_mount_opt = parsed_opt; 574 575 sb->s_magic = OCFS2_SUPER_MAGIC; 576 577 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 578 * heartbeat=none */ 579 if (bdev_read_only(sb->s_bdev)) { 580 if (!(sb->s_flags & MS_RDONLY)) { 581 status = -EACCES; 582 mlog(ML_ERROR, "Readonly device detected but readonly " 583 "mount was not specified.\n"); 584 goto read_super_error; 585 } 586 587 /* You should not be able to start a local heartbeat 588 * on a readonly device. */ 589 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 590 status = -EROFS; 591 mlog(ML_ERROR, "Local heartbeat specified on readonly " 592 "device.\n"); 593 goto read_super_error; 594 } 595 596 status = ocfs2_check_journals_nolocks(osb); 597 if (status < 0) { 598 if (status == -EROFS) 599 mlog(ML_ERROR, "Recovery required on readonly " 600 "file system, but write access is " 601 "unavailable.\n"); 602 else 603 mlog_errno(status); 604 goto read_super_error; 605 } 606 607 ocfs2_set_ro_flag(osb, 1); 608 609 printk(KERN_NOTICE "Readonly device detected. No cluster " 610 "services will be utilized for this mount. 
Recovery " 611 "will be skipped.\n"); 612 } 613 614 if (!ocfs2_is_hard_readonly(osb)) { 615 if (sb->s_flags & MS_RDONLY) 616 ocfs2_set_ro_flag(osb, 0); 617 } 618 619 status = ocfs2_verify_heartbeat(osb); 620 if (status < 0) { 621 mlog_errno(status); 622 goto read_super_error; 623 } 624 625 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 626 ocfs2_debugfs_root); 627 if (!osb->osb_debug_root) { 628 status = -EINVAL; 629 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 630 goto read_super_error; 631 } 632 633 status = ocfs2_mount_volume(sb); 634 if (osb->root_inode) 635 inode = igrab(osb->root_inode); 636 637 if (status < 0) 638 goto read_super_error; 639 640 if (!inode) { 641 status = -EIO; 642 mlog_errno(status); 643 goto read_super_error; 644 } 645 646 root = d_alloc_root(inode); 647 if (!root) { 648 status = -ENOMEM; 649 mlog_errno(status); 650 goto read_super_error; 651 } 652 653 sb->s_root = root; 654 655 ocfs2_complete_mount_recovery(osb); 656 657 if (ocfs2_mount_local(osb)) 658 snprintf(nodestr, sizeof(nodestr), "local"); 659 else 660 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 661 662 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 663 "with %s data mode.\n", 664 osb->dev_str, nodestr, osb->slot_num, 665 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : 666 "ordered"); 667 668 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 669 wake_up(&osb->osb_mount_event); 670 671 mlog_exit(status); 672 return status; 673 674 read_super_error: 675 if (bh != NULL) 676 brelse(bh); 677 678 if (inode) 679 iput(inode); 680 681 if (osb) { 682 atomic_set(&osb->vol_state, VOLUME_DISABLED); 683 wake_up(&osb->osb_mount_event); 684 ocfs2_dismount_volume(sb, 1); 685 } 686 687 mlog_exit(status); 688 return status; 689 } 690 691 static int ocfs2_get_sb(struct file_system_type *fs_type, 692 int flags, 693 const char *dev_name, 694 void *data, 695 struct vfsmount *mnt) 696 { 697 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, 698 mnt); 699 } 700 701 static struct file_system_type ocfs2_fs_type = { 702 .owner = THIS_MODULE, 703 .name = "ocfs2", 704 .get_sb = ocfs2_get_sb, /* is this called when we mount 705 * the fs? */ 706 .kill_sb = kill_block_super, /* set to the generic one 707 * right now, but do we 708 * need to change that? */ 709 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 710 .next = NULL 711 }; 712 713 static int ocfs2_parse_options(struct super_block *sb, 714 char *options, 715 unsigned long *mount_opt, 716 int is_remount) 717 { 718 int status; 719 char *p; 720 721 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 722 options ? 
options : "(none)"); 723 724 *mount_opt = 0; 725 726 if (!options) { 727 status = 1; 728 goto bail; 729 } 730 731 while ((p = strsep(&options, ",")) != NULL) { 732 int token, option; 733 substring_t args[MAX_OPT_ARGS]; 734 struct ocfs2_super * osb = OCFS2_SB(sb); 735 736 if (!*p) 737 continue; 738 739 token = match_token(p, tokens, args); 740 switch (token) { 741 case Opt_hb_local: 742 *mount_opt |= OCFS2_MOUNT_HB_LOCAL; 743 break; 744 case Opt_hb_none: 745 *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 746 break; 747 case Opt_barrier: 748 if (match_int(&args[0], &option)) { 749 status = 0; 750 goto bail; 751 } 752 if (option) 753 *mount_opt |= OCFS2_MOUNT_BARRIER; 754 else 755 *mount_opt &= ~OCFS2_MOUNT_BARRIER; 756 break; 757 case Opt_intr: 758 *mount_opt &= ~OCFS2_MOUNT_NOINTR; 759 break; 760 case Opt_nointr: 761 *mount_opt |= OCFS2_MOUNT_NOINTR; 762 break; 763 case Opt_err_panic: 764 *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 765 break; 766 case Opt_err_ro: 767 *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; 768 break; 769 case Opt_data_ordered: 770 *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; 771 break; 772 case Opt_data_writeback: 773 *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 774 break; 775 case Opt_atime_quantum: 776 if (match_int(&args[0], &option)) { 777 status = 0; 778 goto bail; 779 } 780 if (option >= 0) 781 osb->s_atime_quantum = option; 782 else 783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 784 break; 785 default: 786 mlog(ML_ERROR, 787 "Unrecognized mount option \"%s\" " 788 "or missing value\n", p); 789 status = 0; 790 goto bail; 791 } 792 } 793 794 status = 1; 795 796 bail: 797 mlog_exit(status); 798 return status; 799 } 800 801 static int __init ocfs2_init(void) 802 { 803 int status; 804 805 mlog_entry_void(); 806 807 ocfs2_print_version(); 808 809 status = init_ocfs2_uptodate_cache(); 810 if (status < 0) { 811 mlog_errno(status); 812 goto leave; 813 } 814 815 status = ocfs2_initialize_mem_caches(); 816 if (status < 0) { 817 mlog_errno(status); 818 goto 
leave; 819 } 820 821 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); 822 if (!ocfs2_wq) { 823 status = -ENOMEM; 824 goto leave; 825 } 826 827 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 828 if (!ocfs2_debugfs_root) { 829 status = -EFAULT; 830 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 831 } 832 833 leave: 834 if (status < 0) { 835 ocfs2_free_mem_caches(); 836 exit_ocfs2_uptodate_cache(); 837 } 838 839 mlog_exit(status); 840 841 if (status >= 0) { 842 return register_filesystem(&ocfs2_fs_type); 843 } else 844 return -1; 845 } 846 847 static void __exit ocfs2_exit(void) 848 { 849 mlog_entry_void(); 850 851 if (ocfs2_wq) { 852 flush_workqueue(ocfs2_wq); 853 destroy_workqueue(ocfs2_wq); 854 } 855 856 debugfs_remove(ocfs2_debugfs_root); 857 858 ocfs2_free_mem_caches(); 859 860 unregister_filesystem(&ocfs2_fs_type); 861 862 exit_ocfs2_uptodate_cache(); 863 864 mlog_exit_void(); 865 } 866 867 static void ocfs2_put_super(struct super_block *sb) 868 { 869 mlog_entry("(0x%p)\n", sb); 870 871 ocfs2_sync_blockdev(sb); 872 ocfs2_dismount_volume(sb, 0); 873 874 mlog_exit_void(); 875 } 876 877 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 878 { 879 struct ocfs2_super *osb; 880 u32 numbits, freebits; 881 int status; 882 struct ocfs2_dinode *bm_lock; 883 struct buffer_head *bh = NULL; 884 struct inode *inode = NULL; 885 886 mlog_entry("(%p, %p)\n", dentry->d_sb, buf); 887 888 osb = OCFS2_SB(dentry->d_sb); 889 890 inode = ocfs2_get_system_file_inode(osb, 891 GLOBAL_BITMAP_SYSTEM_INODE, 892 OCFS2_INVALID_SLOT); 893 if (!inode) { 894 mlog(ML_ERROR, "failed to get bitmap inode\n"); 895 status = -EIO; 896 goto bail; 897 } 898 899 status = ocfs2_meta_lock(inode, &bh, 0); 900 if (status < 0) { 901 mlog_errno(status); 902 goto bail; 903 } 904 905 bm_lock = (struct ocfs2_dinode *) bh->b_data; 906 907 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); 908 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); 909 910 
buf->f_type = OCFS2_SUPER_MAGIC; 911 buf->f_bsize = dentry->d_sb->s_blocksize; 912 buf->f_namelen = OCFS2_MAX_FILENAME_LEN; 913 buf->f_blocks = ((sector_t) numbits) * 914 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 915 buf->f_bfree = ((sector_t) freebits) * 916 (osb->s_clustersize >> osb->sb->s_blocksize_bits); 917 buf->f_bavail = buf->f_bfree; 918 buf->f_files = numbits; 919 buf->f_ffree = freebits; 920 921 brelse(bh); 922 923 ocfs2_meta_unlock(inode, 0); 924 status = 0; 925 bail: 926 if (inode) 927 iput(inode); 928 929 mlog_exit(status); 930 931 return status; 932 } 933 934 static void ocfs2_inode_init_once(void *data, 935 struct kmem_cache *cachep, 936 unsigned long flags) 937 { 938 struct ocfs2_inode_info *oi = data; 939 940 if (flags & SLAB_CTOR_CONSTRUCTOR) { 941 oi->ip_flags = 0; 942 oi->ip_open_count = 0; 943 spin_lock_init(&oi->ip_lock); 944 ocfs2_extent_map_init(&oi->vfs_inode); 945 INIT_LIST_HEAD(&oi->ip_io_markers); 946 oi->ip_created_trans = 0; 947 oi->ip_last_trans = 0; 948 oi->ip_dir_start_lookup = 0; 949 950 init_rwsem(&oi->ip_alloc_sem); 951 mutex_init(&oi->ip_io_mutex); 952 953 oi->ip_blkno = 0ULL; 954 oi->ip_clusters = 0; 955 956 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 957 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 958 ocfs2_lock_res_init_once(&oi->ip_data_lockres); 959 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 960 961 ocfs2_metadata_cache_init(&oi->vfs_inode); 962 963 inode_init_once(&oi->vfs_inode); 964 } 965 } 966 967 static int ocfs2_initialize_mem_caches(void) 968 { 969 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", 970 sizeof(struct ocfs2_inode_info), 971 0, 972 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 973 SLAB_MEM_SPREAD), 974 ocfs2_inode_init_once, NULL); 975 if (!ocfs2_inode_cachep) 976 return -ENOMEM; 977 978 return 0; 979 } 980 981 static void ocfs2_free_mem_caches(void) 982 { 983 if (ocfs2_inode_cachep) 984 kmem_cache_destroy(ocfs2_inode_cachep); 985 986 ocfs2_inode_cachep = NULL; 987 } 988 989 
static int ocfs2_get_sector(struct super_block *sb, 990 struct buffer_head **bh, 991 int block, 992 int sect_size) 993 { 994 if (!sb_set_blocksize(sb, sect_size)) { 995 mlog(ML_ERROR, "unable to set blocksize\n"); 996 return -EIO; 997 } 998 999 *bh = sb_getblk(sb, block); 1000 if (!*bh) { 1001 mlog_errno(-EIO); 1002 return -EIO; 1003 } 1004 lock_buffer(*bh); 1005 if (!buffer_dirty(*bh)) 1006 clear_buffer_uptodate(*bh); 1007 unlock_buffer(*bh); 1008 ll_rw_block(READ, 1, bh); 1009 wait_on_buffer(*bh); 1010 return 0; 1011 } 1012 1013 /* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ 1014 static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) 1015 { 1016 int status; 1017 1018 /* XXX hold a ref on the node while mounte? easy enough, if 1019 * desirable. */ 1020 if (ocfs2_mount_local(osb)) 1021 osb->node_num = 0; 1022 else 1023 osb->node_num = o2nm_this_node(); 1024 1025 if (osb->node_num == O2NM_MAX_NODES) { 1026 mlog(ML_ERROR, "could not find this host's node number\n"); 1027 status = -ENOENT; 1028 goto bail; 1029 } 1030 1031 mlog(0, "I am node %d\n", osb->node_num); 1032 1033 status = 0; 1034 bail: 1035 return status; 1036 } 1037 1038 static int ocfs2_mount_volume(struct super_block *sb) 1039 { 1040 int status = 0; 1041 int unlock_super = 0; 1042 struct ocfs2_super *osb = OCFS2_SB(sb); 1043 1044 mlog_entry_void(); 1045 1046 if (ocfs2_is_hard_readonly(osb)) 1047 goto leave; 1048 1049 status = ocfs2_fill_local_node_info(osb); 1050 if (status < 0) { 1051 mlog_errno(status); 1052 goto leave; 1053 } 1054 1055 status = ocfs2_register_hb_callbacks(osb); 1056 if (status < 0) { 1057 mlog_errno(status); 1058 goto leave; 1059 } 1060 1061 status = ocfs2_dlm_init(osb); 1062 if (status < 0) { 1063 mlog_errno(status); 1064 goto leave; 1065 } 1066 1067 /* requires vote_thread to be running. 
*/ 1068 status = ocfs2_register_net_handlers(osb); 1069 if (status < 0) { 1070 mlog_errno(status); 1071 goto leave; 1072 } 1073 1074 status = ocfs2_super_lock(osb, 1); 1075 if (status < 0) { 1076 mlog_errno(status); 1077 goto leave; 1078 } 1079 unlock_super = 1; 1080 1081 /* This will load up the node map and add ourselves to it. */ 1082 status = ocfs2_find_slot(osb); 1083 if (status < 0) { 1084 mlog_errno(status); 1085 goto leave; 1086 } 1087 1088 ocfs2_populate_mounted_map(osb); 1089 1090 /* load all node-local system inodes */ 1091 status = ocfs2_init_local_system_inodes(osb); 1092 if (status < 0) { 1093 mlog_errno(status); 1094 goto leave; 1095 } 1096 1097 status = ocfs2_check_volume(osb); 1098 if (status < 0) { 1099 mlog_errno(status); 1100 goto leave; 1101 } 1102 1103 status = ocfs2_truncate_log_init(osb); 1104 if (status < 0) { 1105 mlog_errno(status); 1106 goto leave; 1107 } 1108 1109 if (ocfs2_mount_local(osb)) 1110 goto leave; 1111 1112 /* This should be sent *after* we recovered our journal as it 1113 * will cause other nodes to unmark us as needing 1114 * recovery. However, we need to send it *before* dropping the 1115 * super block lock as otherwise their recovery threads might 1116 * try to clean us up while we're live! 
*/ 1117 status = ocfs2_request_mount_vote(osb); 1118 if (status < 0) 1119 mlog_errno(status); 1120 1121 leave: 1122 if (unlock_super) 1123 ocfs2_super_unlock(osb, 1); 1124 1125 mlog_exit(status); 1126 return status; 1127 } 1128 1129 /* we can't grab the goofy sem lock from inside wait_event, so we use 1130 * memory barriers to make sure that we'll see the null task before 1131 * being woken up */ 1132 static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) 1133 { 1134 mb(); 1135 return osb->recovery_thread_task != NULL; 1136 } 1137 1138 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1139 { 1140 int tmp; 1141 struct ocfs2_super *osb = NULL; 1142 char nodestr[8]; 1143 1144 mlog_entry("(0x%p)\n", sb); 1145 1146 BUG_ON(!sb); 1147 osb = OCFS2_SB(sb); 1148 BUG_ON(!osb); 1149 1150 ocfs2_shutdown_local_alloc(osb); 1151 1152 ocfs2_truncate_log_shutdown(osb); 1153 1154 /* disable any new recovery threads and wait for any currently 1155 * running ones to exit. Do this before setting the vol_state. */ 1156 mutex_lock(&osb->recovery_lock); 1157 osb->disable_recovery = 1; 1158 mutex_unlock(&osb->recovery_lock); 1159 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); 1160 1161 /* At this point, we know that no more recovery threads can be 1162 * launched, so wait for any recovery completion work to 1163 * complete. */ 1164 flush_workqueue(ocfs2_wq); 1165 1166 ocfs2_journal_shutdown(osb); 1167 1168 ocfs2_sync_blockdev(sb); 1169 1170 /* No dlm means we've failed during mount, so skip all the 1171 * steps which depended on that to complete. 
*/ 1172 if (osb->dlm) { 1173 tmp = ocfs2_super_lock(osb, 1); 1174 if (tmp < 0) { 1175 mlog_errno(tmp); 1176 return; 1177 } 1178 1179 tmp = ocfs2_request_umount_vote(osb); 1180 if (tmp < 0) 1181 mlog_errno(tmp); 1182 1183 if (osb->slot_num != OCFS2_INVALID_SLOT) 1184 ocfs2_put_slot(osb); 1185 1186 ocfs2_super_unlock(osb, 1); 1187 } 1188 1189 ocfs2_release_system_inodes(osb); 1190 1191 if (osb->dlm) { 1192 ocfs2_unregister_net_handlers(osb); 1193 1194 ocfs2_dlm_shutdown(osb); 1195 } 1196 1197 ocfs2_clear_hb_callbacks(osb); 1198 1199 debugfs_remove(osb->osb_debug_root); 1200 1201 if (!mnt_err) 1202 ocfs2_stop_heartbeat(osb); 1203 1204 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1205 1206 if (ocfs2_mount_local(osb)) 1207 snprintf(nodestr, sizeof(nodestr), "local"); 1208 else 1209 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 1210 1211 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1212 osb->dev_str, nodestr); 1213 1214 ocfs2_delete_osb(osb); 1215 kfree(osb); 1216 sb->s_dev = 0; 1217 sb->s_fs_info = NULL; 1218 } 1219 1220 static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, 1221 unsigned uuid_bytes) 1222 { 1223 int i, ret; 1224 char *ptr; 1225 1226 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); 1227 1228 osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); 1229 if (osb->uuid_str == NULL) 1230 return -ENOMEM; 1231 1232 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { 1233 /* print with null */ 1234 ret = snprintf(ptr, 3, "%02X", uuid[i]); 1235 if (ret != 2) /* drop super cleans up */ 1236 return -EINVAL; 1237 /* then only advance past the last char */ 1238 ptr += 2; 1239 } 1240 1241 return 0; 1242 } 1243 1244 static int ocfs2_initialize_super(struct super_block *sb, 1245 struct buffer_head *bh, 1246 int sector_size) 1247 { 1248 int status = 0; 1249 int i; 1250 struct ocfs2_dinode *di = NULL; 1251 struct inode *inode = NULL; 1252 struct buffer_head *bitmap_bh = NULL; 1253 struct 
ocfs2_journal *journal; 1254 __le32 uuid_net_key; 1255 struct ocfs2_super *osb; 1256 1257 mlog_entry_void(); 1258 1259 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); 1260 if (!osb) { 1261 status = -ENOMEM; 1262 mlog_errno(status); 1263 goto bail; 1264 } 1265 1266 sb->s_fs_info = osb; 1267 sb->s_op = &ocfs2_sops; 1268 sb->s_export_op = &ocfs2_export_ops; 1269 sb->s_flags |= MS_NOATIME; 1270 /* this is needed to support O_LARGEFILE */ 1271 sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); 1272 1273 osb->sb = sb; 1274 /* Save off for ocfs2_rw_direct */ 1275 osb->s_sectsize_bits = blksize_bits(sector_size); 1276 BUG_ON(!osb->s_sectsize_bits); 1277 1278 osb->net_response_ids = 0; 1279 spin_lock_init(&osb->net_response_lock); 1280 INIT_LIST_HEAD(&osb->net_response_list); 1281 1282 INIT_LIST_HEAD(&osb->osb_net_handlers); 1283 init_waitqueue_head(&osb->recovery_event); 1284 spin_lock_init(&osb->vote_task_lock); 1285 init_waitqueue_head(&osb->vote_event); 1286 osb->vote_work_sequence = 0; 1287 osb->vote_wake_sequence = 0; 1288 INIT_LIST_HEAD(&osb->blocked_lock_list); 1289 osb->blocked_lock_count = 0; 1290 INIT_LIST_HEAD(&osb->vote_list); 1291 spin_lock_init(&osb->osb_lock); 1292 1293 atomic_set(&osb->alloc_stats.moves, 0); 1294 atomic_set(&osb->alloc_stats.local_data, 0); 1295 atomic_set(&osb->alloc_stats.bitmap_data, 0); 1296 atomic_set(&osb->alloc_stats.bg_allocs, 0); 1297 atomic_set(&osb->alloc_stats.bg_extends, 0); 1298 1299 ocfs2_init_node_maps(osb); 1300 1301 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1302 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1303 1304 mutex_init(&osb->recovery_lock); 1305 1306 osb->disable_recovery = 0; 1307 osb->recovery_thread_task = NULL; 1308 1309 init_waitqueue_head(&osb->checkpoint_event); 1310 atomic_set(&osb->needs_checkpoint, 0); 1311 1312 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1313 1314 osb->node_num = O2NM_INVALID_NODE_NUM; 1315 osb->slot_num = OCFS2_INVALID_SLOT; 1316 1317 
osb->local_alloc_state = OCFS2_LA_UNUSED; 1318 osb->local_alloc_bh = NULL; 1319 1320 ocfs2_setup_hb_callbacks(osb); 1321 1322 init_waitqueue_head(&osb->osb_mount_event); 1323 1324 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 1325 if (!osb->vol_label) { 1326 mlog(ML_ERROR, "unable to alloc vol label\n"); 1327 status = -ENOMEM; 1328 goto bail; 1329 } 1330 1331 di = (struct ocfs2_dinode *)bh->b_data; 1332 1333 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); 1334 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { 1335 mlog(ML_ERROR, "Invalid number of node slots (%u)\n", 1336 osb->max_slots); 1337 status = -EINVAL; 1338 goto bail; 1339 } 1340 mlog(0, "max_slots for this device: %u\n", osb->max_slots); 1341 1342 init_waitqueue_head(&osb->osb_wipe_event); 1343 osb->osb_orphan_wipes = kcalloc(osb->max_slots, 1344 sizeof(*osb->osb_orphan_wipes), 1345 GFP_KERNEL); 1346 if (!osb->osb_orphan_wipes) { 1347 status = -ENOMEM; 1348 mlog_errno(status); 1349 goto bail; 1350 } 1351 1352 osb->s_feature_compat = 1353 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); 1354 osb->s_feature_ro_compat = 1355 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); 1356 osb->s_feature_incompat = 1357 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); 1358 1359 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { 1360 mlog(ML_ERROR, "couldn't mount because of unsupported " 1361 "optional features (%x).\n", i); 1362 status = -EINVAL; 1363 goto bail; 1364 } 1365 if (!(osb->sb->s_flags & MS_RDONLY) && 1366 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { 1367 mlog(ML_ERROR, "couldn't mount RDWR because of " 1368 "unsupported optional features (%x).\n", i); 1369 status = -EINVAL; 1370 goto bail; 1371 } 1372 1373 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 1374 1375 /* FIXME 1376 * This should be done in ocfs2_journal_init(), but unknown 1377 * ordering issues will cause the filesystem to crash. 
1378 * If anyone wants to figure out what part of the code 1379 * refers to osb->journal before ocfs2_journal_init() is run, 1380 * be my guest. 1381 */ 1382 /* initialize our journal structure */ 1383 1384 journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); 1385 if (!journal) { 1386 mlog(ML_ERROR, "unable to alloc journal\n"); 1387 status = -ENOMEM; 1388 goto bail; 1389 } 1390 osb->journal = journal; 1391 journal->j_osb = osb; 1392 1393 atomic_set(&journal->j_num_trans, 0); 1394 init_rwsem(&journal->j_trans_barrier); 1395 init_waitqueue_head(&journal->j_checkpointed); 1396 spin_lock_init(&journal->j_lock); 1397 journal->j_trans_id = (unsigned long) 1; 1398 INIT_LIST_HEAD(&journal->j_la_cleanups); 1399 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 1400 journal->j_state = OCFS2_JOURNAL_FREE; 1401 1402 /* get some pseudo constants for clustersize bits */ 1403 osb->s_clustersize_bits = 1404 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 1405 osb->s_clustersize = 1 << osb->s_clustersize_bits; 1406 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); 1407 1408 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || 1409 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { 1410 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", 1411 osb->s_clustersize); 1412 status = -EINVAL; 1413 goto bail; 1414 } 1415 1416 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 1417 > (u32)~0UL) { 1418 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 1419 "what jbd can address in 32 bits.\n"); 1420 status = -EINVAL; 1421 goto bail; 1422 } 1423 1424 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, 1425 sizeof(di->id2.i_super.s_uuid))) { 1426 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); 1427 status = -ENOMEM; 1428 goto bail; 1429 } 1430 1431 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1432 osb->net_key = le32_to_cpu(uuid_net_key); 1433 1434 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 
1435 osb->vol_label[63] = '\0'; 1436 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 1437 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); 1438 osb->first_cluster_group_blkno = 1439 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 1440 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 1441 mlog(0, "vol_label: %s\n", osb->vol_label); 1442 mlog(0, "uuid: %s\n", osb->uuid_str); 1443 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 1444 (unsigned long long)osb->root_blkno, 1445 (unsigned long long)osb->system_dir_blkno); 1446 1447 osb->osb_dlm_debug = ocfs2_new_dlm_debug(); 1448 if (!osb->osb_dlm_debug) { 1449 status = -ENOMEM; 1450 mlog_errno(status); 1451 goto bail; 1452 } 1453 1454 atomic_set(&osb->vol_state, VOLUME_INIT); 1455 1456 /* load root, system_dir, and all global system inodes */ 1457 status = ocfs2_init_global_system_inodes(osb); 1458 if (status < 0) { 1459 mlog_errno(status); 1460 goto bail; 1461 } 1462 1463 /* 1464 * global bitmap 1465 */ 1466 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 1467 OCFS2_INVALID_SLOT); 1468 if (!inode) { 1469 status = -EINVAL; 1470 mlog_errno(status); 1471 goto bail; 1472 } 1473 1474 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1475 1476 /* We don't have a cluster lock on the bitmap here because 1477 * we're only interested in static information and the extra 1478 * complexity at mount time isn't worht it. Don't pass the 1479 * inode in to the read function though as we don't want it to 1480 * be put in the cache. 
*/ 1481 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, 1482 NULL); 1483 iput(inode); 1484 if (status < 0) { 1485 mlog_errno(status); 1486 goto bail; 1487 } 1488 1489 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1490 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); 1491 brelse(bitmap_bh); 1492 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n", 1493 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg); 1494 1495 status = ocfs2_init_slot_info(osb); 1496 if (status < 0) { 1497 mlog_errno(status); 1498 goto bail; 1499 } 1500 1501 bail: 1502 mlog_exit(status); 1503 return status; 1504 } 1505 1506 /* 1507 * will return: -EAGAIN if it is ok to keep searching for superblocks 1508 * -EINVAL if there is a bad superblock 1509 * 0 on success 1510 */ 1511 static int ocfs2_verify_volume(struct ocfs2_dinode *di, 1512 struct buffer_head *bh, 1513 u32 blksz) 1514 { 1515 int status = -EAGAIN; 1516 1517 mlog_entry_void(); 1518 1519 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 1520 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 1521 status = -EINVAL; 1522 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 1523 mlog(ML_ERROR, "found superblock with incorrect block " 1524 "size: found %u, should be %u\n", 1525 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), 1526 blksz); 1527 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != 1528 OCFS2_MAJOR_REV_LEVEL || 1529 le16_to_cpu(di->id2.i_super.s_minor_rev_level) != 1530 OCFS2_MINOR_REV_LEVEL) { 1531 mlog(ML_ERROR, "found superblock with bad version: " 1532 "found %u.%u, should be %u.%u\n", 1533 le16_to_cpu(di->id2.i_super.s_major_rev_level), 1534 le16_to_cpu(di->id2.i_super.s_minor_rev_level), 1535 OCFS2_MAJOR_REV_LEVEL, 1536 OCFS2_MINOR_REV_LEVEL); 1537 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { 1538 mlog(ML_ERROR, "bad block number on superblock: " 1539 "found %llu, should be %llu\n", 1540 (unsigned long long)le64_to_cpu(di->i_blkno), 1541 
(unsigned long long)bh->b_blocknr); 1542 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || 1543 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { 1544 mlog(ML_ERROR, "bad cluster size found: %u\n", 1545 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); 1546 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { 1547 mlog(ML_ERROR, "bad root_blkno: 0\n"); 1548 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { 1549 mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); 1550 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { 1551 mlog(ML_ERROR, 1552 "Superblock slots found greater than file system " 1553 "maximum: found %u, max %u\n", 1554 le16_to_cpu(di->id2.i_super.s_max_slots), 1555 OCFS2_MAX_SLOTS); 1556 } else { 1557 /* found it! */ 1558 status = 0; 1559 } 1560 } 1561 1562 mlog_exit(status); 1563 return status; 1564 } 1565 1566 static int ocfs2_check_volume(struct ocfs2_super *osb) 1567 { 1568 int status = 0; 1569 int dirty; 1570 int local; 1571 struct ocfs2_dinode *local_alloc = NULL; /* only used if we 1572 * recover 1573 * ourselves. */ 1574 1575 mlog_entry_void(); 1576 1577 /* Init our journal object. */ 1578 status = ocfs2_journal_init(osb->journal, &dirty); 1579 if (status < 0) { 1580 mlog(ML_ERROR, "Could not initialize journal!\n"); 1581 goto finally; 1582 } 1583 1584 /* If the journal was unmounted cleanly then we don't want to 1585 * recover anything. Otherwise, journal_load will do that 1586 * dirty work for us :) */ 1587 if (!dirty) { 1588 status = ocfs2_journal_wipe(osb->journal, 0); 1589 if (status < 0) { 1590 mlog_errno(status); 1591 goto finally; 1592 } 1593 } else { 1594 mlog(ML_NOTICE, "File system was not unmounted cleanly, " 1595 "recovering volume.\n"); 1596 } 1597 1598 local = ocfs2_mount_local(osb); 1599 1600 /* will play back anything left in the journal. */ 1601 ocfs2_journal_load(osb->journal, local); 1602 1603 if (dirty) { 1604 /* recover my local alloc if we didn't unmount cleanly. 
*/ 1605 status = ocfs2_begin_local_alloc_recovery(osb, 1606 osb->slot_num, 1607 &local_alloc); 1608 if (status < 0) { 1609 mlog_errno(status); 1610 goto finally; 1611 } 1612 /* we complete the recovery process after we've marked 1613 * ourselves as mounted. */ 1614 } 1615 1616 mlog(0, "Journal loaded.\n"); 1617 1618 status = ocfs2_load_local_alloc(osb); 1619 if (status < 0) { 1620 mlog_errno(status); 1621 goto finally; 1622 } 1623 1624 if (dirty) { 1625 /* Recovery will be completed after we've mounted the 1626 * rest of the volume. */ 1627 osb->dirty = 1; 1628 osb->local_alloc_copy = local_alloc; 1629 local_alloc = NULL; 1630 } 1631 1632 /* go through each journal, trylock it and if you get the 1633 * lock, and it's marked as dirty, set the bit in the recover 1634 * map and launch a recovery thread for it. */ 1635 status = ocfs2_mark_dead_nodes(osb); 1636 if (status < 0) 1637 mlog_errno(status); 1638 1639 finally: 1640 if (local_alloc) 1641 kfree(local_alloc); 1642 1643 mlog_exit(status); 1644 return status; 1645 } 1646 1647 /* 1648 * The routine gets called from dismount or close whenever a dismount on 1649 * volume is requested and the osb open count becomes 1. 1650 * It will remove the osb from the global list and also free up all the 1651 * initialized resources and fileobject. 1652 */ 1653 static void ocfs2_delete_osb(struct ocfs2_super *osb) 1654 { 1655 mlog_entry_void(); 1656 1657 /* This function assumes that the caller has the main osb resource */ 1658 1659 if (osb->slot_info) 1660 ocfs2_free_slot_info(osb->slot_info); 1661 1662 kfree(osb->osb_orphan_wipes); 1663 /* FIXME 1664 * This belongs in journal shutdown, but because we have to 1665 * allocate osb->journal at the start of ocfs2_initalize_osb(), 1666 * we free it here. 
1667 */ 1668 kfree(osb->journal); 1669 if (osb->local_alloc_copy) 1670 kfree(osb->local_alloc_copy); 1671 kfree(osb->uuid_str); 1672 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 1673 memset(osb, 0, sizeof(struct ocfs2_super)); 1674 1675 mlog_exit_void(); 1676 } 1677 1678 /* Put OCFS2 into a readonly state, or (if the user specifies it), 1679 * panic(). We do not support continue-on-error operation. */ 1680 static void ocfs2_handle_error(struct super_block *sb) 1681 { 1682 struct ocfs2_super *osb = OCFS2_SB(sb); 1683 1684 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) 1685 panic("OCFS2: (device %s): panic forced after error\n", 1686 sb->s_id); 1687 1688 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); 1689 1690 if (sb->s_flags & MS_RDONLY && 1691 (ocfs2_is_soft_readonly(osb) || 1692 ocfs2_is_hard_readonly(osb))) 1693 return; 1694 1695 printk(KERN_CRIT "File system is now read-only due to the potential " 1696 "of on-disk corruption. Please run fsck.ocfs2 once the file " 1697 "system is unmounted.\n"); 1698 sb->s_flags |= MS_RDONLY; 1699 ocfs2_set_ro_flag(osb, 0); 1700 } 1701 1702 static char error_buf[1024]; 1703 1704 void __ocfs2_error(struct super_block *sb, 1705 const char *function, 1706 const char *fmt, ...) 1707 { 1708 va_list args; 1709 1710 va_start(args, fmt); 1711 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 1712 va_end(args); 1713 1714 /* Not using mlog here because we want to show the actual 1715 * function the error came from. */ 1716 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", 1717 sb->s_id, function, error_buf); 1718 1719 ocfs2_handle_error(sb); 1720 } 1721 1722 /* Handle critical errors. This is intentionally more drastic than 1723 * ocfs2_handle_error, so we only use for things like journal errors, 1724 * etc. */ 1725 void __ocfs2_abort(struct super_block* sb, 1726 const char *function, 1727 const char *fmt, ...) 
1728 { 1729 va_list args; 1730 1731 va_start(args, fmt); 1732 vsnprintf(error_buf, sizeof(error_buf), fmt, args); 1733 va_end(args); 1734 1735 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", 1736 sb->s_id, function, error_buf); 1737 1738 /* We don't have the cluster support yet to go straight to 1739 * hard readonly in here. Until then, we want to keep 1740 * ocfs2_abort() so that we can at least mark critical 1741 * errors. 1742 * 1743 * TODO: This should abort the journal and alert other nodes 1744 * that our slot needs recovery. */ 1745 1746 /* Force a panic(). This stinks, but it's better than letting 1747 * things continue without having a proper hard readonly 1748 * here. */ 1749 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 1750 ocfs2_handle_error(sb); 1751 } 1752 1753 module_init(ocfs2_init); 1754 module_exit(ocfs2_exit); 1755