1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/devfs_fs_kernel.h> 43 #include <linux/buffer_head.h> /* for invalidate_bdev */ 44 #include <linux/suspend.h> 45 46 #include <linux/init.h> 47 48 #include <linux/file.h> 49 50 #ifdef CONFIG_KMOD 51 #include <linux/kmod.h> 52 #endif 53 54 #include <asm/unaligned.h> 55 56 #define MAJOR_NR MD_MAJOR 57 #define MD_DRIVER 58 59 /* 63 partitions with the alternate major number (mdp) */ 60 #define MdpMinorShift 6 61 62 #define DEBUG 0 63 #define dprintk(x...) ((void)(DEBUG && printk(x))) 64 65 66 #ifndef MODULE 67 static void autostart_arrays (int part); 68 #endif 69 70 static mdk_personality_t *pers[MAX_PERSONALITY]; 71 static DEFINE_SPINLOCK(pers_lock); 72 73 /* 74 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 75 * is 1000 KB/sec, so the extra system load does not show up that much. 76 * Increase it if you want to have more _guaranteed_ speed. Note that 77 * the RAID driver will use the maximum available bandwidth if the IO 78 * subsystem is idle. There is also an 'absolute maximum' reconstruction 79 * speed limit - in case reconstruction slows down your system despite 80 * idle IO detection. 81 * 82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
83 */ 84 85 static int sysctl_speed_limit_min = 1000; 86 static int sysctl_speed_limit_max = 200000; 87 88 static struct ctl_table_header *raid_table_header; 89 90 static ctl_table raid_table[] = { 91 { 92 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 93 .procname = "speed_limit_min", 94 .data = &sysctl_speed_limit_min, 95 .maxlen = sizeof(int), 96 .mode = 0644, 97 .proc_handler = &proc_dointvec, 98 }, 99 { 100 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 101 .procname = "speed_limit_max", 102 .data = &sysctl_speed_limit_max, 103 .maxlen = sizeof(int), 104 .mode = 0644, 105 .proc_handler = &proc_dointvec, 106 }, 107 { .ctl_name = 0 } 108 }; 109 110 static ctl_table raid_dir_table[] = { 111 { 112 .ctl_name = DEV_RAID, 113 .procname = "raid", 114 .maxlen = 0, 115 .mode = 0555, 116 .child = raid_table, 117 }, 118 { .ctl_name = 0 } 119 }; 120 121 static ctl_table raid_root_table[] = { 122 { 123 .ctl_name = CTL_DEV, 124 .procname = "dev", 125 .maxlen = 0, 126 .mode = 0555, 127 .child = raid_dir_table, 128 }, 129 { .ctl_name = 0 } 130 }; 131 132 static struct block_device_operations md_fops; 133 134 /* 135 * Enables to iterate over all existing md arrays 136 * all_mddevs_lock protects this list. 137 */ 138 static LIST_HEAD(all_mddevs); 139 static DEFINE_SPINLOCK(all_mddevs_lock); 140 141 142 /* 143 * iterates through all used mddevs in the system. 144 * We take care to grab the all_mddevs_lock whenever navigating 145 * the list, and to always hold a refcount when unlocked. 146 * Any code which breaks out of this loop while own 147 * a reference to the current mddev and must mddev_put it. 
148 */ 149 #define ITERATE_MDDEV(mddev,tmp) \ 150 \ 151 for (({ spin_lock(&all_mddevs_lock); \ 152 tmp = all_mddevs.next; \ 153 mddev = NULL;}); \ 154 ({ if (tmp != &all_mddevs) \ 155 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 156 spin_unlock(&all_mddevs_lock); \ 157 if (mddev) mddev_put(mddev); \ 158 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 159 tmp != &all_mddevs;}); \ 160 ({ spin_lock(&all_mddevs_lock); \ 161 tmp = tmp->next;}) \ 162 ) 163 164 165 static int md_fail_request (request_queue_t *q, struct bio *bio) 166 { 167 bio_io_error(bio, bio->bi_size); 168 return 0; 169 } 170 171 static inline mddev_t *mddev_get(mddev_t *mddev) 172 { 173 atomic_inc(&mddev->active); 174 return mddev; 175 } 176 177 static void mddev_put(mddev_t *mddev) 178 { 179 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 180 return; 181 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 182 list_del(&mddev->all_mddevs); 183 blk_put_queue(mddev->queue); 184 kobject_unregister(&mddev->kobj); 185 } 186 spin_unlock(&all_mddevs_lock); 187 } 188 189 static mddev_t * mddev_find(dev_t unit) 190 { 191 mddev_t *mddev, *new = NULL; 192 193 retry: 194 spin_lock(&all_mddevs_lock); 195 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 196 if (mddev->unit == unit) { 197 mddev_get(mddev); 198 spin_unlock(&all_mddevs_lock); 199 kfree(new); 200 return mddev; 201 } 202 203 if (new) { 204 list_add(&new->all_mddevs, &all_mddevs); 205 spin_unlock(&all_mddevs_lock); 206 return new; 207 } 208 spin_unlock(&all_mddevs_lock); 209 210 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 211 if (!new) 212 return NULL; 213 214 memset(new, 0, sizeof(*new)); 215 216 new->unit = unit; 217 if (MAJOR(unit) == MD_MAJOR) 218 new->md_minor = MINOR(unit); 219 else 220 new->md_minor = MINOR(unit) >> MdpMinorShift; 221 222 init_MUTEX(&new->reconfig_sem); 223 INIT_LIST_HEAD(&new->disks); 224 INIT_LIST_HEAD(&new->all_mddevs); 225 init_timer(&new->safemode_timer); 226 atomic_set(&new->active, 1); 227 
spin_lock_init(&new->write_lock); 228 init_waitqueue_head(&new->sb_wait); 229 230 new->queue = blk_alloc_queue(GFP_KERNEL); 231 if (!new->queue) { 232 kfree(new); 233 return NULL; 234 } 235 236 blk_queue_make_request(new->queue, md_fail_request); 237 238 goto retry; 239 } 240 241 static inline int mddev_lock(mddev_t * mddev) 242 { 243 return down_interruptible(&mddev->reconfig_sem); 244 } 245 246 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 247 { 248 down(&mddev->reconfig_sem); 249 } 250 251 static inline int mddev_trylock(mddev_t * mddev) 252 { 253 return down_trylock(&mddev->reconfig_sem); 254 } 255 256 static inline void mddev_unlock(mddev_t * mddev) 257 { 258 up(&mddev->reconfig_sem); 259 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 
page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 if (bio->bi_size) 334 return 1; 335 336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 md_error(rdev->mddev, rdev); 338 339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 wake_up(&rdev->mddev->sb_wait); 341 bio_put(bio); 342 return 0; 343 } 344 345 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 346 sector_t sector, int size, struct page *page) 347 { 348 /* write first size bytes of page to sector of rdev 349 * Increment mddev->pending_writes before returning 350 * and decrement it on completion, waking up sb_wait 351 * if zero is reached. 352 * If an error occurred, call md_error 353 */ 354 struct bio *bio = bio_alloc(GFP_NOIO, 1); 355 356 bio->bi_bdev = rdev->bdev; 357 bio->bi_sector = sector; 358 bio_add_page(bio, page, size, 0); 359 bio->bi_private = rdev; 360 bio->bi_end_io = super_written; 361 atomic_inc(&mddev->pending_writes); 362 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 363 } 364 365 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 366 { 367 if (bio->bi_size) 368 return 1; 369 370 complete((struct completion*)bio->bi_private); 371 return 0; 372 } 373 374 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 375 struct page *page, int rw) 376 { 377 struct bio *bio = bio_alloc(GFP_NOIO, 1); 378 struct completion event; 379 int ret; 380 381 rw |= (1 << BIO_RW_SYNC); 382 383 bio->bi_bdev = bdev; 384 bio->bi_sector = sector; 385 bio_add_page(bio, page, size, 0); 386 init_completion(&event); 387 bio->bi_private = &event; 388 bio->bi_end_io = bi_complete; 389 submit_bio(rw, bio); 390 wait_for_completion(&event); 391 392 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 393 bio_put(bio); 394 return ret; 395 } 396 
397 static int read_disk_sb(mdk_rdev_t * rdev, int size) 398 { 399 char b[BDEVNAME_SIZE]; 400 if (!rdev->sb_page) { 401 MD_BUG(); 402 return -EINVAL; 403 } 404 if (rdev->sb_loaded) 405 return 0; 406 407 408 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 409 goto fail; 410 rdev->sb_loaded = 1; 411 return 0; 412 413 fail: 414 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 415 bdevname(rdev->bdev,b)); 416 return -EINVAL; 417 } 418 419 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 420 { 421 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 422 (sb1->set_uuid1 == sb2->set_uuid1) && 423 (sb1->set_uuid2 == sb2->set_uuid2) && 424 (sb1->set_uuid3 == sb2->set_uuid3)) 425 426 return 1; 427 428 return 0; 429 } 430 431 432 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 433 { 434 int ret; 435 mdp_super_t *tmp1, *tmp2; 436 437 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 438 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 439 440 if (!tmp1 || !tmp2) { 441 ret = 0; 442 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 443 goto abort; 444 } 445 446 *tmp1 = *sb1; 447 *tmp2 = *sb2; 448 449 /* 450 * nr_disks is not constant 451 */ 452 tmp1->nr_disks = 0; 453 tmp2->nr_disks = 0; 454 455 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 456 ret = 0; 457 else 458 ret = 1; 459 460 abort: 461 kfree(tmp1); 462 kfree(tmp2); 463 return ret; 464 } 465 466 static unsigned int calc_sb_csum(mdp_super_t * sb) 467 { 468 unsigned int disk_csum, csum; 469 470 disk_csum = sb->sb_csum; 471 sb->sb_csum = 0; 472 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 473 sb->sb_csum = disk_csum; 474 return csum; 475 } 476 477 478 /* 479 * Handle superblock details. 480 * We want to be able to handle multiple superblock formats 481 * so we have a common interface to them all, and an array of 482 * different handlers. 483 * We rely on user-space to write the initial superblock, and support 484 * reading and updating of superblocks. 
485 * Interface methods are: 486 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 487 * loads and validates a superblock on dev. 488 * if refdev != NULL, compare superblocks on both devices 489 * Return: 490 * 0 - dev has a superblock that is compatible with refdev 491 * 1 - dev has a superblock that is compatible and newer than refdev 492 * so dev should be used as the refdev in future 493 * -EINVAL superblock incompatible or invalid 494 * -othererror e.g. -EIO 495 * 496 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 497 * Verify that dev is acceptable into mddev. 498 * The first time, mddev->raid_disks will be 0, and data from 499 * dev should be merged in. Subsequent calls check that dev 500 * is new enough. Return 0 or -EINVAL 501 * 502 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 503 * Update the superblock for rdev with data in mddev 504 * This does not write to disc. 505 * 506 */ 507 508 struct super_type { 509 char *name; 510 struct module *owner; 511 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 512 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 513 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 514 }; 515 516 /* 517 * load_super for 0.90.0 518 */ 519 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 520 { 521 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 522 mdp_super_t *sb; 523 int ret; 524 sector_t sb_offset; 525 526 /* 527 * Calculate the position of the superblock, 528 * it's at the end of the disk. 529 * 530 * It also happens to be a multiple of 4Kb. 
531 */ 532 sb_offset = calc_dev_sboffset(rdev->bdev); 533 rdev->sb_offset = sb_offset; 534 535 ret = read_disk_sb(rdev, MD_SB_BYTES); 536 if (ret) return ret; 537 538 ret = -EINVAL; 539 540 bdevname(rdev->bdev, b); 541 sb = (mdp_super_t*)page_address(rdev->sb_page); 542 543 if (sb->md_magic != MD_SB_MAGIC) { 544 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 545 b); 546 goto abort; 547 } 548 549 if (sb->major_version != 0 || 550 sb->minor_version != 90) { 551 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 552 sb->major_version, sb->minor_version, 553 b); 554 goto abort; 555 } 556 557 if (sb->raid_disks <= 0) 558 goto abort; 559 560 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 561 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 562 b); 563 goto abort; 564 } 565 566 rdev->preferred_minor = sb->md_minor; 567 rdev->data_offset = 0; 568 rdev->sb_size = MD_SB_BYTES; 569 570 if (sb->level == LEVEL_MULTIPATH) 571 rdev->desc_nr = -1; 572 else 573 rdev->desc_nr = sb->this_disk.number; 574 575 if (refdev == 0) 576 ret = 1; 577 else { 578 __u64 ev1, ev2; 579 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 580 if (!uuid_equal(refsb, sb)) { 581 printk(KERN_WARNING "md: %s has different UUID to %s\n", 582 b, bdevname(refdev->bdev,b2)); 583 goto abort; 584 } 585 if (!sb_equal(refsb, sb)) { 586 printk(KERN_WARNING "md: %s has same UUID" 587 " but different superblock to %s\n", 588 b, bdevname(refdev->bdev, b2)); 589 goto abort; 590 } 591 ev1 = md_event(sb); 592 ev2 = md_event(refsb); 593 if (ev1 > ev2) 594 ret = 1; 595 else 596 ret = 0; 597 } 598 rdev->size = calc_dev_size(rdev, sb->chunk_size); 599 600 abort: 601 return ret; 602 } 603 604 /* 605 * validate_super for 0.90.0 606 */ 607 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 608 { 609 mdp_disk_t *desc; 610 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 611 612 rdev->raid_disk = -1; 613 rdev->in_sync = 0; 614 if 
(mddev->raid_disks == 0) { 615 mddev->major_version = 0; 616 mddev->minor_version = sb->minor_version; 617 mddev->patch_version = sb->patch_version; 618 mddev->persistent = ! sb->not_persistent; 619 mddev->chunk_size = sb->chunk_size; 620 mddev->ctime = sb->ctime; 621 mddev->utime = sb->utime; 622 mddev->level = sb->level; 623 mddev->layout = sb->layout; 624 mddev->raid_disks = sb->raid_disks; 625 mddev->size = sb->size; 626 mddev->events = md_event(sb); 627 mddev->bitmap_offset = 0; 628 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 629 630 if (sb->state & (1<<MD_SB_CLEAN)) 631 mddev->recovery_cp = MaxSector; 632 else { 633 if (sb->events_hi == sb->cp_events_hi && 634 sb->events_lo == sb->cp_events_lo) { 635 mddev->recovery_cp = sb->recovery_cp; 636 } else 637 mddev->recovery_cp = 0; 638 } 639 640 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 641 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 642 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 643 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 644 645 mddev->max_disks = MD_SB_DISKS; 646 647 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 648 mddev->bitmap_file == NULL) { 649 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { 650 /* FIXME use a better test */ 651 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 652 return -EINVAL; 653 } 654 mddev->bitmap_offset = mddev->default_bitmap_offset; 655 } 656 657 } else if (mddev->pers == NULL) { 658 /* Insist on good event counter while assembling */ 659 __u64 ev1 = md_event(sb); 660 ++ev1; 661 if (ev1 < mddev->events) 662 return -EINVAL; 663 } else if (mddev->bitmap) { 664 /* if adding to array with a bitmap, then we can accept an 665 * older device ... but not too old. 
666 */ 667 __u64 ev1 = md_event(sb); 668 if (ev1 < mddev->bitmap->events_cleared) 669 return 0; 670 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 671 return 0; 672 673 if (mddev->level != LEVEL_MULTIPATH) { 674 rdev->faulty = 0; 675 rdev->flags = 0; 676 desc = sb->disks + rdev->desc_nr; 677 678 if (desc->state & (1<<MD_DISK_FAULTY)) 679 rdev->faulty = 1; 680 else if (desc->state & (1<<MD_DISK_SYNC) && 681 desc->raid_disk < mddev->raid_disks) { 682 rdev->in_sync = 1; 683 rdev->raid_disk = desc->raid_disk; 684 } 685 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 686 set_bit(WriteMostly, &rdev->flags); 687 } else /* MULTIPATH are always insync */ 688 rdev->in_sync = 1; 689 return 0; 690 } 691 692 /* 693 * sync_super for 0.90.0 694 */ 695 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 696 { 697 mdp_super_t *sb; 698 struct list_head *tmp; 699 mdk_rdev_t *rdev2; 700 int next_spare = mddev->raid_disks; 701 char nm[20]; 702 703 /* make rdev->sb match mddev data.. 704 * 705 * 1/ zero out disks 706 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 707 * 3/ any empty disks < next_spare become removed 708 * 709 * disks[0] gets initialised to REMOVED because 710 * we cannot be sure from other fields if it has 711 * been initialised or not. 
712 */ 713 int i; 714 int active=0, working=0,failed=0,spare=0,nr_disks=0; 715 unsigned int fixdesc=0; 716 717 rdev->sb_size = MD_SB_BYTES; 718 719 sb = (mdp_super_t*)page_address(rdev->sb_page); 720 721 memset(sb, 0, sizeof(*sb)); 722 723 sb->md_magic = MD_SB_MAGIC; 724 sb->major_version = mddev->major_version; 725 sb->minor_version = mddev->minor_version; 726 sb->patch_version = mddev->patch_version; 727 sb->gvalid_words = 0; /* ignored */ 728 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 729 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 730 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 731 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 732 733 sb->ctime = mddev->ctime; 734 sb->level = mddev->level; 735 sb->size = mddev->size; 736 sb->raid_disks = mddev->raid_disks; 737 sb->md_minor = mddev->md_minor; 738 sb->not_persistent = !mddev->persistent; 739 sb->utime = mddev->utime; 740 sb->state = 0; 741 sb->events_hi = (mddev->events>>32); 742 sb->events_lo = (u32)mddev->events; 743 744 if (mddev->in_sync) 745 { 746 sb->recovery_cp = mddev->recovery_cp; 747 sb->cp_events_hi = (mddev->events>>32); 748 sb->cp_events_lo = (u32)mddev->events; 749 if (mddev->recovery_cp == MaxSector) 750 sb->state = (1<< MD_SB_CLEAN); 751 } else 752 sb->recovery_cp = 0; 753 754 sb->layout = mddev->layout; 755 sb->chunk_size = mddev->chunk_size; 756 757 if (mddev->bitmap && mddev->bitmap_file == NULL) 758 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 759 760 sb->disks[0].state = (1<<MD_DISK_REMOVED); 761 ITERATE_RDEV(mddev,rdev2,tmp) { 762 mdp_disk_t *d; 763 int desc_nr; 764 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) 765 desc_nr = rdev2->raid_disk; 766 else 767 desc_nr = next_spare++; 768 if (desc_nr != rdev2->desc_nr) { 769 fixdesc |= (1 << desc_nr); 770 rdev2->desc_nr = desc_nr; 771 if (rdev2->raid_disk >= 0) { 772 sprintf(nm, "rd%d", rdev2->raid_disk); 773 sysfs_remove_link(&mddev->kobj, nm); 774 } 775 sysfs_remove_link(&rdev2->kobj, "block"); 776 kobject_del(&rdev2->kobj); 777 } 778 d = 
&sb->disks[rdev2->desc_nr]; 779 nr_disks++; 780 d->number = rdev2->desc_nr; 781 d->major = MAJOR(rdev2->bdev->bd_dev); 782 d->minor = MINOR(rdev2->bdev->bd_dev); 783 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) 784 d->raid_disk = rdev2->raid_disk; 785 else 786 d->raid_disk = rdev2->desc_nr; /* compatibility */ 787 if (rdev2->faulty) { 788 d->state = (1<<MD_DISK_FAULTY); 789 failed++; 790 } else if (rdev2->in_sync) { 791 d->state = (1<<MD_DISK_ACTIVE); 792 d->state |= (1<<MD_DISK_SYNC); 793 active++; 794 working++; 795 } else { 796 d->state = 0; 797 spare++; 798 working++; 799 } 800 if (test_bit(WriteMostly, &rdev2->flags)) 801 d->state |= (1<<MD_DISK_WRITEMOSTLY); 802 } 803 if (fixdesc) 804 ITERATE_RDEV(mddev,rdev2,tmp) 805 if (fixdesc & (1<<rdev2->desc_nr)) { 806 snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", 807 rdev2->desc_nr); 808 /* kobject_add gets a ref on the parent, so 809 * we have to drop the one we already have 810 */ 811 kobject_add(&rdev2->kobj); 812 kobject_put(rdev->kobj.parent); 813 sysfs_create_link(&rdev2->kobj, 814 &rdev2->bdev->bd_disk->kobj, 815 "block"); 816 if (rdev2->raid_disk >= 0) { 817 sprintf(nm, "rd%d", rdev2->raid_disk); 818 sysfs_create_link(&mddev->kobj, 819 &rdev2->kobj, nm); 820 } 821 } 822 /* now set the "removed" and "faulty" bits on any missing devices */ 823 for (i=0 ; i < mddev->raid_disks ; i++) { 824 mdp_disk_t *d = &sb->disks[i]; 825 if (d->state == 0 && d->number == 0) { 826 d->number = i; 827 d->raid_disk = i; 828 d->state = (1<<MD_DISK_REMOVED); 829 d->state |= (1<<MD_DISK_FAULTY); 830 failed++; 831 } 832 } 833 sb->nr_disks = nr_disks; 834 sb->active_disks = active; 835 sb->working_disks = working; 836 sb->failed_disks = failed; 837 sb->spare_disks = spare; 838 839 sb->this_disk = sb->disks[rdev->desc_nr]; 840 sb->sb_csum = calc_sb_csum(sb); 841 } 842 843 /* 844 * version 1 superblock 845 */ 846 847 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 848 { 849 unsigned int disk_csum, 
csum; 850 unsigned long long newcsum; 851 int size = 256 + le32_to_cpu(sb->max_dev)*2; 852 unsigned int *isuper = (unsigned int*)sb; 853 int i; 854 855 disk_csum = sb->sb_csum; 856 sb->sb_csum = 0; 857 newcsum = 0; 858 for (i=0; size>=4; size -= 4 ) 859 newcsum += le32_to_cpu(*isuper++); 860 861 if (size == 2) 862 newcsum += le16_to_cpu(*(unsigned short*) isuper); 863 864 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 865 sb->sb_csum = disk_csum; 866 return cpu_to_le32(csum); 867 } 868 869 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 870 { 871 struct mdp_superblock_1 *sb; 872 int ret; 873 sector_t sb_offset; 874 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 875 int bmask; 876 877 /* 878 * Calculate the position of the superblock. 879 * It is always aligned to a 4K boundary and 880 * depeding on minor_version, it can be: 881 * 0: At least 8K, but less than 12K, from end of device 882 * 1: At start of device 883 * 2: 4K from start of device. 884 */ 885 switch(minor_version) { 886 case 0: 887 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 888 sb_offset -= 8*2; 889 sb_offset &= ~(sector_t)(4*2-1); 890 /* convert from sectors to K */ 891 sb_offset /= 2; 892 break; 893 case 1: 894 sb_offset = 0; 895 break; 896 case 2: 897 sb_offset = 4; 898 break; 899 default: 900 return -EINVAL; 901 } 902 rdev->sb_offset = sb_offset; 903 904 /* superblock is rarely larger than 1K, but it can be larger, 905 * and it is safe to read 4k, so we do that 906 */ 907 ret = read_disk_sb(rdev, 4096); 908 if (ret) return ret; 909 910 911 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 912 913 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 914 sb->major_version != cpu_to_le32(1) || 915 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 916 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 917 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 918 return -EINVAL; 919 920 if (calc_sb_1_csum(sb) != sb->sb_csum) { 921 printk("md: invalid superblock 
checksum on %s\n", 922 bdevname(rdev->bdev,b)); 923 return -EINVAL; 924 } 925 if (le64_to_cpu(sb->data_size) < 10) { 926 printk("md: data_size too small on %s\n", 927 bdevname(rdev->bdev,b)); 928 return -EINVAL; 929 } 930 rdev->preferred_minor = 0xffff; 931 rdev->data_offset = le64_to_cpu(sb->data_offset); 932 933 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 934 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 935 if (rdev->sb_size & bmask) 936 rdev-> sb_size = (rdev->sb_size | bmask)+1; 937 938 if (refdev == 0) 939 return 1; 940 else { 941 __u64 ev1, ev2; 942 struct mdp_superblock_1 *refsb = 943 (struct mdp_superblock_1*)page_address(refdev->sb_page); 944 945 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 946 sb->level != refsb->level || 947 sb->layout != refsb->layout || 948 sb->chunksize != refsb->chunksize) { 949 printk(KERN_WARNING "md: %s has strangely different" 950 " superblock to %s\n", 951 bdevname(rdev->bdev,b), 952 bdevname(refdev->bdev,b2)); 953 return -EINVAL; 954 } 955 ev1 = le64_to_cpu(sb->events); 956 ev2 = le64_to_cpu(refsb->events); 957 958 if (ev1 > ev2) 959 return 1; 960 } 961 if (minor_version) 962 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 963 else 964 rdev->size = rdev->sb_offset; 965 if (rdev->size < le64_to_cpu(sb->data_size)/2) 966 return -EINVAL; 967 rdev->size = le64_to_cpu(sb->data_size)/2; 968 if (le32_to_cpu(sb->chunksize)) 969 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 970 return 0; 971 } 972 973 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 974 { 975 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 976 977 rdev->raid_disk = -1; 978 rdev->in_sync = 0; 979 if (mddev->raid_disks == 0) { 980 mddev->major_version = 1; 981 mddev->patch_version = 0; 982 mddev->persistent = 1; 983 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 984 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 985 
mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 986 mddev->level = le32_to_cpu(sb->level); 987 mddev->layout = le32_to_cpu(sb->layout); 988 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 989 mddev->size = le64_to_cpu(sb->size)/2; 990 mddev->events = le64_to_cpu(sb->events); 991 mddev->bitmap_offset = 0; 992 mddev->default_bitmap_offset = 0; 993 mddev->default_bitmap_offset = 1024; 994 995 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 996 memcpy(mddev->uuid, sb->set_uuid, 16); 997 998 mddev->max_disks = (4096-256)/2; 999 1000 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1001 mddev->bitmap_file == NULL ) { 1002 if (mddev->level != 1) { 1003 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 1004 return -EINVAL; 1005 } 1006 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1007 } 1008 } else if (mddev->pers == NULL) { 1009 /* Insist of good event counter while assembling */ 1010 __u64 ev1 = le64_to_cpu(sb->events); 1011 ++ev1; 1012 if (ev1 < mddev->events) 1013 return -EINVAL; 1014 } else if (mddev->bitmap) { 1015 /* If adding to array with a bitmap, then we can accept an 1016 * older device, but not too old. 
1017 */ 1018 __u64 ev1 = le64_to_cpu(sb->events); 1019 if (ev1 < mddev->bitmap->events_cleared) 1020 return 0; 1021 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1022 return 0; 1023 1024 if (mddev->level != LEVEL_MULTIPATH) { 1025 int role; 1026 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1027 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1028 switch(role) { 1029 case 0xffff: /* spare */ 1030 rdev->faulty = 0; 1031 break; 1032 case 0xfffe: /* faulty */ 1033 rdev->faulty = 1; 1034 break; 1035 default: 1036 rdev->in_sync = 1; 1037 rdev->faulty = 0; 1038 rdev->raid_disk = role; 1039 break; 1040 } 1041 rdev->flags = 0; 1042 if (sb->devflags & WriteMostly1) 1043 set_bit(WriteMostly, &rdev->flags); 1044 } else /* MULTIPATH are always insync */ 1045 rdev->in_sync = 1; 1046 1047 return 0; 1048 } 1049 1050 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1051 { 1052 struct mdp_superblock_1 *sb; 1053 struct list_head *tmp; 1054 mdk_rdev_t *rdev2; 1055 int max_dev, i; 1056 /* make rdev->sb match mddev and rdev data. 
*/ 1057 1058 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1059 1060 sb->feature_map = 0; 1061 sb->pad0 = 0; 1062 memset(sb->pad1, 0, sizeof(sb->pad1)); 1063 memset(sb->pad2, 0, sizeof(sb->pad2)); 1064 memset(sb->pad3, 0, sizeof(sb->pad3)); 1065 1066 sb->utime = cpu_to_le64((__u64)mddev->utime); 1067 sb->events = cpu_to_le64(mddev->events); 1068 if (mddev->in_sync) 1069 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1070 else 1071 sb->resync_offset = cpu_to_le64(0); 1072 1073 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1074 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1075 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1076 } 1077 1078 max_dev = 0; 1079 ITERATE_RDEV(mddev,rdev2,tmp) 1080 if (rdev2->desc_nr+1 > max_dev) 1081 max_dev = rdev2->desc_nr+1; 1082 1083 sb->max_dev = cpu_to_le32(max_dev); 1084 for (i=0; i<max_dev;i++) 1085 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1086 1087 ITERATE_RDEV(mddev,rdev2,tmp) { 1088 i = rdev2->desc_nr; 1089 if (rdev2->faulty) 1090 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1091 else if (rdev2->in_sync) 1092 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1093 else 1094 sb->dev_roles[i] = cpu_to_le16(0xffff); 1095 } 1096 1097 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1098 sb->sb_csum = calc_sb_1_csum(sb); 1099 } 1100 1101 1102 static struct super_type super_types[] = { 1103 [0] = { 1104 .name = "0.90.0", 1105 .owner = THIS_MODULE, 1106 .load_super = super_90_load, 1107 .validate_super = super_90_validate, 1108 .sync_super = super_90_sync, 1109 }, 1110 [1] = { 1111 .name = "md-1", 1112 .owner = THIS_MODULE, 1113 .load_super = super_1_load, 1114 .validate_super = super_1_validate, 1115 .sync_super = super_1_sync, 1116 }, 1117 }; 1118 1119 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1120 { 1121 struct list_head *tmp; 1122 mdk_rdev_t *rdev; 1123 1124 ITERATE_RDEV(mddev,rdev,tmp) 1125 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 
1126 return rdev; 1127 1128 return NULL; 1129 } 1130 1131 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1132 { 1133 struct list_head *tmp; 1134 mdk_rdev_t *rdev; 1135 1136 ITERATE_RDEV(mddev1,rdev,tmp) 1137 if (match_dev_unit(mddev2, rdev)) 1138 return 1; 1139 1140 return 0; 1141 } 1142 1143 static LIST_HEAD(pending_raid_disks); 1144 1145 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1146 { 1147 mdk_rdev_t *same_pdev; 1148 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1149 1150 if (rdev->mddev) { 1151 MD_BUG(); 1152 return -EINVAL; 1153 } 1154 same_pdev = match_dev_unit(mddev, rdev); 1155 if (same_pdev) 1156 printk(KERN_WARNING 1157 "%s: WARNING: %s appears to be on the same physical" 1158 " disk as %s. True\n protection against single-disk" 1159 " failure might be compromised.\n", 1160 mdname(mddev), bdevname(rdev->bdev,b), 1161 bdevname(same_pdev->bdev,b2)); 1162 1163 /* Verify rdev->desc_nr is unique. 1164 * If it is -1, assign a free number, else 1165 * check number is not in use 1166 */ 1167 if (rdev->desc_nr < 0) { 1168 int choice = 0; 1169 if (mddev->pers) choice = mddev->raid_disks; 1170 while (find_rdev_nr(mddev, choice)) 1171 choice++; 1172 rdev->desc_nr = choice; 1173 } else { 1174 if (find_rdev_nr(mddev, rdev->desc_nr)) 1175 return -EBUSY; 1176 } 1177 1178 list_add(&rdev->same_set, &mddev->disks); 1179 rdev->mddev = mddev; 1180 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1181 1182 rdev->kobj.k_name = NULL; 1183 snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); 1184 rdev->kobj.parent = &mddev->kobj; 1185 kobject_add(&rdev->kobj); 1186 1187 sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); 1188 return 0; 1189 } 1190 1191 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1192 { 1193 char b[BDEVNAME_SIZE]; 1194 if (!rdev->mddev) { 1195 MD_BUG(); 1196 return; 1197 } 1198 list_del_init(&rdev->same_set); 1199 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 
1200 rdev->mddev = NULL; 1201 sysfs_remove_link(&rdev->kobj, "block"); 1202 kobject_del(&rdev->kobj); 1203 } 1204 1205 /* 1206 * prevent the device from being mounted, repartitioned or 1207 * otherwise reused by a RAID array (or any other kernel 1208 * subsystem), by bd_claiming the device. 1209 */ 1210 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1211 { 1212 int err = 0; 1213 struct block_device *bdev; 1214 char b[BDEVNAME_SIZE]; 1215 1216 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1217 if (IS_ERR(bdev)) { 1218 printk(KERN_ERR "md: could not open %s.\n", 1219 __bdevname(dev, b)); 1220 return PTR_ERR(bdev); 1221 } 1222 err = bd_claim(bdev, rdev); 1223 if (err) { 1224 printk(KERN_ERR "md: could not bd_claim %s.\n", 1225 bdevname(bdev, b)); 1226 blkdev_put(bdev); 1227 return err; 1228 } 1229 rdev->bdev = bdev; 1230 return err; 1231 } 1232 1233 static void unlock_rdev(mdk_rdev_t *rdev) 1234 { 1235 struct block_device *bdev = rdev->bdev; 1236 rdev->bdev = NULL; 1237 if (!bdev) 1238 MD_BUG(); 1239 bd_release(bdev); 1240 blkdev_put(bdev); 1241 } 1242 1243 void md_autodetect_dev(dev_t dev); 1244 1245 static void export_rdev(mdk_rdev_t * rdev) 1246 { 1247 char b[BDEVNAME_SIZE]; 1248 printk(KERN_INFO "md: export_rdev(%s)\n", 1249 bdevname(rdev->bdev,b)); 1250 if (rdev->mddev) 1251 MD_BUG(); 1252 free_disk_sb(rdev); 1253 list_del_init(&rdev->same_set); 1254 #ifndef MODULE 1255 md_autodetect_dev(rdev->bdev->bd_dev); 1256 #endif 1257 unlock_rdev(rdev); 1258 kobject_put(&rdev->kobj); 1259 } 1260 1261 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1262 { 1263 unbind_rdev_from_array(rdev); 1264 export_rdev(rdev); 1265 } 1266 1267 static void export_array(mddev_t *mddev) 1268 { 1269 struct list_head *tmp; 1270 mdk_rdev_t *rdev; 1271 1272 ITERATE_RDEV(mddev,rdev,tmp) { 1273 if (!rdev->mddev) { 1274 MD_BUG(); 1275 continue; 1276 } 1277 kick_rdev_from_array(rdev); 1278 } 1279 if (!list_empty(&mddev->disks)) 1280 MD_BUG(); 1281 mddev->raid_disks = 0; 1282 
mddev->major_version = 0; 1283 } 1284 1285 static void print_desc(mdp_disk_t *desc) 1286 { 1287 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1288 desc->major,desc->minor,desc->raid_disk,desc->state); 1289 } 1290 1291 static void print_sb(mdp_super_t *sb) 1292 { 1293 int i; 1294 1295 printk(KERN_INFO 1296 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1297 sb->major_version, sb->minor_version, sb->patch_version, 1298 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1299 sb->ctime); 1300 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1301 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1302 sb->md_minor, sb->layout, sb->chunk_size); 1303 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1304 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1305 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1306 sb->failed_disks, sb->spare_disks, 1307 sb->sb_csum, (unsigned long)sb->events_lo); 1308 1309 printk(KERN_INFO); 1310 for (i = 0; i < MD_SB_DISKS; i++) { 1311 mdp_disk_t *desc; 1312 1313 desc = sb->disks + i; 1314 if (desc->number || desc->major || desc->minor || 1315 desc->raid_disk || (desc->state && (desc->state != 4))) { 1316 printk(" D %2d: ", i); 1317 print_desc(desc); 1318 } 1319 } 1320 printk(KERN_INFO "md: THIS: "); 1321 print_desc(&sb->this_disk); 1322 1323 } 1324 1325 static void print_rdev(mdk_rdev_t *rdev) 1326 { 1327 char b[BDEVNAME_SIZE]; 1328 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1329 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1330 rdev->faulty, rdev->in_sync, rdev->desc_nr); 1331 if (rdev->sb_loaded) { 1332 printk(KERN_INFO "md: rdev superblock:\n"); 1333 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1334 } else 1335 printk(KERN_INFO "md: no rdev superblock!\n"); 1336 } 1337 1338 void md_print_devices(void) 1339 { 1340 struct list_head *tmp, *tmp2; 1341 mdk_rdev_t *rdev; 1342 mddev_t *mddev; 1343 char b[BDEVNAME_SIZE]; 1344 1345 printk("\n"); 1346 printk("md: 
**********************************\n"); 1347 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1348 printk("md: **********************************\n"); 1349 ITERATE_MDDEV(mddev,tmp) { 1350 1351 if (mddev->bitmap) 1352 bitmap_print_sb(mddev->bitmap); 1353 else 1354 printk("%s: ", mdname(mddev)); 1355 ITERATE_RDEV(mddev,rdev,tmp2) 1356 printk("<%s>", bdevname(rdev->bdev,b)); 1357 printk("\n"); 1358 1359 ITERATE_RDEV(mddev,rdev,tmp2) 1360 print_rdev(rdev); 1361 } 1362 printk("md: **********************************\n"); 1363 printk("\n"); 1364 } 1365 1366 1367 static void sync_sbs(mddev_t * mddev) 1368 { 1369 mdk_rdev_t *rdev; 1370 struct list_head *tmp; 1371 1372 ITERATE_RDEV(mddev,rdev,tmp) { 1373 super_types[mddev->major_version]. 1374 sync_super(mddev, rdev); 1375 rdev->sb_loaded = 1; 1376 } 1377 } 1378 1379 static void md_update_sb(mddev_t * mddev) 1380 { 1381 int err; 1382 struct list_head *tmp; 1383 mdk_rdev_t *rdev; 1384 int sync_req; 1385 1386 repeat: 1387 spin_lock(&mddev->write_lock); 1388 sync_req = mddev->in_sync; 1389 mddev->utime = get_seconds(); 1390 mddev->events ++; 1391 1392 if (!mddev->events) { 1393 /* 1394 * oops, this 64-bit counter should never wrap. 
1395 * Either we are in around ~1 trillion A.C., assuming 1396 * 1 reboot per second, or we have a bug: 1397 */ 1398 MD_BUG(); 1399 mddev->events --; 1400 } 1401 mddev->sb_dirty = 2; 1402 sync_sbs(mddev); 1403 1404 /* 1405 * do not write anything to disk if using 1406 * nonpersistent superblocks 1407 */ 1408 if (!mddev->persistent) { 1409 mddev->sb_dirty = 0; 1410 spin_unlock(&mddev->write_lock); 1411 wake_up(&mddev->sb_wait); 1412 return; 1413 } 1414 spin_unlock(&mddev->write_lock); 1415 1416 dprintk(KERN_INFO 1417 "md: updating %s RAID superblock on device (in sync %d)\n", 1418 mdname(mddev),mddev->in_sync); 1419 1420 err = bitmap_update_sb(mddev->bitmap); 1421 ITERATE_RDEV(mddev,rdev,tmp) { 1422 char b[BDEVNAME_SIZE]; 1423 dprintk(KERN_INFO "md: "); 1424 if (rdev->faulty) 1425 dprintk("(skipping faulty "); 1426 1427 dprintk("%s ", bdevname(rdev->bdev,b)); 1428 if (!rdev->faulty) { 1429 md_super_write(mddev,rdev, 1430 rdev->sb_offset<<1, rdev->sb_size, 1431 rdev->sb_page); 1432 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1433 bdevname(rdev->bdev,b), 1434 (unsigned long long)rdev->sb_offset); 1435 1436 } else 1437 dprintk(")\n"); 1438 if (mddev->level == LEVEL_MULTIPATH) 1439 /* only need to write one superblock... 
*/ 1440 break; 1441 } 1442 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1443 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1444 1445 spin_lock(&mddev->write_lock); 1446 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1447 /* have to write it out again */ 1448 spin_unlock(&mddev->write_lock); 1449 goto repeat; 1450 } 1451 mddev->sb_dirty = 0; 1452 spin_unlock(&mddev->write_lock); 1453 wake_up(&mddev->sb_wait); 1454 1455 } 1456 1457 struct rdev_sysfs_entry { 1458 struct attribute attr; 1459 ssize_t (*show)(mdk_rdev_t *, char *); 1460 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1461 }; 1462 1463 static ssize_t 1464 rdev_show_state(mdk_rdev_t *rdev, char *page) 1465 { 1466 char *sep = ""; 1467 int len=0; 1468 1469 if (rdev->faulty) { 1470 len+= sprintf(page+len, "%sfaulty",sep); 1471 sep = ","; 1472 } 1473 if (rdev->in_sync) { 1474 len += sprintf(page+len, "%sin_sync",sep); 1475 sep = ","; 1476 } 1477 if (!rdev->faulty && !rdev->in_sync) { 1478 len += sprintf(page+len, "%sspare", sep); 1479 sep = ","; 1480 } 1481 return len+sprintf(page+len, "\n"); 1482 } 1483 1484 static struct rdev_sysfs_entry rdev_state = { 1485 .attr = {.name = "state", .mode = S_IRUGO }, 1486 .show = rdev_show_state, 1487 }; 1488 1489 static ssize_t 1490 rdev_show_super(mdk_rdev_t *rdev, char *page) 1491 { 1492 if (rdev->sb_loaded && rdev->sb_size) { 1493 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1494 return rdev->sb_size; 1495 } else 1496 return 0; 1497 } 1498 static struct rdev_sysfs_entry rdev_super = { 1499 .attr = {.name = "super", .mode = S_IRUGO }, 1500 .show = rdev_show_super, 1501 }; 1502 static struct attribute *rdev_default_attrs[] = { 1503 &rdev_state.attr, 1504 &rdev_super.attr, 1505 NULL, 1506 }; 1507 static ssize_t 1508 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1509 { 1510 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1511 
mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1512 1513 if (!entry->show) 1514 return -EIO; 1515 return entry->show(rdev, page); 1516 } 1517 1518 static ssize_t 1519 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1520 const char *page, size_t length) 1521 { 1522 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1523 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1524 1525 if (!entry->store) 1526 return -EIO; 1527 return entry->store(rdev, page, length); 1528 } 1529 1530 static void rdev_free(struct kobject *ko) 1531 { 1532 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1533 kfree(rdev); 1534 } 1535 static struct sysfs_ops rdev_sysfs_ops = { 1536 .show = rdev_attr_show, 1537 .store = rdev_attr_store, 1538 }; 1539 static struct kobj_type rdev_ktype = { 1540 .release = rdev_free, 1541 .sysfs_ops = &rdev_sysfs_ops, 1542 .default_attrs = rdev_default_attrs, 1543 }; 1544 1545 /* 1546 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1547 * 1548 * mark the device faulty if: 1549 * 1550 * - the device is nonexistent (zero size) 1551 * - the device has no valid superblock 1552 * 1553 * a faulty rdev _never_ has rdev->sb set. 
1554 */ 1555 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1556 { 1557 char b[BDEVNAME_SIZE]; 1558 int err; 1559 mdk_rdev_t *rdev; 1560 sector_t size; 1561 1562 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1563 if (!rdev) { 1564 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1565 return ERR_PTR(-ENOMEM); 1566 } 1567 memset(rdev, 0, sizeof(*rdev)); 1568 1569 if ((err = alloc_disk_sb(rdev))) 1570 goto abort_free; 1571 1572 err = lock_rdev(rdev, newdev); 1573 if (err) 1574 goto abort_free; 1575 1576 rdev->kobj.parent = NULL; 1577 rdev->kobj.ktype = &rdev_ktype; 1578 kobject_init(&rdev->kobj); 1579 1580 rdev->desc_nr = -1; 1581 rdev->faulty = 0; 1582 rdev->in_sync = 0; 1583 rdev->data_offset = 0; 1584 atomic_set(&rdev->nr_pending, 0); 1585 1586 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1587 if (!size) { 1588 printk(KERN_WARNING 1589 "md: %s has zero or unknown size, marking faulty!\n", 1590 bdevname(rdev->bdev,b)); 1591 err = -EINVAL; 1592 goto abort_free; 1593 } 1594 1595 if (super_format >= 0) { 1596 err = super_types[super_format]. 
1597 load_super(rdev, NULL, super_minor); 1598 if (err == -EINVAL) { 1599 printk(KERN_WARNING 1600 "md: %s has invalid sb, not importing!\n", 1601 bdevname(rdev->bdev,b)); 1602 goto abort_free; 1603 } 1604 if (err < 0) { 1605 printk(KERN_WARNING 1606 "md: could not read %s's sb, not importing!\n", 1607 bdevname(rdev->bdev,b)); 1608 goto abort_free; 1609 } 1610 } 1611 INIT_LIST_HEAD(&rdev->same_set); 1612 1613 return rdev; 1614 1615 abort_free: 1616 if (rdev->sb_page) { 1617 if (rdev->bdev) 1618 unlock_rdev(rdev); 1619 free_disk_sb(rdev); 1620 } 1621 kfree(rdev); 1622 return ERR_PTR(err); 1623 } 1624 1625 /* 1626 * Check a full RAID array for plausibility 1627 */ 1628 1629 1630 static void analyze_sbs(mddev_t * mddev) 1631 { 1632 int i; 1633 struct list_head *tmp; 1634 mdk_rdev_t *rdev, *freshest; 1635 char b[BDEVNAME_SIZE]; 1636 1637 freshest = NULL; 1638 ITERATE_RDEV(mddev,rdev,tmp) 1639 switch (super_types[mddev->major_version]. 1640 load_super(rdev, freshest, mddev->minor_version)) { 1641 case 1: 1642 freshest = rdev; 1643 break; 1644 case 0: 1645 break; 1646 default: 1647 printk( KERN_ERR \ 1648 "md: fatal superblock inconsistency in %s" 1649 " -- removing from array\n", 1650 bdevname(rdev->bdev,b)); 1651 kick_rdev_from_array(rdev); 1652 } 1653 1654 1655 super_types[mddev->major_version]. 1656 validate_super(mddev, freshest); 1657 1658 i = 0; 1659 ITERATE_RDEV(mddev,rdev,tmp) { 1660 if (rdev != freshest) 1661 if (super_types[mddev->major_version]. 
1662 validate_super(mddev, rdev)) { 1663 printk(KERN_WARNING "md: kicking non-fresh %s" 1664 " from array!\n", 1665 bdevname(rdev->bdev,b)); 1666 kick_rdev_from_array(rdev); 1667 continue; 1668 } 1669 if (mddev->level == LEVEL_MULTIPATH) { 1670 rdev->desc_nr = i++; 1671 rdev->raid_disk = rdev->desc_nr; 1672 rdev->in_sync = 1; 1673 } 1674 } 1675 1676 1677 1678 if (mddev->recovery_cp != MaxSector && 1679 mddev->level >= 1) 1680 printk(KERN_ERR "md: %s: raid array is not clean" 1681 " -- starting background reconstruction\n", 1682 mdname(mddev)); 1683 1684 } 1685 1686 static ssize_t 1687 md_show_level(mddev_t *mddev, char *page) 1688 { 1689 mdk_personality_t *p = mddev->pers; 1690 if (p == NULL) 1691 return 0; 1692 if (mddev->level >= 0) 1693 return sprintf(page, "RAID-%d\n", mddev->level); 1694 else 1695 return sprintf(page, "%s\n", p->name); 1696 } 1697 1698 static struct md_sysfs_entry md_level = { 1699 .attr = {.name = "level", .mode = S_IRUGO }, 1700 .show = md_show_level, 1701 }; 1702 1703 static ssize_t 1704 md_show_rdisks(mddev_t *mddev, char *page) 1705 { 1706 return sprintf(page, "%d\n", mddev->raid_disks); 1707 } 1708 1709 static struct md_sysfs_entry md_raid_disks = { 1710 .attr = {.name = "raid_disks", .mode = S_IRUGO }, 1711 .show = md_show_rdisks, 1712 }; 1713 1714 static ssize_t 1715 md_show_scan(mddev_t *mddev, char *page) 1716 { 1717 char *type = "none"; 1718 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1719 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 1720 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1721 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1722 type = "resync"; 1723 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1724 type = "check"; 1725 else 1726 type = "repair"; 1727 } else 1728 type = "recover"; 1729 } 1730 return sprintf(page, "%s\n", type); 1731 } 1732 1733 static ssize_t 1734 md_store_scan(mddev_t *mddev, const char *page, size_t len) 1735 { 1736 int canscan=0; 1737 1738 if 
(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1739 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 1740 return -EBUSY; 1741 down(&mddev->reconfig_sem); 1742 if (mddev->pers && mddev->pers->sync_request) 1743 canscan=1; 1744 up(&mddev->reconfig_sem); 1745 if (!canscan) 1746 return -EINVAL; 1747 1748 if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) 1749 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 1750 else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) 1751 return -EINVAL; 1752 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 1753 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 1754 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1755 md_wakeup_thread(mddev->thread); 1756 return len; 1757 } 1758 1759 static ssize_t 1760 md_show_mismatch(mddev_t *mddev, char *page) 1761 { 1762 return sprintf(page, "%llu\n", 1763 (unsigned long long) mddev->resync_mismatches); 1764 } 1765 1766 static struct md_sysfs_entry md_scan_mode = { 1767 .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, 1768 .show = md_show_scan, 1769 .store = md_store_scan, 1770 }; 1771 1772 static struct md_sysfs_entry md_mismatches = { 1773 .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, 1774 .show = md_show_mismatch, 1775 }; 1776 1777 static struct attribute *md_default_attrs[] = { 1778 &md_level.attr, 1779 &md_raid_disks.attr, 1780 &md_scan_mode.attr, 1781 &md_mismatches.attr, 1782 NULL, 1783 }; 1784 1785 static ssize_t 1786 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1787 { 1788 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1789 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 1790 1791 if (!entry->show) 1792 return -EIO; 1793 return entry->show(mddev, page); 1794 } 1795 1796 static ssize_t 1797 md_attr_store(struct kobject *kobj, struct attribute *attr, 1798 const char *page, size_t length) 1799 { 1800 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1801 mddev_t 
*mddev = container_of(kobj, struct mddev_s, kobj); 1802 1803 if (!entry->store) 1804 return -EIO; 1805 return entry->store(mddev, page, length); 1806 } 1807 1808 static void md_free(struct kobject *ko) 1809 { 1810 mddev_t *mddev = container_of(ko, mddev_t, kobj); 1811 kfree(mddev); 1812 } 1813 1814 static struct sysfs_ops md_sysfs_ops = { 1815 .show = md_attr_show, 1816 .store = md_attr_store, 1817 }; 1818 static struct kobj_type md_ktype = { 1819 .release = md_free, 1820 .sysfs_ops = &md_sysfs_ops, 1821 .default_attrs = md_default_attrs, 1822 }; 1823 1824 int mdp_major = 0; 1825 1826 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1827 { 1828 static DECLARE_MUTEX(disks_sem); 1829 mddev_t *mddev = mddev_find(dev); 1830 struct gendisk *disk; 1831 int partitioned = (MAJOR(dev) != MD_MAJOR); 1832 int shift = partitioned ? MdpMinorShift : 0; 1833 int unit = MINOR(dev) >> shift; 1834 1835 if (!mddev) 1836 return NULL; 1837 1838 down(&disks_sem); 1839 if (mddev->gendisk) { 1840 up(&disks_sem); 1841 mddev_put(mddev); 1842 return NULL; 1843 } 1844 disk = alloc_disk(1 << shift); 1845 if (!disk) { 1846 up(&disks_sem); 1847 mddev_put(mddev); 1848 return NULL; 1849 } 1850 disk->major = MAJOR(dev); 1851 disk->first_minor = unit << shift; 1852 if (partitioned) { 1853 sprintf(disk->disk_name, "md_d%d", unit); 1854 sprintf(disk->devfs_name, "md/d%d", unit); 1855 } else { 1856 sprintf(disk->disk_name, "md%d", unit); 1857 sprintf(disk->devfs_name, "md/%d", unit); 1858 } 1859 disk->fops = &md_fops; 1860 disk->private_data = mddev; 1861 disk->queue = mddev->queue; 1862 add_disk(disk); 1863 mddev->gendisk = disk; 1864 up(&disks_sem); 1865 mddev->kobj.parent = &disk->kobj; 1866 mddev->kobj.k_name = NULL; 1867 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 1868 mddev->kobj.ktype = &md_ktype; 1869 kobject_register(&mddev->kobj); 1870 return NULL; 1871 } 1872 1873 void md_wakeup_thread(mdk_thread_t *thread); 1874 1875 static void md_safemode_timeout(unsigned long 
data) 1876 { 1877 mddev_t *mddev = (mddev_t *) data; 1878 1879 mddev->safemode = 1; 1880 md_wakeup_thread(mddev->thread); 1881 } 1882 1883 1884 static int do_md_run(mddev_t * mddev) 1885 { 1886 int pnum, err; 1887 int chunk_size; 1888 struct list_head *tmp; 1889 mdk_rdev_t *rdev; 1890 struct gendisk *disk; 1891 char b[BDEVNAME_SIZE]; 1892 1893 if (list_empty(&mddev->disks)) 1894 /* cannot run an array with no devices.. */ 1895 return -EINVAL; 1896 1897 if (mddev->pers) 1898 return -EBUSY; 1899 1900 /* 1901 * Analyze all RAID superblock(s) 1902 */ 1903 if (!mddev->raid_disks) 1904 analyze_sbs(mddev); 1905 1906 chunk_size = mddev->chunk_size; 1907 pnum = level_to_pers(mddev->level); 1908 1909 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1910 if (!chunk_size) { 1911 /* 1912 * 'default chunksize' in the old md code used to 1913 * be PAGE_SIZE, baaad. 1914 * we abort here to be on the safe side. We don't 1915 * want to continue the bad practice. 1916 */ 1917 printk(KERN_ERR 1918 "no chunksize specified, see 'man raidtab'\n"); 1919 return -EINVAL; 1920 } 1921 if (chunk_size > MAX_CHUNK_SIZE) { 1922 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1923 chunk_size, MAX_CHUNK_SIZE); 1924 return -EINVAL; 1925 } 1926 /* 1927 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1928 */ 1929 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1930 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1931 return -EINVAL; 1932 } 1933 if (chunk_size < PAGE_SIZE) { 1934 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1935 chunk_size, PAGE_SIZE); 1936 return -EINVAL; 1937 } 1938 1939 /* devices must have minimum size of one chunk */ 1940 ITERATE_RDEV(mddev,rdev,tmp) { 1941 if (rdev->faulty) 1942 continue; 1943 if (rdev->size < chunk_size / 1024) { 1944 printk(KERN_WARNING 1945 "md: Dev %s smaller than chunk_size:" 1946 " %lluk < %dk\n", 1947 bdevname(rdev->bdev,b), 1948 (unsigned long long)rdev->size, 1949 chunk_size / 1024); 1950 return -EINVAL; 1951 } 1952 } 1953 
} 1954 1955 #ifdef CONFIG_KMOD 1956 if (!pers[pnum]) 1957 { 1958 request_module("md-personality-%d", pnum); 1959 } 1960 #endif 1961 1962 /* 1963 * Drop all container device buffers, from now on 1964 * the only valid external interface is through the md 1965 * device. 1966 * Also find largest hardsector size 1967 */ 1968 ITERATE_RDEV(mddev,rdev,tmp) { 1969 if (rdev->faulty) 1970 continue; 1971 sync_blockdev(rdev->bdev); 1972 invalidate_bdev(rdev->bdev, 0); 1973 } 1974 1975 md_probe(mddev->unit, NULL, NULL); 1976 disk = mddev->gendisk; 1977 if (!disk) 1978 return -ENOMEM; 1979 1980 spin_lock(&pers_lock); 1981 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1982 spin_unlock(&pers_lock); 1983 printk(KERN_WARNING "md: personality %d is not loaded!\n", 1984 pnum); 1985 return -EINVAL; 1986 } 1987 1988 mddev->pers = pers[pnum]; 1989 spin_unlock(&pers_lock); 1990 1991 mddev->recovery = 0; 1992 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1993 1994 /* before we start the array running, initialise the bitmap */ 1995 err = bitmap_create(mddev); 1996 if (err) 1997 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 1998 mdname(mddev), err); 1999 else 2000 err = mddev->pers->run(mddev); 2001 if (err) { 2002 printk(KERN_ERR "md: pers->run() failed ...\n"); 2003 module_put(mddev->pers->owner); 2004 mddev->pers = NULL; 2005 bitmap_destroy(mddev); 2006 return err; 2007 } 2008 atomic_set(&mddev->writes_pending,0); 2009 mddev->safemode = 0; 2010 mddev->safemode_timer.function = md_safemode_timeout; 2011 mddev->safemode_timer.data = (unsigned long) mddev; 2012 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 2013 mddev->in_sync = 1; 2014 2015 ITERATE_RDEV(mddev,rdev,tmp) 2016 if (rdev->raid_disk >= 0) { 2017 char nm[20]; 2018 sprintf(nm, "rd%d", rdev->raid_disk); 2019 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2020 } 2021 2022 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2023 md_wakeup_thread(mddev->thread); 
2024 2025 if (mddev->sb_dirty) 2026 md_update_sb(mddev); 2027 2028 set_capacity(disk, mddev->array_size<<1); 2029 2030 /* If we call blk_queue_make_request here, it will 2031 * re-initialise max_sectors etc which may have been 2032 * refined inside -> run. So just set the bits we need to set. 2033 * Most initialisation happended when we called 2034 * blk_queue_make_request(..., md_fail_request) 2035 * earlier. 2036 */ 2037 mddev->queue->queuedata = mddev; 2038 mddev->queue->make_request_fn = mddev->pers->make_request; 2039 2040 mddev->changed = 1; 2041 return 0; 2042 } 2043 2044 static int restart_array(mddev_t *mddev) 2045 { 2046 struct gendisk *disk = mddev->gendisk; 2047 int err; 2048 2049 /* 2050 * Complain if it has no devices 2051 */ 2052 err = -ENXIO; 2053 if (list_empty(&mddev->disks)) 2054 goto out; 2055 2056 if (mddev->pers) { 2057 err = -EBUSY; 2058 if (!mddev->ro) 2059 goto out; 2060 2061 mddev->safemode = 0; 2062 mddev->ro = 0; 2063 set_disk_ro(disk, 0); 2064 2065 printk(KERN_INFO "md: %s switched to read-write mode.\n", 2066 mdname(mddev)); 2067 /* 2068 * Kick recovery or resync if necessary 2069 */ 2070 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2071 md_wakeup_thread(mddev->thread); 2072 err = 0; 2073 } else { 2074 printk(KERN_ERR "md: %s has no personality assigned.\n", 2075 mdname(mddev)); 2076 err = -EINVAL; 2077 } 2078 2079 out: 2080 return err; 2081 } 2082 2083 static int do_md_stop(mddev_t * mddev, int ro) 2084 { 2085 int err = 0; 2086 struct gendisk *disk = mddev->gendisk; 2087 2088 if (mddev->pers) { 2089 if (atomic_read(&mddev->active)>2) { 2090 printk("md: %s still in use.\n",mdname(mddev)); 2091 return -EBUSY; 2092 } 2093 2094 if (mddev->sync_thread) { 2095 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2096 md_unregister_thread(mddev->sync_thread); 2097 mddev->sync_thread = NULL; 2098 } 2099 2100 del_timer_sync(&mddev->safemode_timer); 2101 2102 invalidate_partition(disk, 0); 2103 2104 if (ro) { 2105 err = -ENXIO; 2106 if (mddev->ro) 
2107 goto out; 2108 mddev->ro = 1; 2109 } else { 2110 bitmap_flush(mddev); 2111 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 2112 if (mddev->ro) 2113 set_disk_ro(disk, 0); 2114 blk_queue_make_request(mddev->queue, md_fail_request); 2115 mddev->pers->stop(mddev); 2116 module_put(mddev->pers->owner); 2117 mddev->pers = NULL; 2118 if (mddev->ro) 2119 mddev->ro = 0; 2120 } 2121 if (!mddev->in_sync) { 2122 /* mark array as shutdown cleanly */ 2123 mddev->in_sync = 1; 2124 md_update_sb(mddev); 2125 } 2126 if (ro) 2127 set_disk_ro(disk, 1); 2128 } 2129 2130 bitmap_destroy(mddev); 2131 if (mddev->bitmap_file) { 2132 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2133 fput(mddev->bitmap_file); 2134 mddev->bitmap_file = NULL; 2135 } 2136 mddev->bitmap_offset = 0; 2137 2138 /* 2139 * Free resources if final stop 2140 */ 2141 if (!ro) { 2142 mdk_rdev_t *rdev; 2143 struct list_head *tmp; 2144 struct gendisk *disk; 2145 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2146 2147 ITERATE_RDEV(mddev,rdev,tmp) 2148 if (rdev->raid_disk >= 0) { 2149 char nm[20]; 2150 sprintf(nm, "rd%d", rdev->raid_disk); 2151 sysfs_remove_link(&mddev->kobj, nm); 2152 } 2153 2154 export_array(mddev); 2155 2156 mddev->array_size = 0; 2157 disk = mddev->gendisk; 2158 if (disk) 2159 set_capacity(disk, 0); 2160 mddev->changed = 1; 2161 } else 2162 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2163 mdname(mddev)); 2164 err = 0; 2165 out: 2166 return err; 2167 } 2168 2169 static void autorun_array(mddev_t *mddev) 2170 { 2171 mdk_rdev_t *rdev; 2172 struct list_head *tmp; 2173 int err; 2174 2175 if (list_empty(&mddev->disks)) 2176 return; 2177 2178 printk(KERN_INFO "md: running: "); 2179 2180 ITERATE_RDEV(mddev,rdev,tmp) { 2181 char b[BDEVNAME_SIZE]; 2182 printk("<%s>", bdevname(rdev->bdev,b)); 2183 } 2184 printk("\n"); 2185 2186 err = do_md_run (mddev); 2187 if (err) { 2188 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2189 do_md_stop 
(mddev, 0); 2190 } 2191 } 2192 2193 /* 2194 * lets try to run arrays based on all disks that have arrived 2195 * until now. (those are in pending_raid_disks) 2196 * 2197 * the method: pick the first pending disk, collect all disks with 2198 * the same UUID, remove all from the pending list and put them into 2199 * the 'same_array' list. Then order this list based on superblock 2200 * update time (freshest comes first), kick out 'old' disks and 2201 * compare superblocks. If everything's fine then run it. 2202 * 2203 * If "unit" is allocated, then bump its reference count 2204 */ 2205 static void autorun_devices(int part) 2206 { 2207 struct list_head candidates; 2208 struct list_head *tmp; 2209 mdk_rdev_t *rdev0, *rdev; 2210 mddev_t *mddev; 2211 char b[BDEVNAME_SIZE]; 2212 2213 printk(KERN_INFO "md: autorun ...\n"); 2214 while (!list_empty(&pending_raid_disks)) { 2215 dev_t dev; 2216 rdev0 = list_entry(pending_raid_disks.next, 2217 mdk_rdev_t, same_set); 2218 2219 printk(KERN_INFO "md: considering %s ...\n", 2220 bdevname(rdev0->bdev,b)); 2221 INIT_LIST_HEAD(&candidates); 2222 ITERATE_RDEV_PENDING(rdev,tmp) 2223 if (super_90_load(rdev, rdev0, 0) >= 0) { 2224 printk(KERN_INFO "md: adding %s ...\n", 2225 bdevname(rdev->bdev,b)); 2226 list_move(&rdev->same_set, &candidates); 2227 } 2228 /* 2229 * now we have a set of devices, with all of them having 2230 * mostly sane superblocks. It's time to allocate the 2231 * mddev. 
2232 */ 2233 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2234 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2235 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2236 break; 2237 } 2238 if (part) 2239 dev = MKDEV(mdp_major, 2240 rdev0->preferred_minor << MdpMinorShift); 2241 else 2242 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2243 2244 md_probe(dev, NULL, NULL); 2245 mddev = mddev_find(dev); 2246 if (!mddev) { 2247 printk(KERN_ERR 2248 "md: cannot allocate memory for md drive.\n"); 2249 break; 2250 } 2251 if (mddev_lock(mddev)) 2252 printk(KERN_WARNING "md: %s locked, cannot run\n", 2253 mdname(mddev)); 2254 else if (mddev->raid_disks || mddev->major_version 2255 || !list_empty(&mddev->disks)) { 2256 printk(KERN_WARNING 2257 "md: %s already running, cannot run %s\n", 2258 mdname(mddev), bdevname(rdev0->bdev,b)); 2259 mddev_unlock(mddev); 2260 } else { 2261 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2262 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2263 list_del_init(&rdev->same_set); 2264 if (bind_rdev_to_array(rdev, mddev)) 2265 export_rdev(rdev); 2266 } 2267 autorun_array(mddev); 2268 mddev_unlock(mddev); 2269 } 2270 /* on success, candidates will be empty, on error 2271 * it won't... 2272 */ 2273 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2274 export_rdev(rdev); 2275 mddev_put(mddev); 2276 } 2277 printk(KERN_INFO "md: ... autorun DONE.\n"); 2278 } 2279 2280 /* 2281 * import RAID devices based on one partition 2282 * if possible, the array gets run as well. 
2283 */ 2284 2285 static int autostart_array(dev_t startdev) 2286 { 2287 char b[BDEVNAME_SIZE]; 2288 int err = -EINVAL, i; 2289 mdp_super_t *sb = NULL; 2290 mdk_rdev_t *start_rdev = NULL, *rdev; 2291 2292 start_rdev = md_import_device(startdev, 0, 0); 2293 if (IS_ERR(start_rdev)) 2294 return err; 2295 2296 2297 /* NOTE: this can only work for 0.90.0 superblocks */ 2298 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2299 if (sb->major_version != 0 || 2300 sb->minor_version != 90 ) { 2301 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2302 export_rdev(start_rdev); 2303 return err; 2304 } 2305 2306 if (start_rdev->faulty) { 2307 printk(KERN_WARNING 2308 "md: can not autostart based on faulty %s!\n", 2309 bdevname(start_rdev->bdev,b)); 2310 export_rdev(start_rdev); 2311 return err; 2312 } 2313 list_add(&start_rdev->same_set, &pending_raid_disks); 2314 2315 for (i = 0; i < MD_SB_DISKS; i++) { 2316 mdp_disk_t *desc = sb->disks + i; 2317 dev_t dev = MKDEV(desc->major, desc->minor); 2318 2319 if (!dev) 2320 continue; 2321 if (dev == startdev) 2322 continue; 2323 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2324 continue; 2325 rdev = md_import_device(dev, 0, 0); 2326 if (IS_ERR(rdev)) 2327 continue; 2328 2329 list_add(&rdev->same_set, &pending_raid_disks); 2330 } 2331 2332 /* 2333 * possibly return codes 2334 */ 2335 autorun_devices(0); 2336 return 0; 2337 2338 } 2339 2340 2341 static int get_version(void __user * arg) 2342 { 2343 mdu_version_t ver; 2344 2345 ver.major = MD_MAJOR_VERSION; 2346 ver.minor = MD_MINOR_VERSION; 2347 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2348 2349 if (copy_to_user(arg, &ver, sizeof(ver))) 2350 return -EFAULT; 2351 2352 return 0; 2353 } 2354 2355 static int get_array_info(mddev_t * mddev, void __user * arg) 2356 { 2357 mdu_array_info_t info; 2358 int nr,working,active,failed,spare; 2359 mdk_rdev_t *rdev; 2360 struct list_head *tmp; 2361 2362 nr=working=active=failed=spare=0; 2363 ITERATE_RDEV(mddev,rdev,tmp) 
{ 2364 nr++; 2365 if (rdev->faulty) 2366 failed++; 2367 else { 2368 working++; 2369 if (rdev->in_sync) 2370 active++; 2371 else 2372 spare++; 2373 } 2374 } 2375 2376 info.major_version = mddev->major_version; 2377 info.minor_version = mddev->minor_version; 2378 info.patch_version = MD_PATCHLEVEL_VERSION; 2379 info.ctime = mddev->ctime; 2380 info.level = mddev->level; 2381 info.size = mddev->size; 2382 info.nr_disks = nr; 2383 info.raid_disks = mddev->raid_disks; 2384 info.md_minor = mddev->md_minor; 2385 info.not_persistent= !mddev->persistent; 2386 2387 info.utime = mddev->utime; 2388 info.state = 0; 2389 if (mddev->in_sync) 2390 info.state = (1<<MD_SB_CLEAN); 2391 if (mddev->bitmap && mddev->bitmap_offset) 2392 info.state = (1<<MD_SB_BITMAP_PRESENT); 2393 info.active_disks = active; 2394 info.working_disks = working; 2395 info.failed_disks = failed; 2396 info.spare_disks = spare; 2397 2398 info.layout = mddev->layout; 2399 info.chunk_size = mddev->chunk_size; 2400 2401 if (copy_to_user(arg, &info, sizeof(info))) 2402 return -EFAULT; 2403 2404 return 0; 2405 } 2406 2407 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2408 { 2409 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2410 char *ptr, *buf = NULL; 2411 int err = -ENOMEM; 2412 2413 file = kmalloc(sizeof(*file), GFP_KERNEL); 2414 if (!file) 2415 goto out; 2416 2417 /* bitmap disabled, zero the first byte and copy out */ 2418 if (!mddev->bitmap || !mddev->bitmap->file) { 2419 file->pathname[0] = '\0'; 2420 goto copy_out; 2421 } 2422 2423 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2424 if (!buf) 2425 goto out; 2426 2427 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2428 if (!ptr) 2429 goto out; 2430 2431 strcpy(file->pathname, ptr); 2432 2433 copy_out: 2434 err = 0; 2435 if (copy_to_user(arg, file, sizeof(*file))) 2436 err = -EFAULT; 2437 out: 2438 kfree(buf); 2439 kfree(file); 2440 return err; 2441 } 2442 2443 static int get_disk_info(mddev_t * 
/*
 * add_new_disk - attach the device described by @info to @mddev.
 *
 * @mddev: target array.
 * @info:  device description; major/minor select the device, and state,
 *         raid_disk and number are consulted depending on the mode below.
 *
 * Three modes, selected by the state of @mddev:
 *
 *  1. mddev->raid_disks == 0: the device must already carry a superblock
 *     of the array's version.  If the array already has members, the
 *     superblock is checked against the first one via load_super().
 *  2. mddev->pers set (array running): the device is bound with
 *     raid_disk == -1 and recovery is requested.
 *  3. otherwise: only allowed for major_version == 0; the device is
 *     configured from @info.  Nothing is done (and 0 is returned) when
 *     @info->state has MD_DISK_FAULTY set.
 *
 * Returns 0 on success, -EOVERFLOW if major/minor do not survive MKDEV(),
 * -EINVAL for unsupported requests or a load_super() failure, the
 * PTR_ERR() of md_import_device(), or the error from bind_rdev_to_array().
 */
static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdk_rdev_t *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	/* reject numbers that were truncated when packed into a dev_t */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* compare against the first device already in the array */
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			/* NOTE: this err shadows the outer one declared above */
			int err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				/* any load_super() failure is reported as a UUID mismatch */
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"%s: personality does not support diskops!\n",
			       mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			/* no superblock to consult: trust the caller's slot */
			if (info->state & (1<<MD_DISK_SYNC)  &&
			    info->raid_disk < mddev->raid_disks)
				rdev->raid_disk = info->raid_disk;
			else
				rdev->raid_disk = -1;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		/* remember the slot computed above before raid_disk is reset below */
		rdev->saved_raid_disk = rdev->raid_disk;

		rdev->in_sync = 0; /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		/* the device is bound without an assigned slot */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);

		/* recovery is requested even if the bind above failed */
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	/* a device described as faulty is not imported at all */
	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device (dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		rdev->faulty = 0;
		/*
		 * NOTE(review): raid_disk is either info->raid_disk (already
		 * known to be < raid_disks) or -1 here, so this test looks
		 * like it always succeeds - confirm intent.
		 */
		if (rdev->raid_disk < mddev->raid_disks)
			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
		else
			rdev->in_sync = 0;

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}

		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
		} else
			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
		rdev->size = calc_dev_size(rdev, mddev->chunk_size);

		/* the array size tracks the smallest device added so far */
		if (!mddev->size || (mddev->size > rdev->size))
			mddev->size = rdev->size;
	}

	return 0;
}
b[BDEVNAME_SIZE]; 2625 mdk_rdev_t *rdev; 2626 2627 if (!mddev->pers) 2628 return -ENODEV; 2629 2630 rdev = find_rdev(mddev, dev); 2631 if (!rdev) 2632 return -ENXIO; 2633 2634 if (rdev->raid_disk >= 0) 2635 goto busy; 2636 2637 kick_rdev_from_array(rdev); 2638 md_update_sb(mddev); 2639 2640 return 0; 2641 busy: 2642 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 2643 bdevname(rdev->bdev,b), mdname(mddev)); 2644 return -EBUSY; 2645 } 2646 2647 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2648 { 2649 char b[BDEVNAME_SIZE]; 2650 int err; 2651 unsigned int size; 2652 mdk_rdev_t *rdev; 2653 2654 if (!mddev->pers) 2655 return -ENODEV; 2656 2657 if (mddev->major_version != 0) { 2658 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2659 " version-0 superblocks.\n", 2660 mdname(mddev)); 2661 return -EINVAL; 2662 } 2663 if (!mddev->pers->hot_add_disk) { 2664 printk(KERN_WARNING 2665 "%s: personality does not support diskops!\n", 2666 mdname(mddev)); 2667 return -EINVAL; 2668 } 2669 2670 rdev = md_import_device (dev, -1, 0); 2671 if (IS_ERR(rdev)) { 2672 printk(KERN_WARNING 2673 "md: error, md_import_device() returned %ld\n", 2674 PTR_ERR(rdev)); 2675 return -EINVAL; 2676 } 2677 2678 if (mddev->persistent) 2679 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2680 else 2681 rdev->sb_offset = 2682 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2683 2684 size = calc_dev_size(rdev, mddev->chunk_size); 2685 rdev->size = size; 2686 2687 if (size < mddev->size) { 2688 printk(KERN_WARNING 2689 "%s: disk size %llu blocks < array size %llu\n", 2690 mdname(mddev), (unsigned long long)size, 2691 (unsigned long long)mddev->size); 2692 err = -ENOSPC; 2693 goto abort_export; 2694 } 2695 2696 if (rdev->faulty) { 2697 printk(KERN_WARNING 2698 "md: can not hot-add faulty %s disk to %s!\n", 2699 bdevname(rdev->bdev,b), mdname(mddev)); 2700 err = -EINVAL; 2701 goto abort_export; 2702 } 2703 rdev->in_sync = 0; 2704 rdev->desc_nr = -1; 2705 
bind_rdev_to_array(rdev, mddev); 2706 2707 /* 2708 * The rest should better be atomic, we can have disk failures 2709 * noticed in interrupt contexts ... 2710 */ 2711 2712 if (rdev->desc_nr == mddev->max_disks) { 2713 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2714 mdname(mddev)); 2715 err = -EBUSY; 2716 goto abort_unbind_export; 2717 } 2718 2719 rdev->raid_disk = -1; 2720 2721 md_update_sb(mddev); 2722 2723 /* 2724 * Kick recovery, maybe this spare has to be added to the 2725 * array immediately. 2726 */ 2727 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2728 md_wakeup_thread(mddev->thread); 2729 2730 return 0; 2731 2732 abort_unbind_export: 2733 unbind_rdev_from_array(rdev); 2734 2735 abort_export: 2736 export_rdev(rdev); 2737 return err; 2738 } 2739 2740 /* similar to deny_write_access, but accounts for our holding a reference 2741 * to the file ourselves */ 2742 static int deny_bitmap_write_access(struct file * file) 2743 { 2744 struct inode *inode = file->f_mapping->host; 2745 2746 spin_lock(&inode->i_lock); 2747 if (atomic_read(&inode->i_writecount) > 1) { 2748 spin_unlock(&inode->i_lock); 2749 return -ETXTBSY; 2750 } 2751 atomic_set(&inode->i_writecount, -1); 2752 spin_unlock(&inode->i_lock); 2753 2754 return 0; 2755 } 2756 2757 static int set_bitmap_file(mddev_t *mddev, int fd) 2758 { 2759 int err; 2760 2761 if (mddev->pers) { 2762 if (!mddev->pers->quiesce) 2763 return -EBUSY; 2764 if (mddev->recovery || mddev->sync_thread) 2765 return -EBUSY; 2766 /* we should be able to change the bitmap.. 
*/ 2767 } 2768 2769 2770 if (fd >= 0) { 2771 if (mddev->bitmap) 2772 return -EEXIST; /* cannot add when bitmap is present */ 2773 mddev->bitmap_file = fget(fd); 2774 2775 if (mddev->bitmap_file == NULL) { 2776 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2777 mdname(mddev)); 2778 return -EBADF; 2779 } 2780 2781 err = deny_bitmap_write_access(mddev->bitmap_file); 2782 if (err) { 2783 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2784 mdname(mddev)); 2785 fput(mddev->bitmap_file); 2786 mddev->bitmap_file = NULL; 2787 return err; 2788 } 2789 mddev->bitmap_offset = 0; /* file overrides offset */ 2790 } else if (mddev->bitmap == NULL) 2791 return -ENOENT; /* cannot remove what isn't there */ 2792 err = 0; 2793 if (mddev->pers) { 2794 mddev->pers->quiesce(mddev, 1); 2795 if (fd >= 0) 2796 err = bitmap_create(mddev); 2797 if (fd < 0 || err) 2798 bitmap_destroy(mddev); 2799 mddev->pers->quiesce(mddev, 0); 2800 } else if (fd < 0) { 2801 if (mddev->bitmap_file) 2802 fput(mddev->bitmap_file); 2803 mddev->bitmap_file = NULL; 2804 } 2805 2806 return err; 2807 } 2808 2809 /* 2810 * set_array_info is used two different ways 2811 * The original usage is when creating a new array. 2812 * In this usage, raid_disks is > 0 and it together with 2813 * level, size, not_persistent,layout,chunksize determine the 2814 * shape of the array. 2815 * This will always create an array with a type-0.90.0 superblock. 2816 * The newer usage is when assembling an array. 2817 * In this case raid_disks will be 0, and the major_version field is 2818 * use to determine which style super-blocks are to be found on the devices. 2819 * The minor and patch _version numbers are also kept incase the 2820 * super_block handler wishes to interpret them. 
2821 */ 2822 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2823 { 2824 2825 if (info->raid_disks == 0) { 2826 /* just setting version number for superblock loading */ 2827 if (info->major_version < 0 || 2828 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2829 super_types[info->major_version].name == NULL) { 2830 /* maybe try to auto-load a module? */ 2831 printk(KERN_INFO 2832 "md: superblock version %d not known\n", 2833 info->major_version); 2834 return -EINVAL; 2835 } 2836 mddev->major_version = info->major_version; 2837 mddev->minor_version = info->minor_version; 2838 mddev->patch_version = info->patch_version; 2839 return 0; 2840 } 2841 mddev->major_version = MD_MAJOR_VERSION; 2842 mddev->minor_version = MD_MINOR_VERSION; 2843 mddev->patch_version = MD_PATCHLEVEL_VERSION; 2844 mddev->ctime = get_seconds(); 2845 2846 mddev->level = info->level; 2847 mddev->size = info->size; 2848 mddev->raid_disks = info->raid_disks; 2849 /* don't set md_minor, it is determined by which /dev/md* was 2850 * openned 2851 */ 2852 if (info->state & (1<<MD_SB_CLEAN)) 2853 mddev->recovery_cp = MaxSector; 2854 else 2855 mddev->recovery_cp = 0; 2856 mddev->persistent = ! info->not_persistent; 2857 2858 mddev->layout = info->layout; 2859 mddev->chunk_size = info->chunk_size; 2860 2861 mddev->max_disks = MD_SB_DISKS; 2862 2863 mddev->sb_dirty = 1; 2864 2865 /* 2866 * Generate a 128 bit UUID 2867 */ 2868 get_random_bytes(mddev->uuid, 16); 2869 2870 return 0; 2871 } 2872 2873 /* 2874 * update_array_info is used to change the configuration of an 2875 * on-line array. 2876 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 2877 * fields in the info are checked against the array. 2878 * Any differences that cannot be handled will cause an error. 2879 * Normally, only one change can be managed at a time. 
2880 */ 2881 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2882 { 2883 int rv = 0; 2884 int cnt = 0; 2885 int state = 0; 2886 2887 /* calculate expected state,ignoring low bits */ 2888 if (mddev->bitmap && mddev->bitmap_offset) 2889 state |= (1 << MD_SB_BITMAP_PRESENT); 2890 2891 if (mddev->major_version != info->major_version || 2892 mddev->minor_version != info->minor_version || 2893 /* mddev->patch_version != info->patch_version || */ 2894 mddev->ctime != info->ctime || 2895 mddev->level != info->level || 2896 /* mddev->layout != info->layout || */ 2897 !mddev->persistent != info->not_persistent|| 2898 mddev->chunk_size != info->chunk_size || 2899 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 2900 ((state^info->state) & 0xfffffe00) 2901 ) 2902 return -EINVAL; 2903 /* Check there is only one change */ 2904 if (mddev->size != info->size) cnt++; 2905 if (mddev->raid_disks != info->raid_disks) cnt++; 2906 if (mddev->layout != info->layout) cnt++; 2907 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 2908 if (cnt == 0) return 0; 2909 if (cnt > 1) return -EINVAL; 2910 2911 if (mddev->layout != info->layout) { 2912 /* Change layout 2913 * we don't need to do anything at the md level, the 2914 * personality will take care of it all. 2915 */ 2916 if (mddev->pers->reconfig == NULL) 2917 return -EINVAL; 2918 else 2919 return mddev->pers->reconfig(mddev, info->layout, -1); 2920 } 2921 if (mddev->size != info->size) { 2922 mdk_rdev_t * rdev; 2923 struct list_head *tmp; 2924 if (mddev->pers->resize == NULL) 2925 return -EINVAL; 2926 /* The "size" is the amount of each device that is used. 2927 * This can only make sense for arrays with redundancy. 2928 * linear and raid0 always use whatever space is available 2929 * We can only consider changing the size if no resync 2930 * or reconstruction is happening, and if the new size 2931 * is acceptable. 
It must fit before the sb_offset or, 2932 * if that is <data_offset, it must fit before the 2933 * size of each device. 2934 * If size is zero, we find the largest size that fits. 2935 */ 2936 if (mddev->sync_thread) 2937 return -EBUSY; 2938 ITERATE_RDEV(mddev,rdev,tmp) { 2939 sector_t avail; 2940 int fit = (info->size == 0); 2941 if (rdev->sb_offset > rdev->data_offset) 2942 avail = (rdev->sb_offset*2) - rdev->data_offset; 2943 else 2944 avail = get_capacity(rdev->bdev->bd_disk) 2945 - rdev->data_offset; 2946 if (fit && (info->size == 0 || info->size > avail/2)) 2947 info->size = avail/2; 2948 if (avail < ((sector_t)info->size << 1)) 2949 return -ENOSPC; 2950 } 2951 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 2952 if (!rv) { 2953 struct block_device *bdev; 2954 2955 bdev = bdget_disk(mddev->gendisk, 0); 2956 if (bdev) { 2957 down(&bdev->bd_inode->i_sem); 2958 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2959 up(&bdev->bd_inode->i_sem); 2960 bdput(bdev); 2961 } 2962 } 2963 } 2964 if (mddev->raid_disks != info->raid_disks) { 2965 /* change the number of raid disks */ 2966 if (mddev->pers->reshape == NULL) 2967 return -EINVAL; 2968 if (info->raid_disks <= 0 || 2969 info->raid_disks >= mddev->max_disks) 2970 return -EINVAL; 2971 if (mddev->sync_thread) 2972 return -EBUSY; 2973 rv = mddev->pers->reshape(mddev, info->raid_disks); 2974 if (!rv) { 2975 struct block_device *bdev; 2976 2977 bdev = bdget_disk(mddev->gendisk, 0); 2978 if (bdev) { 2979 down(&bdev->bd_inode->i_sem); 2980 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2981 up(&bdev->bd_inode->i_sem); 2982 bdput(bdev); 2983 } 2984 } 2985 } 2986 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 2987 if (mddev->pers->quiesce == NULL) 2988 return -EINVAL; 2989 if (mddev->recovery || mddev->sync_thread) 2990 return -EBUSY; 2991 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 2992 /* add the bitmap */ 2993 if (mddev->bitmap) 2994 return -EEXIST; 2995 if 
(mddev->default_bitmap_offset == 0) 2996 return -EINVAL; 2997 mddev->bitmap_offset = mddev->default_bitmap_offset; 2998 mddev->pers->quiesce(mddev, 1); 2999 rv = bitmap_create(mddev); 3000 if (rv) 3001 bitmap_destroy(mddev); 3002 mddev->pers->quiesce(mddev, 0); 3003 } else { 3004 /* remove the bitmap */ 3005 if (!mddev->bitmap) 3006 return -ENOENT; 3007 if (mddev->bitmap->file) 3008 return -EINVAL; 3009 mddev->pers->quiesce(mddev, 1); 3010 bitmap_destroy(mddev); 3011 mddev->pers->quiesce(mddev, 0); 3012 mddev->bitmap_offset = 0; 3013 } 3014 } 3015 md_update_sb(mddev); 3016 return rv; 3017 } 3018 3019 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3020 { 3021 mdk_rdev_t *rdev; 3022 3023 if (mddev->pers == NULL) 3024 return -ENODEV; 3025 3026 rdev = find_rdev(mddev, dev); 3027 if (!rdev) 3028 return -ENODEV; 3029 3030 md_error(mddev, rdev); 3031 return 0; 3032 } 3033 3034 static int md_ioctl(struct inode *inode, struct file *file, 3035 unsigned int cmd, unsigned long arg) 3036 { 3037 int err = 0; 3038 void __user *argp = (void __user *)arg; 3039 struct hd_geometry __user *loc = argp; 3040 mddev_t *mddev = NULL; 3041 3042 if (!capable(CAP_SYS_ADMIN)) 3043 return -EACCES; 3044 3045 /* 3046 * Commands dealing with the RAID driver but not any 3047 * particular array: 3048 */ 3049 switch (cmd) 3050 { 3051 case RAID_VERSION: 3052 err = get_version(argp); 3053 goto done; 3054 3055 case PRINT_RAID_DEBUG: 3056 err = 0; 3057 md_print_devices(); 3058 goto done; 3059 3060 #ifndef MODULE 3061 case RAID_AUTORUN: 3062 err = 0; 3063 autostart_arrays(arg); 3064 goto done; 3065 #endif 3066 default:; 3067 } 3068 3069 /* 3070 * Commands creating/starting a new array: 3071 */ 3072 3073 mddev = inode->i_bdev->bd_disk->private_data; 3074 3075 if (!mddev) { 3076 BUG(); 3077 goto abort; 3078 } 3079 3080 3081 if (cmd == START_ARRAY) { 3082 /* START_ARRAY doesn't need to lock the array as autostart_array 3083 * does the locking, and it could even be a different array 3084 */ 3085 static 
int cnt = 3; 3086 if (cnt > 0 ) { 3087 printk(KERN_WARNING 3088 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3089 "This will not be supported beyond 2.6\n", 3090 current->comm, current->pid); 3091 cnt--; 3092 } 3093 err = autostart_array(new_decode_dev(arg)); 3094 if (err) { 3095 printk(KERN_WARNING "md: autostart failed!\n"); 3096 goto abort; 3097 } 3098 goto done; 3099 } 3100 3101 err = mddev_lock(mddev); 3102 if (err) { 3103 printk(KERN_INFO 3104 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3105 err, cmd); 3106 goto abort; 3107 } 3108 3109 switch (cmd) 3110 { 3111 case SET_ARRAY_INFO: 3112 { 3113 mdu_array_info_t info; 3114 if (!arg) 3115 memset(&info, 0, sizeof(info)); 3116 else if (copy_from_user(&info, argp, sizeof(info))) { 3117 err = -EFAULT; 3118 goto abort_unlock; 3119 } 3120 if (mddev->pers) { 3121 err = update_array_info(mddev, &info); 3122 if (err) { 3123 printk(KERN_WARNING "md: couldn't update" 3124 " array info. %d\n", err); 3125 goto abort_unlock; 3126 } 3127 goto done_unlock; 3128 } 3129 if (!list_empty(&mddev->disks)) { 3130 printk(KERN_WARNING 3131 "md: array %s already has disks!\n", 3132 mdname(mddev)); 3133 err = -EBUSY; 3134 goto abort_unlock; 3135 } 3136 if (mddev->raid_disks) { 3137 printk(KERN_WARNING 3138 "md: array %s already initialised!\n", 3139 mdname(mddev)); 3140 err = -EBUSY; 3141 goto abort_unlock; 3142 } 3143 err = set_array_info(mddev, &info); 3144 if (err) { 3145 printk(KERN_WARNING "md: couldn't set" 3146 " array info. 
%d\n", err); 3147 goto abort_unlock; 3148 } 3149 } 3150 goto done_unlock; 3151 3152 default:; 3153 } 3154 3155 /* 3156 * Commands querying/configuring an existing array: 3157 */ 3158 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3159 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3160 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3161 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3162 err = -ENODEV; 3163 goto abort_unlock; 3164 } 3165 3166 /* 3167 * Commands even a read-only array can execute: 3168 */ 3169 switch (cmd) 3170 { 3171 case GET_ARRAY_INFO: 3172 err = get_array_info(mddev, argp); 3173 goto done_unlock; 3174 3175 case GET_BITMAP_FILE: 3176 err = get_bitmap_file(mddev, argp); 3177 goto done_unlock; 3178 3179 case GET_DISK_INFO: 3180 err = get_disk_info(mddev, argp); 3181 goto done_unlock; 3182 3183 case RESTART_ARRAY_RW: 3184 err = restart_array(mddev); 3185 goto done_unlock; 3186 3187 case STOP_ARRAY: 3188 err = do_md_stop (mddev, 0); 3189 goto done_unlock; 3190 3191 case STOP_ARRAY_RO: 3192 err = do_md_stop (mddev, 1); 3193 goto done_unlock; 3194 3195 /* 3196 * We have a problem here : there is no easy way to give a CHS 3197 * virtual geometry. We currently pretend that we have a 2 heads 3198 * 4 sectors (with a BIG number of cylinders...). This drives 3199 * dosfs just mad... 
;-) 3200 */ 3201 case HDIO_GETGEO: 3202 if (!loc) { 3203 err = -EINVAL; 3204 goto abort_unlock; 3205 } 3206 err = put_user (2, (char __user *) &loc->heads); 3207 if (err) 3208 goto abort_unlock; 3209 err = put_user (4, (char __user *) &loc->sectors); 3210 if (err) 3211 goto abort_unlock; 3212 err = put_user(get_capacity(mddev->gendisk)/8, 3213 (short __user *) &loc->cylinders); 3214 if (err) 3215 goto abort_unlock; 3216 err = put_user (get_start_sect(inode->i_bdev), 3217 (long __user *) &loc->start); 3218 goto done_unlock; 3219 } 3220 3221 /* 3222 * The remaining ioctls are changing the state of the 3223 * superblock, so we do not allow read-only arrays 3224 * here: 3225 */ 3226 if (mddev->ro) { 3227 err = -EROFS; 3228 goto abort_unlock; 3229 } 3230 3231 switch (cmd) 3232 { 3233 case ADD_NEW_DISK: 3234 { 3235 mdu_disk_info_t info; 3236 if (copy_from_user(&info, argp, sizeof(info))) 3237 err = -EFAULT; 3238 else 3239 err = add_new_disk(mddev, &info); 3240 goto done_unlock; 3241 } 3242 3243 case HOT_REMOVE_DISK: 3244 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3245 goto done_unlock; 3246 3247 case HOT_ADD_DISK: 3248 err = hot_add_disk(mddev, new_decode_dev(arg)); 3249 goto done_unlock; 3250 3251 case SET_DISK_FAULTY: 3252 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3253 goto done_unlock; 3254 3255 case RUN_ARRAY: 3256 err = do_md_run (mddev); 3257 goto done_unlock; 3258 3259 case SET_BITMAP_FILE: 3260 err = set_bitmap_file(mddev, (int)arg); 3261 goto done_unlock; 3262 3263 default: 3264 if (_IOC_TYPE(cmd) == MD_MAJOR) 3265 printk(KERN_WARNING "md: %s(pid %d) used" 3266 " obsolete MD ioctl, upgrade your" 3267 " software to use new ictls.\n", 3268 current->comm, current->pid); 3269 err = -EINVAL; 3270 goto abort_unlock; 3271 } 3272 3273 done_unlock: 3274 abort_unlock: 3275 mddev_unlock(mddev); 3276 3277 return err; 3278 done: 3279 if (err) 3280 MD_BUG(); 3281 abort: 3282 return err; 3283 } 3284 3285 static int md_open(struct inode *inode, struct file 
*file) 3286 { 3287 /* 3288 * Succeed if we can lock the mddev, which confirms that 3289 * it isn't being stopped right now. 3290 */ 3291 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3292 int err; 3293 3294 if ((err = mddev_lock(mddev))) 3295 goto out; 3296 3297 err = 0; 3298 mddev_get(mddev); 3299 mddev_unlock(mddev); 3300 3301 check_disk_change(inode->i_bdev); 3302 out: 3303 return err; 3304 } 3305 3306 static int md_release(struct inode *inode, struct file * file) 3307 { 3308 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3309 3310 if (!mddev) 3311 BUG(); 3312 mddev_put(mddev); 3313 3314 return 0; 3315 } 3316 3317 static int md_media_changed(struct gendisk *disk) 3318 { 3319 mddev_t *mddev = disk->private_data; 3320 3321 return mddev->changed; 3322 } 3323 3324 static int md_revalidate(struct gendisk *disk) 3325 { 3326 mddev_t *mddev = disk->private_data; 3327 3328 mddev->changed = 0; 3329 return 0; 3330 } 3331 static struct block_device_operations md_fops = 3332 { 3333 .owner = THIS_MODULE, 3334 .open = md_open, 3335 .release = md_release, 3336 .ioctl = md_ioctl, 3337 .media_changed = md_media_changed, 3338 .revalidate_disk= md_revalidate, 3339 }; 3340 3341 static int md_thread(void * arg) 3342 { 3343 mdk_thread_t *thread = arg; 3344 3345 /* 3346 * md_thread is a 'system-thread', it's priority should be very 3347 * high. We avoid resource deadlocks individually in each 3348 * raid personality. (RAID5 does preallocation) We also use RR and 3349 * the very same RT priority as kswapd, thus we will never get 3350 * into a priority inversion deadlock. 3351 * 3352 * we definitely have to have equal or higher priority than 3353 * bdflush, otherwise bdflush will deadlock if there are too 3354 * many dirty RAID5 blocks. 
3355 */ 3356 3357 allow_signal(SIGKILL); 3358 complete(thread->event); 3359 while (!kthread_should_stop()) { 3360 void (*run)(mddev_t *); 3361 3362 wait_event_interruptible_timeout(thread->wqueue, 3363 test_bit(THREAD_WAKEUP, &thread->flags) 3364 || kthread_should_stop(), 3365 thread->timeout); 3366 try_to_freeze(); 3367 3368 clear_bit(THREAD_WAKEUP, &thread->flags); 3369 3370 run = thread->run; 3371 if (run) 3372 run(thread->mddev); 3373 } 3374 3375 return 0; 3376 } 3377 3378 void md_wakeup_thread(mdk_thread_t *thread) 3379 { 3380 if (thread) { 3381 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3382 set_bit(THREAD_WAKEUP, &thread->flags); 3383 wake_up(&thread->wqueue); 3384 } 3385 } 3386 3387 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3388 const char *name) 3389 { 3390 mdk_thread_t *thread; 3391 struct completion event; 3392 3393 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3394 if (!thread) 3395 return NULL; 3396 3397 memset(thread, 0, sizeof(mdk_thread_t)); 3398 init_waitqueue_head(&thread->wqueue); 3399 3400 init_completion(&event); 3401 thread->event = &event; 3402 thread->run = run; 3403 thread->mddev = mddev; 3404 thread->name = name; 3405 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3406 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3407 if (IS_ERR(thread->tsk)) { 3408 kfree(thread); 3409 return NULL; 3410 } 3411 wait_for_completion(&event); 3412 return thread; 3413 } 3414 3415 void md_unregister_thread(mdk_thread_t *thread) 3416 { 3417 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3418 3419 kthread_stop(thread->tsk); 3420 kfree(thread); 3421 } 3422 3423 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3424 { 3425 if (!mddev) { 3426 MD_BUG(); 3427 return; 3428 } 3429 3430 if (!rdev || rdev->faulty) 3431 return; 3432 /* 3433 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3434 mdname(mddev), 3435 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 
3436 __builtin_return_address(0),__builtin_return_address(1), 3437 __builtin_return_address(2),__builtin_return_address(3)); 3438 */ 3439 if (!mddev->pers->error_handler) 3440 return; 3441 mddev->pers->error_handler(mddev,rdev); 3442 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3443 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3444 md_wakeup_thread(mddev->thread); 3445 } 3446 3447 /* seq_file implementation /proc/mdstat */ 3448 3449 static void status_unused(struct seq_file *seq) 3450 { 3451 int i = 0; 3452 mdk_rdev_t *rdev; 3453 struct list_head *tmp; 3454 3455 seq_printf(seq, "unused devices: "); 3456 3457 ITERATE_RDEV_PENDING(rdev,tmp) { 3458 char b[BDEVNAME_SIZE]; 3459 i++; 3460 seq_printf(seq, "%s ", 3461 bdevname(rdev->bdev,b)); 3462 } 3463 if (!i) 3464 seq_printf(seq, "<none>"); 3465 3466 seq_printf(seq, "\n"); 3467 } 3468 3469 3470 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3471 { 3472 unsigned long max_blocks, resync, res, dt, db, rt; 3473 3474 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3475 3476 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3477 max_blocks = mddev->resync_max_sectors >> 1; 3478 else 3479 max_blocks = mddev->size; 3480 3481 /* 3482 * Should not happen. 3483 */ 3484 if (!max_blocks) { 3485 MD_BUG(); 3486 return; 3487 } 3488 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3489 { 3490 int i, x = res/50, y = 20-x; 3491 seq_printf(seq, "["); 3492 for (i = 0; i < x; i++) 3493 seq_printf(seq, "="); 3494 seq_printf(seq, ">"); 3495 for (i = 0; i < y; i++) 3496 seq_printf(seq, "."); 3497 seq_printf(seq, "] "); 3498 } 3499 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3500 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3501 "resync" : "recovery"), 3502 res/10, res % 10, resync, max_blocks); 3503 3504 /* 3505 * We do not want to overflow, so the order of operands and 3506 * the * 100 / 100 trick are important. We do a +1 to be 3507 * safe against division by zero. We only estimate anyway. 
3508 * 3509 * dt: time from mark until now 3510 * db: blocks written from mark until now 3511 * rt: remaining time 3512 */ 3513 dt = ((jiffies - mddev->resync_mark) / HZ); 3514 if (!dt) dt++; 3515 db = resync - (mddev->resync_mark_cnt/2); 3516 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3517 3518 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3519 3520 seq_printf(seq, " speed=%ldK/sec", db/dt); 3521 } 3522 3523 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3524 { 3525 struct list_head *tmp; 3526 loff_t l = *pos; 3527 mddev_t *mddev; 3528 3529 if (l >= 0x10000) 3530 return NULL; 3531 if (!l--) 3532 /* header */ 3533 return (void*)1; 3534 3535 spin_lock(&all_mddevs_lock); 3536 list_for_each(tmp,&all_mddevs) 3537 if (!l--) { 3538 mddev = list_entry(tmp, mddev_t, all_mddevs); 3539 mddev_get(mddev); 3540 spin_unlock(&all_mddevs_lock); 3541 return mddev; 3542 } 3543 spin_unlock(&all_mddevs_lock); 3544 if (!l--) 3545 return (void*)2;/* tail */ 3546 return NULL; 3547 } 3548 3549 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3550 { 3551 struct list_head *tmp; 3552 mddev_t *next_mddev, *mddev = v; 3553 3554 ++*pos; 3555 if (v == (void*)2) 3556 return NULL; 3557 3558 spin_lock(&all_mddevs_lock); 3559 if (v == (void*)1) 3560 tmp = all_mddevs.next; 3561 else 3562 tmp = mddev->all_mddevs.next; 3563 if (tmp != &all_mddevs) 3564 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3565 else { 3566 next_mddev = (void*)2; 3567 *pos = 0x10000; 3568 } 3569 spin_unlock(&all_mddevs_lock); 3570 3571 if (v != (void*)1) 3572 mddev_put(mddev); 3573 return next_mddev; 3574 3575 } 3576 3577 static void md_seq_stop(struct seq_file *seq, void *v) 3578 { 3579 mddev_t *mddev = v; 3580 3581 if (mddev && v != (void*)1 && v != (void*)2) 3582 mddev_put(mddev); 3583 } 3584 3585 static int md_seq_show(struct seq_file *seq, void *v) 3586 { 3587 mddev_t *mddev = v; 3588 sector_t size; 3589 struct list_head *tmp2; 3590 mdk_rdev_t *rdev; 
3591 int i; 3592 struct bitmap *bitmap; 3593 3594 if (v == (void*)1) { 3595 seq_printf(seq, "Personalities : "); 3596 spin_lock(&pers_lock); 3597 for (i = 0; i < MAX_PERSONALITY; i++) 3598 if (pers[i]) 3599 seq_printf(seq, "[%s] ", pers[i]->name); 3600 3601 spin_unlock(&pers_lock); 3602 seq_printf(seq, "\n"); 3603 return 0; 3604 } 3605 if (v == (void*)2) { 3606 status_unused(seq); 3607 return 0; 3608 } 3609 3610 if (mddev_lock(mddev)!=0) 3611 return -EINTR; 3612 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3613 seq_printf(seq, "%s : %sactive", mdname(mddev), 3614 mddev->pers ? "" : "in"); 3615 if (mddev->pers) { 3616 if (mddev->ro) 3617 seq_printf(seq, " (read-only)"); 3618 seq_printf(seq, " %s", mddev->pers->name); 3619 } 3620 3621 size = 0; 3622 ITERATE_RDEV(mddev,rdev,tmp2) { 3623 char b[BDEVNAME_SIZE]; 3624 seq_printf(seq, " %s[%d]", 3625 bdevname(rdev->bdev,b), rdev->desc_nr); 3626 if (test_bit(WriteMostly, &rdev->flags)) 3627 seq_printf(seq, "(W)"); 3628 if (rdev->faulty) { 3629 seq_printf(seq, "(F)"); 3630 continue; 3631 } else if (rdev->raid_disk < 0) 3632 seq_printf(seq, "(S)"); /* spare */ 3633 size += rdev->size; 3634 } 3635 3636 if (!list_empty(&mddev->disks)) { 3637 if (mddev->pers) 3638 seq_printf(seq, "\n %llu blocks", 3639 (unsigned long long)mddev->array_size); 3640 else 3641 seq_printf(seq, "\n %llu blocks", 3642 (unsigned long long)size); 3643 } 3644 if (mddev->persistent) { 3645 if (mddev->major_version != 0 || 3646 mddev->minor_version != 90) { 3647 seq_printf(seq," super %d.%d", 3648 mddev->major_version, 3649 mddev->minor_version); 3650 } 3651 } else 3652 seq_printf(seq, " super non-persistent"); 3653 3654 if (mddev->pers) { 3655 mddev->pers->status (seq, mddev); 3656 seq_printf(seq, "\n "); 3657 if (mddev->curr_resync > 2) { 3658 status_resync (seq, mddev); 3659 seq_printf(seq, "\n "); 3660 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3661 seq_printf(seq, " resync=DELAYED\n "); 3662 } else 3663 
seq_printf(seq, "\n "); 3664 3665 if ((bitmap = mddev->bitmap)) { 3666 unsigned long chunk_kb; 3667 unsigned long flags; 3668 spin_lock_irqsave(&bitmap->lock, flags); 3669 chunk_kb = bitmap->chunksize >> 10; 3670 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3671 "%lu%s chunk", 3672 bitmap->pages - bitmap->missing_pages, 3673 bitmap->pages, 3674 (bitmap->pages - bitmap->missing_pages) 3675 << (PAGE_SHIFT - 10), 3676 chunk_kb ? chunk_kb : bitmap->chunksize, 3677 chunk_kb ? "KB" : "B"); 3678 if (bitmap->file) { 3679 seq_printf(seq, ", file: "); 3680 seq_path(seq, bitmap->file->f_vfsmnt, 3681 bitmap->file->f_dentry," \t\n"); 3682 } 3683 3684 seq_printf(seq, "\n"); 3685 spin_unlock_irqrestore(&bitmap->lock, flags); 3686 } 3687 3688 seq_printf(seq, "\n"); 3689 } 3690 mddev_unlock(mddev); 3691 3692 return 0; 3693 } 3694 3695 static struct seq_operations md_seq_ops = { 3696 .start = md_seq_start, 3697 .next = md_seq_next, 3698 .stop = md_seq_stop, 3699 .show = md_seq_show, 3700 }; 3701 3702 static int md_seq_open(struct inode *inode, struct file *file) 3703 { 3704 int error; 3705 3706 error = seq_open(file, &md_seq_ops); 3707 return error; 3708 } 3709 3710 static struct file_operations md_seq_fops = { 3711 .open = md_seq_open, 3712 .read = seq_read, 3713 .llseek = seq_lseek, 3714 .release = seq_release, 3715 }; 3716 3717 int register_md_personality(int pnum, mdk_personality_t *p) 3718 { 3719 if (pnum >= MAX_PERSONALITY) { 3720 printk(KERN_ERR 3721 "md: tried to install personality %s as nr %d, but max is %lu\n", 3722 p->name, pnum, MAX_PERSONALITY-1); 3723 return -EINVAL; 3724 } 3725 3726 spin_lock(&pers_lock); 3727 if (pers[pnum]) { 3728 spin_unlock(&pers_lock); 3729 return -EBUSY; 3730 } 3731 3732 pers[pnum] = p; 3733 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3734 spin_unlock(&pers_lock); 3735 return 0; 3736 } 3737 3738 int unregister_md_personality(int pnum) 3739 { 3740 if (pnum >= MAX_PERSONALITY) 3741 return -EINVAL; 3742 3743 
printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3744 spin_lock(&pers_lock); 3745 pers[pnum] = NULL; 3746 spin_unlock(&pers_lock); 3747 return 0; 3748 } 3749 3750 static int is_mddev_idle(mddev_t *mddev) 3751 { 3752 mdk_rdev_t * rdev; 3753 struct list_head *tmp; 3754 int idle; 3755 unsigned long curr_events; 3756 3757 idle = 1; 3758 ITERATE_RDEV(mddev,rdev,tmp) { 3759 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3760 curr_events = disk_stat_read(disk, sectors[0]) + 3761 disk_stat_read(disk, sectors[1]) - 3762 atomic_read(&disk->sync_io); 3763 /* Allow some slack between valud of curr_events and last_events, 3764 * as there are some uninteresting races. 3765 * Note: the following is an unsigned comparison. 3766 */ 3767 if ((curr_events - rdev->last_events + 32) > 64) { 3768 rdev->last_events = curr_events; 3769 idle = 0; 3770 } 3771 } 3772 return idle; 3773 } 3774 3775 void md_done_sync(mddev_t *mddev, int blocks, int ok) 3776 { 3777 /* another "blocks" (512byte) blocks have been synced */ 3778 atomic_sub(blocks, &mddev->recovery_active); 3779 wake_up(&mddev->recovery_wait); 3780 if (!ok) { 3781 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3782 md_wakeup_thread(mddev->thread); 3783 // stop recovery, signal do_sync .... 3784 } 3785 } 3786 3787 3788 /* md_write_start(mddev, bi) 3789 * If we need to update some array metadata (e.g. 'active' flag 3790 * in superblock) before writing, schedule a superblock update 3791 * and wait for it to complete. 
*/
void md_write_start(mddev_t *mddev, struct bio *bi)
{
	/* only writes are accounted; everything else returns immediately */
	if (bio_data_dir(bi) != WRITE)
		return;

	atomic_inc(&mddev->writes_pending);
	/* unlocked test first, repeated under write_lock before changing state */
	if (mddev->in_sync) {
		spin_lock(&mddev->write_lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			mddev->sb_dirty = 1;
			md_wakeup_thread(mddev->thread);
		}
		spin_unlock(&mddev->write_lock);
	}
	/* block until sb_dirty has been cleared again */
	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
}

/*
 * Counterpart of md_write_start(): drop one pending write.  When the
 * count reaches zero, either wake the array thread (safemode == 2) or
 * re-arm safemode_timer to expire safemode_delay jiffies from now.
 */
void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}

/* md_do_sync() instances sleep here while waiting for a conflicting resync */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

/* number of (time, sector count) samples kept, and the minimum age of a sample step */
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
/*
 * Body of the resync thread (passed to md_register_thread() by
 * md_check_recovery()).
 *
 * Phases, in order:
 *  - negotiate with other arrays that match_mddev_units() reports as
 *    conflicting, using curr_resync values 1 and 2 and resync_wait;
 *  - pick the range to cover (max_sectors) and the starting sector j;
 *  - repeatedly call the personality's sync_request(), throttling
 *    against sysctl_speed_limit_min / sysctl_speed_limit_max and
 *    is_mddev_idle();
 *  - wait for outstanding IO, tell the personality we are done, update
 *    recovery_cp, then set MD_RECOVERY_DONE and wake the array thread.
 */
static void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors;
	unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		mddev->curr_resync = 2;

	try_again:
		if (signal_pending(current) ||
		    kthread_should_stop()) {
			flush_signals(current);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto skip;
		}
		ITERATE_MDDEV(mddev2,tmp) {
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync &&
			    match_mddev_units(mddev,mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!signal_pending(current) &&
				    !kthread_should_stop() &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					printk(KERN_INFO "md: delaying resync of %s"
					       " until %s has finished resync (they"
					       " share one or more physical units)\n",
					       mdname(mddev), mdname(mddev2));
					/* leaving the iteration via goto, so drop mddev2
					 * here before sleeping
					 */
					mddev_put(mddev2);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		mddev->resync_mismatches = 0;
	} else
		/* recovery follows the physical size of devices */
		max_sectors = mddev->size << 1;

	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	/* we don't use the checkpoint if there's a bitmap */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		j = mddev->recovery_cp;
	else
		j = 0;
	io_sectors = 0;
	/* every sample starts out as "now, nothing done yet" */
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
		window/2,(unsigned long long) max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;

	if (j>2) {
		printk(KERN_INFO
			"md: resuming recovery of %s from checkpoint.\n",
			mdname(mddev));
		mddev->curr_resync = j;
	}

	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;
		/* last argument is non-zero while we are below the minimum speed */
		sectors = mddev->pers->sync_request(mddev, j, &skipped,
					    currspeed < sysctl_speed_limit_min);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
			goto out;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		j += sectors;
		/* never store 0 or 1 here: those values carry special meaning (see above) */
		if (j>1) mddev->curr_resync = j;


		/* only run the checks below once per "window" sectors of real IO */
		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			/* publish the sample about to be overwritten, then
			 * record a fresh one in its slot
			 */
			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (signal_pending(current) || kthread_should_stop()) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
				"md: md_do_sync() got signal ... exiting\n");
			flush_signals(current);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		/* halved sectors per second since resync_mark; the +1 terms
		 * keep both the divisor and the result non-zero
		 */
		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				msleep_interruptible(250);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	mddev->queue->unplug_fn(mddev->queue);

	/* md_done_sync() decrements recovery_active and wakes recovery_wait */
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync >= mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO
				"md: checkpointing recovery of %s.\n",
				mdname(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

 skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}


/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	if (mddev->bitmap)
		bitmap_daemon_work(mddev->bitmap);

	if (mddev->ro)
		return;

	if (signal_pending(current)) {
		if (mddev->pers->sync_request) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	/* cheap unlocked test: return early unless one of these needs attention */
	if ( ! (
		mddev->sb_dirty ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->safemode == 1) ||
		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	/* if the array lock is contended, skip this round entirely */
	if (mddev_trylock(mddev)==0) {
		int spares =0;

		spin_lock(&mddev->write_lock);
		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
			mddev->in_sync = 1;
			mddev->sb_dirty = 1;
		}
		if (mddev->safemode == 1)
			mddev->safemode = 0;
		spin_unlock(&mddev->write_lock);

		if (mddev->sb_dirty)
			md_update_sb(mddev);


		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev);

			/* if array is no-longer degraded, then any saved_raid_disk
			 * information must be scrapped
			 */
			if (!mddev->degraded)
				ITERATE_RDEV(mddev,rdev,rtmp)
					rdev->saved_raid_disk = -1;

			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    (rdev->faulty || ! rdev->in_sync) &&
			    atomic_read(&rdev->nr_pending)==0) {
				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
					/* drop the "rd<N>" sysfs link before forgetting the slot */
					char nm[20];
					sprintf(nm,"rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}

		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !rdev->faulty) {
					/* a non-zero return is counted as an added spare;
					 * the first zero return ends the scan
					 */
					if (mddev->pers->hot_add_disk(mddev,rdev)) {
						char nm[20];
						sprintf(nm, "rd%d", rdev->raid_disk);
						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
						spares++;
					} else
						break;
				}
		}

		if (spares) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto unlock;

		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"%s_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
					" thread...\n",
					mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else {
				md_wakeup_thread(mddev->sync_thread);
			}
		}
	unlock:
		mddev_unlock(mddev);
	}
}

/*
 * Reboot notifier callback.  On SYS_DOWN, SYS_HALT or SYS_POWER_OFF it
 * calls do_md_stop(mddev, 1) on every array whose lock can be taken
 * with mddev_trylock(), then delays for one second.  Always returns
 * NOTIFY_DONE.
 */
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
		mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

/* registered in md_init(), unregistered in md_exit() */
static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

/*
 * Create the world-readable proc entry "mdstat" and, if creation
 * succeeded, hook it up to md_seq_fops.
 */
static void md_geninit(void)
{
	struct proc_dir_entry *p;

	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	p = create_proc_entry("mdstat", S_IRUGO, NULL);
	if (p)
		p->proc_fops = &md_seq_fops;
}

/*
 * Module initialisation: register the "md" and "mdp" block majors,
 * their probe regions and devfs nodes, the reboot notifier, the sysctl
 * table and the proc entry.
 *
 * Returns 0 on success, or -1 when either register_blkdev() call fails
 * (the "md" major is released again if "mdp" cannot be obtained).
 * NOTE(review): -1 is returned rather than the error from
 * register_blkdev(); the other registration calls are not checked.
 */
static int __init md_init(void)
{
	int minor;

	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
			" MD_SB_DISKS=%d\n",
			MD_MAJOR_VERSION, MD_MINOR_VERSION,
			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
			BITMAP_MINOR);

	if (register_blkdev(MAJOR_NR, "md"))
		return -1;
	/* major 0 asks for a dynamically assigned number, kept in mdp_major */
	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
		unregister_blkdev(MAJOR_NR, "md");
		return -1;
	}
	devfs_mk_dir("md");
	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
				md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
			    md_probe, NULL, NULL);

	for (minor=0; minor < MAX_MD_DEVS; ++minor)
		devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
				S_IFBLK|S_IRUSR|S_IWUSR,
				"md/%d", minor);

	/* NOTE(review): these nodes are named "md/mdp%d", while md_exit()
	 * removes "md/d%d" - confirm which name is intended
	 */
	for (minor=0; minor < MAX_MD_DEVS; ++minor)
		devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
			      S_IFBLK|S_IRUSR|S_IWUSR,
			      "md/mdp%d", minor);


	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return (0);
}


#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
4324 */ 4325 static dev_t detected_devices[128]; 4326 static int dev_cnt; 4327 4328 void md_autodetect_dev(dev_t dev) 4329 { 4330 if (dev_cnt >= 0 && dev_cnt < 127) 4331 detected_devices[dev_cnt++] = dev; 4332 } 4333 4334 4335 static void autostart_arrays(int part) 4336 { 4337 mdk_rdev_t *rdev; 4338 int i; 4339 4340 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4341 4342 for (i = 0; i < dev_cnt; i++) { 4343 dev_t dev = detected_devices[i]; 4344 4345 rdev = md_import_device(dev,0, 0); 4346 if (IS_ERR(rdev)) 4347 continue; 4348 4349 if (rdev->faulty) { 4350 MD_BUG(); 4351 continue; 4352 } 4353 list_add(&rdev->same_set, &pending_raid_disks); 4354 } 4355 dev_cnt = 0; 4356 4357 autorun_devices(part); 4358 } 4359 4360 #endif 4361 4362 static __exit void md_exit(void) 4363 { 4364 mddev_t *mddev; 4365 struct list_head *tmp; 4366 int i; 4367 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4368 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4369 for (i=0; i < MAX_MD_DEVS; i++) 4370 devfs_remove("md/%d", i); 4371 for (i=0; i < MAX_MD_DEVS; i++) 4372 devfs_remove("md/d%d", i); 4373 4374 devfs_remove("md"); 4375 4376 unregister_blkdev(MAJOR_NR,"md"); 4377 unregister_blkdev(mdp_major, "mdp"); 4378 unregister_reboot_notifier(&md_notifier); 4379 unregister_sysctl_table(raid_table_header); 4380 remove_proc_entry("mdstat", NULL); 4381 ITERATE_MDDEV(mddev,tmp) { 4382 struct gendisk *disk = mddev->gendisk; 4383 if (!disk) 4384 continue; 4385 export_array(mddev); 4386 del_gendisk(disk); 4387 put_disk(disk); 4388 mddev->gendisk = NULL; 4389 mddev_put(mddev); 4390 } 4391 } 4392 4393 module_init(md_init) 4394 module_exit(md_exit) 4395 4396 EXPORT_SYMBOL(register_md_personality); 4397 EXPORT_SYMBOL(unregister_md_personality); 4398 EXPORT_SYMBOL(md_error); 4399 EXPORT_SYMBOL(md_done_sync); 4400 EXPORT_SYMBOL(md_write_start); 4401 EXPORT_SYMBOL(md_write_end); 4402 EXPORT_SYMBOL(md_register_thread); 4403 EXPORT_SYMBOL(md_unregister_thread); 4404 
EXPORT_SYMBOL(md_wakeup_thread); 4405 EXPORT_SYMBOL(md_print_devices); 4406 EXPORT_SYMBOL(md_check_recovery); 4407 MODULE_LICENSE("GPL"); 4408 MODULE_ALIAS("md"); 4409 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4410