/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>

#include <linux/init.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */
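/*
 * Illustrative only (not part of the driver): with the sysctl table
 * registered below, the limits can be inspected and tuned from user
 * space, e.g.:
 *
 *	# cat /proc/sys/dev/raid/speed_limit_min
 *	1000
 *	# echo 50000 > /proc/sys/dev/raid/speed_limit_min
 *
 * Values are in KB/sec, as described in the comment above.
 */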
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

/*
 * Enables iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while still holding
 * a reference to the current mddev must drop it with mddev_put().
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)


static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio, bio->bi_size);
	return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
		list_del(&mddev->all_mddevs);
		blk_put_queue(mddev->queue);
		kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
}
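/*
 * Usage sketch (illustrative): md_print_devices() below iterates with
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev,tmp) {
 *		... use mddev; it is pinned by mddev_get() here ...
 *	}
 *
 * The macro drops the reference on each normal step, so only code that
 * breaks out of the loop early needs an explicit mddev_put(mddev).
 */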
static mddev_t * mddev_find(dev_t unit)
{
	mddev_t *mddev, *new = NULL;

retry:
	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit) {
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			if (new)
				kfree(new);
			return mddev;
		}

	if (new) {
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(*new));

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	init_MUTEX(&new->reconfig_sem);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
	atomic_set(&new->active, 1);

	new->queue = blk_alloc_queue(GFP_KERNEL);
	if (!new->queue) {
		kfree(new);
		return NULL;
	}

	blk_queue_make_request(new->queue, md_fail_request);

	goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
	return down_interruptible(&mddev->reconfig_sem);
}

static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
	down(&mddev->reconfig_sem);
}

static inline int mddev_trylock(mddev_t * mddev)
{
	return down_trylock(&mddev->reconfig_sem);
}

static inline void mddev_unlock(mddev_t * mddev)
{
	up(&mddev->reconfig_sem);

	if (mddev->thread)
		md_wakeup_thread(mddev->thread);
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->bdev->bd_dev == dev)
			return rdev;
	}
	return NULL;
}

static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
	sector_t size;

	size = rdev->sb_offset;

	if (chunk_size)
		size &= ~((sector_t)chunk_size/1024 - 1);
	return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -EINVAL;
	}

	return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page) {
		page_cache_release(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	}
}


static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;

	complete((struct completion*)bio->bi_private);
	return 0;
}

static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
		   struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	struct completion event;
	int ret;

	rw |= (1 << BIO_RW_SYNC);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}

static int read_disk_sb(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->sb_page) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->sb_loaded)
		return 0;


	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
		bdevname(rdev->bdev,b));
	return -EINVAL;
}
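/*
 * Worked example (illustrative; the exact constants live in md_p.h,
 * where MD_NEW_SIZE_BLOCKS() reserves 64KB at the end of the device):
 * for a component device of 10000 1KB blocks, calc_dev_sboffset()
 * yields (10000 & ~63) - 64 = 9920, i.e. the 0.90 superblock sits
 * 64KB-aligned in the last 64KB..128KB of the device.  calc_dev_size()
 * then rounds the usable size down to a whole number of chunks, e.g.
 * with a 64KB chunk: 9920 & ~(64 - 1) = 9920 blocks.
 */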
static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
		(sb1->set_uuid1 == sb2->set_uuid1) &&
		(sb1->set_uuid2 == sb2->set_uuid2) &&
		(sb1->set_uuid3 == sb2->set_uuid3))

		return 1;

	return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}


/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *      The first time, mddev->raid_disks will be 0, and data from
 *      dev should be merged in.  Subsequent calls check that dev
 *      is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type  {
	char		*name;
	struct module	*owner;
	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
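/*
 * Usage sketch (illustrative): callers never invoke a handler directly
 * by name; they dispatch through super_types[] using the version stored
 * in the mddev, e.g.:
 *
 *	err = super_types[mddev->major_version].
 *		load_super(rdev, refdev, mddev->minor_version);
 *
 * which is exactly how analyze_sbs() and sync_sbs() below select
 * between the 0.90.0 and version-1 formats.
 */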
/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;
	sector_t sb_offset;

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	sb_offset = calc_dev_sboffset(rdev->bdev);
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = (mdp_super_t*)page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version != 90) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (refdev == 0)
		ret = 1;
	else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->size = calc_dev_size(rdev, sb->chunk_size);

abort:
	return ret;
}
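/*
 * Freshness is decided by a 64-bit event counter, assembled from the
 * two 32-bit halves stored in the 0.90 superblock (md_event() in the
 * md headers performs this combination):
 *
 *	ev = ((__u64)sb->events_hi << 32) | sb->events_lo;
 *
 * Illustrative example: if sb carries ev1 = 42 and refsb carries
 * ev2 = 41, the load above returns 1 and the caller adopts sb's device
 * as the new reference ("freshest") superblock.
 */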
/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->persistent = ! sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = md_event(sb);

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;
	} else {
		__u64 ev1;
		ev1 = md_event(sb);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		rdev->raid_disk = -1;
		rdev->in_sync = rdev->faulty = 0;
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			rdev->faulty = 1;
		else if (desc->state & (1<<MD_DISK_SYNC) &&
			 desc->raid_disk < mddev->raid_disks) {
			rdev->in_sync = 1;
			rdev->raid_disk = desc->raid_disk;
		}
	}
	return 0;
}
/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->minor_version = mddev->minor_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size  = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			rdev2->desc_nr = rdev2->raid_disk;
		else
			rdev2->desc_nr = next_spare++;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (rdev2->faulty) {
			d->state = (1<<MD_DISK_FAULTY);
			failed++;
		} else if (rdev2->in_sync) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
	}

	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	unsigned int disk_csum, csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	unsigned int *isuper = (unsigned int*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(unsigned short*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}
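/*
 * Worked example (illustrative): the version-1 superblock is
 * checksummed over its 256-byte fixed part plus 2 bytes of role table
 * per possible device.  With max_dev = 384, size = 256 + 384*2 = 1024
 * bytes, so the loop above folds 256 32-bit little-endian words (plus
 * a trailing 16-bit word when size is not a multiple of 4) into a
 * 64-bit sum, then carries the top half back into 32 bits.
 */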
static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_offset;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];

	/*
	 * Calculate the position of the superblock.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
		sb_offset -= 8*2;
		sb_offset &= ~(4*2-1);
		/* convert from sectors to K */
		sb_offset /= 2;
		break;
	case 1:
		sb_offset = 0;
		break;
	case 2:
		sb_offset = 4;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev);
	if (ret) return ret;


	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
	    sb->feature_map != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);

	if (refdev == 0)
		return 1;
	else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb =
			(struct mdp_superblock_1*)page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			return 1;
	}
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
	return 0;
}
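/*
 * Worked example for minor_version 0 (illustrative): on a component
 * device of 8,000,000 512-byte sectors,
 *
 *	sb_offset  = 8000000 - 16 = 7999984 sectors
 *	sb_offset &= ~7           -> 7999984 (already 4K aligned)
 *	sb_offset /= 2            -> 3999992 KB from the start
 *
 * i.e. the superblock lands 8KB from the end of the device, 4K aligned,
 * which satisfies the "at least 8K but less than 12K from the end" rule
 * stated above.
 */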
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = le64_to_cpu(sb->size)/2;
		mddev->events = le64_to_cpu(sb->events);

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;
	} else {
		__u64 ev1;
		ev1 = le64_to_cpu(sb->events);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		rdev->desc_nr = le32_to_cpu(sb->dev_number);
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			rdev->in_sync = 0;
			rdev->faulty = 0;
			rdev->raid_disk = -1;
			break;
		case 0xfffe: /* faulty */
			rdev->in_sync = 0;
			rdev->faulty = 1;
			rdev->raid_disk = -1;
			break;
		default:
			rdev->in_sync = 1;
			rdev->faulty = 0;
			rdev->raid_disk = role;
			break;
		}
	}
	return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	memset(sb->pad1, 0, sizeof(sb->pad1));
	memset(sb->pad2, 0, sizeof(sb->pad2));
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else
		sb->resync_offset = cpu_to_le64(0);

	max_dev = 0;
	ITERATE_RDEV(mddev,rdev2,tmp)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	sb->max_dev = cpu_to_le32(max_dev);
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(0xfffe);

	ITERATE_RDEV(mddev,rdev2,tmp) {
		i = rdev2->desc_nr;
		if (rdev2->faulty)
			sb->dev_roles[i] = cpu_to_le16(0xfffe);
		else if (rdev2->in_sync)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(0xffff);
	}

	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
	sb->sb_csum = calc_sb_1_csum(sb);
}


struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	= super_90_load,
		.validate_super	= super_90_validate,
		.sync_super	= super_90_sync,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	= super_1_load,
		.validate_super	= super_1_validate,
		.sync_super	= super_1_sync,
	},
};

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev))
			return 1;

	return 0;
}

static LIST_HEAD(pending_raid_disks);
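/*
 * Role encoding used by the version-1 handlers above (illustrative
 * summary): dev_roles[desc_nr] holds, little-endian,
 *
 *	0xffff -> spare  (in_sync = 0, raid_disk = -1)
 *	0xfffe -> faulty (faulty = 1,  raid_disk = -1)
 *	     n -> active member occupying raid slot n
 *
 * super_1_sync() writes the table in exactly this form, so validate and
 * sync remain inverse operations of each other.
 */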
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}
	same_pdev = match_dev_unit(mddev, rdev);
	if (same_pdev)
		printk(KERN_WARNING
			"%s: WARNING: %s appears to be on the same physical"
			" disk as %s. True protection against single-disk"
			" failure might be compromised.\n",
			mdname(mddev), bdevname(rdev->bdev,b),
			bdevname(same_pdev->bdev,b2));

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}

	list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
	return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	list_del_init(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
	if (IS_ERR(bdev)) {
		printk(KERN_ERR "md: could not open %s.\n",
			__bdevname(dev, b));
		return PTR_ERR(bdev);
	}
	err = bd_claim(bdev, rdev);
	if (err) {
		printk(KERN_ERR "md: could not bd_claim %s.\n",
			bdevname(bdev, b));
		blkdev_put(bdev);
		return err;
	}
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	if (!bdev)
		MD_BUG();
	bd_release(bdev);
	blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_INFO "md: export_rdev(%s)\n",
		bdevname(rdev->bdev,b));
	if (rdev->mddev)
		MD_BUG();
	free_disk_sb(rdev);
	list_del_init(&rdev->same_set);
#ifndef MODULE
	md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kfree(rdev);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (!list_empty(&mddev->disks))
		MD_BUG();
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}
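/*
 * Lifecycle sketch (illustrative): an rdev normally travels
 *
 *	md_import_device() -> bind_rdev_to_array() -> ... in use ...
 *
 * and leaves via kick_rdev_from_array(), which is simply
 * unbind_rdev_from_array() followed by export_rdev().  export_rdev()
 * releases the bd_claim taken in lock_rdev() and frees the rdev
 * itself, so callers must not touch the rdev afterwards.
 */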
static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
	int i;

	printk(KERN_INFO
		"md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
		sb->md_minor, sb->layout, sb->chunk_size);
	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk(" D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md: THIS: ");
	print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
		rdev->faulty, rdev->in_sync, rdev->desc_nr);
	if (rdev->sb_loaded) {
		printk(KERN_INFO "md: rdev superblock:\n");
		print_sb((mdp_super_t*)page_address(rdev->sb_page));
	} else
		printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
	struct list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;
	char b[BDEVNAME_SIZE];

	printk("\n");
	printk("md: **********************************\n");
	printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md: **********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("%s: ", mdname(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", bdevname(rdev->bdev,b));
		printk("\n");

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("md: **********************************\n");
	printk("\n");
}


static int write_disk_sb(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->sb_loaded) {
		MD_BUG();
		return 1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return 1;
	}

	dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
		bdevname(rdev->bdev,b),
		(unsigned long long)rdev->sb_offset);

	if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
		return 0;

	printk("md: write_disk_sb failed for device %s\n",
		bdevname(rdev->bdev,b));
	return 1;
}

static void sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		super_types[mddev->major_version].
			sync_super(mddev, rdev);
		rdev->sb_loaded = 1;
	}
}
static void md_update_sb(mddev_t * mddev)
{
	int err, count = 100;
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	mddev->sb_dirty = 0;
repeat:
	mddev->utime = get_seconds();
	mddev->events ++;

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.D., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent)
		return;

	dprintk(KERN_INFO
		"md: updating %s RAID superblock on device (in sync %d)\n",
		mdname(mddev),mddev->in_sync);

	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		dprintk(KERN_INFO "md: ");
		if (rdev->faulty)
			dprintk("(skipping faulty ");

		dprintk("%s ", bdevname(rdev->bdev,b));
		if (!rdev->faulty) {
			err += write_disk_sb(rdev);
		} else
			dprintk(")\n");
		if (!err && mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (err) {
		if (--count) {
			printk(KERN_ERR "md: errors occurred during superblock"
				" update, repeating\n");
			goto repeat;
		}
		printk(KERN_ERR \
			"md: excessive errors occurred during superblock update, exiting\n");
	}
}

/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	char b[BDEVNAME_SIZE];
	int err;
	mdk_rdev_t *rdev;
	sector_t size;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for new device!\n");
		return ERR_PTR(-ENOMEM);
	}
	memset(rdev, 0, sizeof(*rdev));

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	err = lock_rdev(rdev, newdev);
	if (err)
		goto abort_free;

	rdev->desc_nr = -1;
	rdev->faulty = 0;
	rdev->in_sync = 0;
	rdev->data_offset = 0;
	atomic_set(&rdev->nr_pending, 0);

	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	if (!size) {
		printk(KERN_WARNING
			"md: %s has zero or unknown size, marking faulty!\n",
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			printk(KERN_WARNING
				"md: %s has invalid sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
		if (err < 0) {
			printk(KERN_WARNING
				"md: could not read %s's sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}
	INIT_LIST_HEAD(&rdev->same_set);

	return rdev;

abort_free:
	if (rdev->sb_page) {
		if (rdev->bdev)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
	return ERR_PTR(err);
}
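/*
 * Usage sketch (illustrative): md_import_device() returns an ERR_PTR()
 * value rather than NULL on failure, so callers follow the pattern used
 * by add_new_disk() and autostart_array() below:
 *
 *	rdev = md_import_device(dev, 0, 0);
 *	if (IS_ERR(rdev))
 *		return PTR_ERR(rdev);
 *	if (bind_rdev_to_array(rdev, mddev))
 *		export_rdev(rdev);
 */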
/*
 * Check a full RAID array for plausibility
 */


static void analyze_sbs(mddev_t * mddev)
{
	int i;
	struct list_head *tmp;
	mdk_rdev_t *rdev, *freshest;
	char b[BDEVNAME_SIZE];

	freshest = NULL;
	ITERATE_RDEV(mddev,rdev,tmp)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			printk( KERN_ERR \
				"md: fatal superblock inconsistency in %s"
				" -- removing from array\n",
				bdevname(rdev->bdev,b));
			kick_rdev_from_array(rdev);
		}


	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev != freshest)
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
					" from array!\n",
					bdevname(rdev->bdev,b));
				kick_rdev_from_array(rdev);
				continue;
			}
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			rdev->in_sync = 1;
		}
	}



	if (mddev->recovery_cp != MaxSector &&
	    mddev->level >= 1)
		printk(KERN_ERR "md: %s: raid array is not clean"
		       " -- starting background reconstruction\n",
		       mdname(mddev));

}

int mdp_major = 0;

static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
	static DECLARE_MUTEX(disks_sem);
	mddev_t *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned = (MAJOR(dev) != MD_MAJOR);
	int shift = partitioned ? MdpMinorShift : 0;
	int unit = MINOR(dev) >> shift;

	if (!mddev)
		return NULL;

	down(&disks_sem);
	if (mddev->gendisk) {
		up(&disks_sem);
		mddev_put(mddev);
		return NULL;
	}
	disk = alloc_disk(1 << shift);
	if (!disk) {
		up(&disks_sem);
		mddev_put(mddev);
		return NULL;
	}
	disk->major = MAJOR(dev);
	disk->first_minor = unit << shift;
	if (partitioned) {
		sprintf(disk->disk_name, "md_d%d", unit);
		sprintf(disk->devfs_name, "md/d%d", unit);
	} else {
		sprintf(disk->disk_name, "md%d", unit);
		sprintf(disk->devfs_name, "md/%d", unit);
	}
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
	add_disk(disk);
	mddev->gendisk = disk;
	up(&disks_sem);
	return NULL;
}

void md_wakeup_thread(mdk_thread_t *thread);

static void md_safemode_timeout(unsigned long data)
{
	mddev_t *mddev = (mddev_t *) data;

	mddev->safemode = 1;
	md_wakeup_thread(mddev->thread);
}
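/*
 * Minor-number arithmetic (illustrative): with MdpMinorShift == 6, a
 * partitionable "mdp" device spends the low 6 minor bits on partitions,
 * so minor 130 under mdp_major means unit = 130 >> 6 = 2 and partition
 * 130 & 63 = 2; md_probe() above allocates the gendisk with
 * 1 << 6 = 64 minors to cover the whole unit.
 */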
static int do_md_run(mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct list_head *tmp;
	mdk_rdev_t *rdev;
	struct gendisk *disk;
	char b[BDEVNAME_SIZE];

	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (!mddev->raid_disks)
		analyze_sbs(mddev);

	chunk_size = mddev->chunk_size;
	pnum = level_to_pers(mddev->level);

	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
		if (!chunk_size) {
			/*
			 * 'default chunksize' in the old md code used to
			 * be PAGE_SIZE, baaad.
			 * we abort here to be on the safe side. We don't
			 * want to continue the bad practice.
			 */
			printk(KERN_ERR
				"no chunksize specified, see 'man raidtab'\n");
			return -EINVAL;
		}
		if (chunk_size > MAX_CHUNK_SIZE) {
			printk(KERN_ERR "too big chunk_size: %d > %d\n",
				chunk_size, MAX_CHUNK_SIZE);
			return -EINVAL;
		}
		/*
		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
		 */
		if ( (1 << ffz(~chunk_size)) != chunk_size) {
			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
			return -EINVAL;
		}
		if (chunk_size < PAGE_SIZE) {
			printk(KERN_ERR "too small chunk_size: %d < %ld\n",
				chunk_size, PAGE_SIZE);
			return -EINVAL;
		}

		/* devices must have minimum size of one chunk */
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->faulty)
				continue;
			if (rdev->size < chunk_size / 1024) {
				printk(KERN_WARNING
					"md: Dev %s smaller than chunk_size:"
					" %lluk < %dk\n",
					bdevname(rdev->bdev,b),
					(unsigned long long)rdev->size,
					chunk_size / 1024);
				return -EINVAL;
			}
		}
	}
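	/*
	 * Power-of-two check above, worked through (illustrative):
	 * ffz(~x) finds the first zero bit of ~x, i.e. the lowest set
	 * bit of x.  For chunk_size = 65536, 1 << ffz(~65536) == 65536
	 * and the check passes; for 65536+4096 the lowest set bit is
	 * 4096, so 1 << ffz(~x) == 4096 != x and the value is rejected.
	 */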
#ifdef CONFIG_KMOD
	if (!pers[pnum])
	{
		request_module("md-personality-%d", pnum);
	}
#endif

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * Also find largest hardsector size
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev, 0);
	}

	md_probe(mddev->unit, NULL, NULL);
	disk = mddev->gendisk;
	if (!disk)
		return -ENOMEM;

	spin_lock(&pers_lock);
	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
		spin_unlock(&pers_lock);
		printk(KERN_WARNING "md: personality %d is not loaded!\n",
		       pnum);
		return -EINVAL;
	}

	mddev->pers = pers[pnum];
	spin_unlock(&pers_lock);

	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */

	err = mddev->pers->run(mddev);
	if (err) {
		printk(KERN_ERR "md: pers->run() failed ...\n");
		module_put(mddev->pers->owner);
		mddev->pers = NULL;
		return -EINVAL;
	}
	atomic_set(&mddev->writes_pending,0);
	mddev->safemode = 0;
	mddev->safemode_timer.function = md_safemode_timeout;
	mddev->safemode_timer.data = (unsigned long) mddev;
	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
	mddev->in_sync = 1;

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->sb_dirty)
		md_update_sb(mddev);

	set_capacity(disk, mddev->array_size<<1);

	/* If we call blk_queue_make_request here, it will
	 * re-initialise max_sectors etc which may have been
	 * refined inside -> run.  So just set the bits we need to set.
	 * Most initialisation happened when we called
	 * blk_queue_make_request(..., md_fail_request)
	 * earlier.
	 */
	mddev->queue->queuedata = mddev;
	mddev->queue->make_request_fn = mddev->pers->make_request;

	mddev->changed = 1;
	return 0;
}

static int restart_array(mddev_t *mddev)
{
	struct gendisk *disk = mddev->gendisk;
	int err;

	/*
	 * Complain if it has no devices
	 */
	err = -ENXIO;
	if (list_empty(&mddev->disks))
		goto out;

	if (mddev->pers) {
		err = -EBUSY;
		if (!mddev->ro)
			goto out;

		mddev->safemode = 0;
		mddev->ro = 0;
		set_disk_ro(disk, 0);

		printk(KERN_INFO "md: %s switched to read-write mode.\n",
			mdname(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		err = 0;
	} else {
		printk(KERN_ERR "md: %s has no personality assigned.\n",
			mdname(mddev));
		err = -EINVAL;
	}

out:
	return err;
}

static int do_md_stop(mddev_t * mddev, int ro)
{
	int err = 0;
	struct gendisk *disk = mddev->gendisk;

	if (mddev->pers) {
		if (atomic_read(&mddev->active)>2) {
			printk("md: %s still in use.\n",mdname(mddev));
			return -EBUSY;
		}

		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
		}

		del_timer_sync(&mddev->safemode_timer);

		invalidate_partition(disk, 0);

		if (ro) {
			err = -ENXIO;
			if (mddev->ro)
				goto out;
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_disk_ro(disk, 0);
			blk_queue_make_request(mddev->queue, md_fail_request);
			mddev->pers->stop(mddev);
			module_put(mddev->pers->owner);
			mddev->pers = NULL;
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (!mddev->in_sync) {
			/* mark array as shutdown cleanly */
			mddev->in_sync = 1;
			md_update_sb(mddev);
		}
		if (ro)
			set_disk_ro(disk, 1);
	}
	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		struct gendisk *disk;
		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

		export_array(mddev);

		mddev->array_size = 0;
		disk = mddev->gendisk;
		if (disk)
			set_capacity(disk, 0);
		mddev->changed = 1;
	} else
		printk(KERN_INFO "md: %s switched to read-only mode.\n",
			mdname(mddev));
	err = 0;
out:
	return err;
}

static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int err;

	if (list_empty(&mddev->disks))
		return;

	printk(KERN_INFO "md: running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		printk("<%s>", bdevname(rdev->bdev,b));
	}
	printk("\n");

	err = do_md_run (mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		do_md_stop (mddev, 0);
	}
}
/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
	struct list_head candidates;
	struct list_head *tmp;
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;
	char b[BDEVNAME_SIZE];

	printk(KERN_INFO "md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		dev_t dev;
		rdev0 = list_entry(pending_raid_disks.next,
					 mdk_rdev_t, same_set);

		printk(KERN_INFO "md: considering %s ...\n",
			bdevname(rdev0->bdev,b));
		INIT_LIST_HEAD(&candidates);
		ITERATE_RDEV_PENDING(rdev,tmp)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				printk(KERN_INFO "md: adding %s ...\n",
					bdevname(rdev->bdev,b));
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
			break;
		}
		if (part)
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
		else
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
		if (!mddev) {
			printk(KERN_ERR
				"md: cannot allocate memory for md drive.\n");
			break;
		}
		if (mddev_lock(mddev))
			printk(KERN_WARNING "md: %s locked, cannot run\n",
			       mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			printk(KERN_WARNING
				"md: %s already running, cannot run %s\n",
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
			printk(KERN_INFO "md: created %s\n", mdname(mddev));
			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
		ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
			export_rdev(rdev);
		mddev_put(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}

/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
 */

static int autostart_array(dev_t startdev)
{
	char b[BDEVNAME_SIZE];
	int err = -EINVAL, i;
	mdp_super_t *sb = NULL;
	mdk_rdev_t *start_rdev = NULL, *rdev;

	start_rdev = md_import_device(startdev, 0, 0);
	if (IS_ERR(start_rdev))
		return err;


	/* NOTE: this can only work for 0.90.0 superblocks */
	sb = (mdp_super_t*)page_address(start_rdev->sb_page);
	if (sb->major_version != 0 ||
	    sb->minor_version != 90 ) {
		printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
		export_rdev(start_rdev);
		return err;
	}

	if (start_rdev->faulty) {
		printk(KERN_WARNING
			"md: can not autostart based on faulty %s!\n",
			bdevname(start_rdev->bdev,b));
		export_rdev(start_rdev);
		return err;
	}
	list_add(&start_rdev->same_set, &pending_raid_disks);

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc = sb->disks + i;
		dev_t dev = MKDEV(desc->major, desc->minor);

		if (!dev)
			continue;
		if (dev == startdev)
			continue;
		if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
			continue;
		rdev = md_import_device(dev, 0, 0);
		if (IS_ERR(rdev))
			continue;

		list_add(&rdev->same_set, &pending_raid_disks);
	}

	/*
	 * possibly return codes
	 */
	autorun_devices(0);
	return 0;

}


static int get_version(void __user * arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

static int get_array_info(mddev_t * mddev, void __user * arg)
{
	mdu_array_info_t info;
	int nr,working,active,failed,spare;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	nr=working=active=failed=spare=0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		nr++;
		if (rdev->faulty)
			failed++;
		else {
			working++;
			if (rdev->in_sync)
				active++;
			else
				spare++;
		}
	}

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
	info.ctime         = mddev->ctime;
	info.level         = mddev->level;
	info.size          = mddev->size;
	info.nr_disks      = nr;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->md_minor;
	info.not_persistent= !mddev->persistent;

	info.utime         = mddev->utime;
	info.state         = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
	info.active_disks  = active;
	info.working_disks = working;
	info.failed_disks  = failed;
	info.spare_disks   = spare;

	info.layout        = mddev->layout;
	info.chunk_size    = mddev->chunk_size;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
static int get_disk_info(mddev_t * mddev, void __user * arg)
{
	mdu_disk_info_t info;
	unsigned int nr;
	mdk_rdev_t *rdev;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	nr = info.number;

	rdev = find_rdev_nr(mddev, nr);
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
		if (rdev->faulty)
			info.state |= (1<<MD_DISK_FAULTY);
		else if (rdev->in_sync) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdk_rdev_t *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			int err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"%s: personality does not support diskops!\n",
			       mdname(mddev));
			return -EINVAL;
		}
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->in_sync = 0; /* just to be sure */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		if (mddev->thread)
			md_wakeup_thread(mddev->thread);
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device (dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		rdev->faulty = 0;
		if (rdev->raid_disk < mddev->raid_disks)
			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
		else
			rdev->in_sync = 0;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}

		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
		} else
			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
		if (!mddev->size || (mddev->size > rdev->size))
			mddev->size = rdev->size;
	}

	return 0;
}

static int hot_remove_disk(mddev_t * mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->raid_disk >= 0)
		goto busy;

	kick_rdev_from_array(rdev);
	md_update_sb(mddev);

	return 0;
busy:
	printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
		bdevname(rdev->bdev,b), mdname(mddev));
	return -EBUSY;
}

static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	unsigned int size;
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
			" version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING
			"%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device (dev, -1, 0);
	if (IS_ERR(rdev)) {
		printk(KERN_WARNING
			"md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
	else
		rdev->sb_offset =
			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;

	size = calc_dev_size(rdev, mddev->chunk_size);
	rdev->size = size;

	if (size < mddev->size) {
		printk(KERN_WARNING
			"%s: disk size %llu blocks < array size %llu\n",
			mdname(mddev), (unsigned long long)size,
			(unsigned long long)mddev->size);
		err = -ENOSPC;
		goto abort_export;
	}

	if (rdev->faulty) {
		printk(KERN_WARNING
			"md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	rdev->in_sync = 0;
	rdev->desc_nr = -1;
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	if (rdev->desc_nr == mddev->max_disks) {
		printk(KERN_WARNING "%s: can not hot-add to full array!\n",
			mdname(mddev));
		err = -EBUSY;
		goto abort_unbind_export;
	}

	rdev->raid_disk = -1;

	md_update_sb(mddev);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}
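/*
 * Usage sketch (illustrative, from user space; the exact argument
 * encoding is handled by the ioctl dispatcher, so treat this as an
 * assumption rather than a contract): HOT_ADD_DISK takes the device
 * number of the component device as its argument, e.g.
 *
 *	int fd = open("/dev/md0", O_RDWR);
 *	ioctl(fd, HOT_ADD_DISK, (unsigned long)makedev(8, 16));
 *
 * to hot-add /dev/sdb (8,16) to a running version-0 array; the handler
 * above then imports, sizes and binds the device and kicks the
 * recovery thread so the spare can be activated immediately.
 */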
 * The minor and patch _version numbers are also kept in case the
 * super_block handler wishes to interpret them.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->size = info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = ! info->not_persistent;

	mddev->layout = info->layout;
	mddev->chunk_size = info->chunk_size;

	mddev->max_disks = MD_SB_DISKS;

	mddev->sb_dirty = 1;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	return 0;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout
 * and chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
	    /* mddev->patch_version != info->patch_version || */
	    mddev->ctime != info->ctime ||
	    mddev->level != info->level ||
	    /* mddev->layout != info->layout || */
	    !mddev->persistent != info->not_persistent ||
	    mddev->chunk_size != info->chunk_size)
		return -EINVAL;
	/* Check there is only one change */
	if (mddev->size != info->size) cnt++;
	if (mddev->raid_disks != info->raid_disks) cnt++;
	if (mddev->layout != info->layout) cnt++;
	if (cnt == 0) return 0;
	if (cnt > 1) return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->reconfig == NULL)
			return -EINVAL;
		else
			return mddev->pers->reconfig(mddev, info->layout, -1);
	}
	if (mddev->size != info->size) {
		mdk_rdev_t * rdev;
		struct list_head *tmp;
		if (mddev->pers->resize == NULL)
			return -EINVAL;
		/* The "size" is the amount of each device that is used.
		 * This can only make sense for arrays with redundancy.
		 * linear and raid0 always use whatever space is available.
		 * We can only consider changing the size if no resync
		 * or reconstruction is happening, and if the new size
		 * is acceptable. It must fit before the sb_offset or,
		 * if that is <data_offset, it must fit before the
		 * size of each device.
		 * If size is zero, we find the largest size that fits.
		 */
		if (mddev->sync_thread)
			return -EBUSY;
		ITERATE_RDEV(mddev,rdev,tmp) {
			sector_t avail;
			int fit = (info->size == 0);
			if (rdev->sb_offset > rdev->data_offset)
				avail = (rdev->sb_offset*2) - rdev->data_offset;
			else
				avail = get_capacity(rdev->bdev->bd_disk)
					- rdev->data_offset;
			if (fit && (info->size == 0 || info->size > avail/2))
				info->size = avail/2;
			if (avail < ((sector_t)info->size << 1))
				return -ENOSPC;
		}
		rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
		if (!rv) {
			struct block_device *bdev;

			bdev = bdget_disk(mddev->gendisk, 0);
			if (bdev) {
				down(&bdev->bd_inode->i_sem);
				i_size_write(bdev->bd_inode, mddev->array_size << 10);
				up(&bdev->bd_inode->i_sem);
				bdput(bdev);
			}
		}
	}
	if (mddev->raid_disks != info->raid_disks) {
		/* change the number of raid disks */
		if (mddev->pers->reshape == NULL)
			return -EINVAL;
		if (info->raid_disks <= 0 ||
		    info->raid_disks >= mddev->max_disks)
			return -EINVAL;
		if (mddev->sync_thread)
			return -EBUSY;
		rv = mddev->pers->reshape(mddev, info->raid_disks);
		if (!rv) {
			struct block_device *bdev;

			bdev = bdget_disk(mddev->gendisk, 0);
			if (bdev) {
				down(&bdev->bd_inode->i_sem);
				i_size_write(bdev->bd_inode, mddev->array_size << 10);
				up(&bdev->bd_inode->i_sem);
				bdput(bdev);
			}
		}
	}
	md_update_sb(mddev);
	return rv;
}

static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	if (mddev->pers == NULL)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENODEV;

	md_error(mddev, rdev);
	return 0;
}

static int md_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct hd_geometry __user *loc = argp;
	mddev_t *mddev = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd)
	{
		case RAID_VERSION:
			err = get_version(argp);
			goto done;

		case PRINT_RAID_DEBUG:
			err = 0;
			md_print_devices();
			goto done;

#ifndef MODULE
		case RAID_AUTORUN:
			err = 0;
			autostart_arrays(arg);
			goto done;
#endif
		default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = inode->i_bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto abort;
	}


	if (cmd == START_ARRAY) {
		/* START_ARRAY doesn't need to lock the array as autostart_array
		 * does the locking, and it could even be a different array
		 */
		static int cnt = 3;
		if (cnt > 0 ) {
			printk(KERN_WARNING
			       "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
" 2501 "This will not be supported beyond 2.6\n", 2502 current->comm, current->pid); 2503 cnt--; 2504 } 2505 err = autostart_array(new_decode_dev(arg)); 2506 if (err) { 2507 printk(KERN_WARNING "md: autostart failed!\n"); 2508 goto abort; 2509 } 2510 goto done; 2511 } 2512 2513 err = mddev_lock(mddev); 2514 if (err) { 2515 printk(KERN_INFO 2516 "md: ioctl lock interrupted, reason %d, cmd %d\n", 2517 err, cmd); 2518 goto abort; 2519 } 2520 2521 switch (cmd) 2522 { 2523 case SET_ARRAY_INFO: 2524 { 2525 mdu_array_info_t info; 2526 if (!arg) 2527 memset(&info, 0, sizeof(info)); 2528 else if (copy_from_user(&info, argp, sizeof(info))) { 2529 err = -EFAULT; 2530 goto abort_unlock; 2531 } 2532 if (mddev->pers) { 2533 err = update_array_info(mddev, &info); 2534 if (err) { 2535 printk(KERN_WARNING "md: couldn't update" 2536 " array info. %d\n", err); 2537 goto abort_unlock; 2538 } 2539 goto done_unlock; 2540 } 2541 if (!list_empty(&mddev->disks)) { 2542 printk(KERN_WARNING 2543 "md: array %s already has disks!\n", 2544 mdname(mddev)); 2545 err = -EBUSY; 2546 goto abort_unlock; 2547 } 2548 if (mddev->raid_disks) { 2549 printk(KERN_WARNING 2550 "md: array %s already initialised!\n", 2551 mdname(mddev)); 2552 err = -EBUSY; 2553 goto abort_unlock; 2554 } 2555 err = set_array_info(mddev, &info); 2556 if (err) { 2557 printk(KERN_WARNING "md: couldn't set" 2558 " array info. %d\n", err); 2559 goto abort_unlock; 2560 } 2561 } 2562 goto done_unlock; 2563 2564 default:; 2565 } 2566 2567 /* 2568 * Commands querying/configuring an existing array: 2569 */ 2570 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ 2571 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2572 err = -ENODEV; 2573 goto abort_unlock; 2574 } 2575 2576 /* 2577 * Commands even a read-only array can execute: 2578 */ 2579 switch (cmd) 2580 { 2581 case GET_ARRAY_INFO: 2582 err = get_array_info(mddev, argp); 2583 goto done_unlock; 2584 2585 case GET_DISK_INFO: 2586 err = get_disk_info(mddev, argp); 2587 goto done_unlock; 2588 2589 case RESTART_ARRAY_RW: 2590 err = restart_array(mddev); 2591 goto done_unlock; 2592 2593 case STOP_ARRAY: 2594 err = do_md_stop (mddev, 0); 2595 goto done_unlock; 2596 2597 case STOP_ARRAY_RO: 2598 err = do_md_stop (mddev, 1); 2599 goto done_unlock; 2600 2601 /* 2602 * We have a problem here : there is no easy way to give a CHS 2603 * virtual geometry. We currently pretend that we have a 2 heads 2604 * 4 sectors (with a BIG number of cylinders...). This drives 2605 * dosfs just mad... 
	 * ;-)
	 */
		case HDIO_GETGEO:
			if (!loc) {
				err = -EINVAL;
				goto abort_unlock;
			}
			err = put_user (2, (char __user *) &loc->heads);
			if (err)
				goto abort_unlock;
			err = put_user (4, (char __user *) &loc->sectors);
			if (err)
				goto abort_unlock;
			err = put_user(get_capacity(mddev->gendisk)/8,
					(short __user *) &loc->cylinders);
			if (err)
				goto abort_unlock;
			err = put_user (get_start_sect(inode->i_bdev),
					(long __user *) &loc->start);
			goto done_unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow read-only arrays
	 * here:
	 */
	if (mddev->ro) {
		err = -EROFS;
		goto abort_unlock;
	}

	switch (cmd)
	{
		case ADD_NEW_DISK:
		{
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}

		case HOT_REMOVE_DISK:
			err = hot_remove_disk(mddev, new_decode_dev(arg));
			goto done_unlock;

		case HOT_ADD_DISK:
			err = hot_add_disk(mddev, new_decode_dev(arg));
			goto done_unlock;

		case SET_DISK_FAULTY:
			err = set_disk_faulty(mddev, new_decode_dev(arg));
			goto done_unlock;

		case RUN_ARRAY:
			err = do_md_run (mddev);
			goto done_unlock;

		default:
			if (_IOC_TYPE(cmd) == MD_MAJOR)
				printk(KERN_WARNING "md: %s(pid %d) used"
					" obsolete MD ioctl, upgrade your"
					" software to use new ioctls.\n",
					current->comm, current->pid);
			err = -EINVAL;
			goto abort_unlock;
	}

done_unlock:
abort_unlock:
	mddev_unlock(mddev);

	return err;
done:
	if (err)
		MD_BUG();
abort:
	return err;
}

static int md_open(struct inode *inode, struct file *file)
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
	int err;

	if ((err = mddev_lock(mddev)))
		goto out;

	err = 0;
	mddev_get(mddev);
	mddev_unlock(mddev);

	check_disk_change(inode->i_bdev);
out:
	return err;
}

static int md_release(struct inode *inode, struct file * file)
{
	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;

	if (!mddev)
		BUG();
	mddev_put(mddev);

	return 0;
}

static int md_media_changed(struct gendisk *disk)
{
	mddev_t *mddev = disk->private_data;

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
	mddev_t *mddev = disk->private_data;

	mddev->changed = 0;
	return 0;
}
static struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
	.media_changed	= md_media_changed,
	.revalidate_disk= md_revalidate,
};

int md_thread(void * arg)
{
	mdk_thread_t *thread = arg;

	lock_kernel();

	/*
	 * Detach thread
	 */

	daemonize(thread->name, mdname(thread->mddev));

	current->exit_signal = SIGCHLD;
	allow_signal(SIGKILL);
	thread->tsk = current;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality.
	 * (RAID5 does preallocation.) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
	unlock_kernel();

	complete(thread->event);
	while (thread->run) {
		void (*run)(mddev_t *);

		wait_event_interruptible(thread->wqueue,
					 test_bit(THREAD_WAKEUP, &thread->flags));
		if (current->flags & PF_FREEZE)
			refrigerator(PF_FREEZE);

		clear_bit(THREAD_WAKEUP, &thread->flags);

		run = thread->run;
		if (run)
			run(thread->mddev);

		if (signal_pending(current))
			flush_signals(current);
	}
	complete(thread->event);
	return 0;
}

void md_wakeup_thread(mdk_thread_t *thread)
{
	if (thread) {
		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
	}
}

mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
				 const char *name)
{
	mdk_thread_t *thread;
	int ret;
	struct completion event;

	thread = (mdk_thread_t *) kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
	if (!thread)
		return NULL;

	memset(thread, 0, sizeof(mdk_thread_t));
	init_waitqueue_head(&thread->wqueue);

	init_completion(&event);
	thread->event = &event;
	thread->run = run;
	thread->mddev = mddev;
	thread->name = name;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
	wait_for_completion(&event);
	return thread;
}

void md_unregister_thread(mdk_thread_t *thread)
{
	struct completion event;

	init_completion(&event);

	thread->event = &event;

	/* As soon as ->run is set to NULL, the task could disappear,
	 * so we need to hold tasklist_lock until we have sent the signal
	 */
	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
	read_lock(&tasklist_lock);
	thread->run = NULL;
	send_sig(SIGKILL, thread->tsk, 1);
	read_unlock(&tasklist_lock);
	wait_for_completion(&event);
	kfree(thread);
}

void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}

	if (!rdev || rdev->faulty)
		return;

	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		mdname(mddev),
		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
		__builtin_return_address(0),__builtin_return_address(1),
		__builtin_return_address(2),__builtin_return_address(3));

	if (!mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev,rdev);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}

/* seq_file implementation for /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
	int i = 0;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	seq_printf(seq, "unused devices: ");

	ITERATE_RDEV_PENDING(rdev,tmp) {
		char b[BDEVNAME_SIZE];
		i++;
		seq_printf(seq, "%s ",
			      bdevname(rdev->bdev,b));
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}

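/*
 * Emit one progress line for an active resync/recovery.  The output
 * looks roughly like this (example values; the bar is always 20
 * '='/'.' steps plus a '>'):
 *
 *   [=====>...............]  resync = 25.0% (1250000/5000000) finish=12.3min speed=10000K/sec
 *
 * "resync" is shown while re-synchronising redundancy, "recovery"
 * while rebuilding onto a spare.
 */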
static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
	unsigned long max_blocks, resync, res, dt, db, rt;

	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		max_blocks = mddev->resync_max_sectors >> 1;
	else
		max_blocks = mddev->size;

	/*
	 * Should not happen.
	 */
	if (!max_blocks) {
		MD_BUG();
		return;
	}
	res = (resync/1024)*1000/(max_blocks/1024 + 1);
	{
		int i, x = res/50, y = 20-x;
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
	seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
		   (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
		    "resync" : "recovery"),
		   res/10, res % 10, resync, max_blocks);

	/*
	 * We do not want to overflow, so the order of operands and
	 * the * 100 / 100 trick are important. We do a +1 to be
	 * safe against division by zero. We only estimate anyway.
	 *
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
	db = resync - (mddev->resync_mark_cnt/2);
	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;

	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);

	seq_printf(seq, " speed=%ldK/sec", db/dt);
}

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	mddev_t *mddev;

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, mddev_t, all_mddevs);
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2; /* tail */
	return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
	mddev_t *next_mddev, *mddev = v;

	++*pos;
	if (v == (void*)2)
		return NULL;

	spin_lock(&all_mddevs_lock);
	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
	}
	spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
	mddev_t *mddev = v;

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

static int md_seq_show(struct seq_file *seq, void *v)
{
	mddev_t *mddev = v;
	sector_t size;
	struct list_head *tmp2;
	mdk_rdev_t *rdev;
	int i;

	if (v == (void*)1) {
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		for (i = 0; i < MAX_PERSONALITY; i++)
			if (pers[i])
				seq_printf(seq, "[%s] ", pers[i]->name);

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

	if (mddev_lock(mddev)!=0)
		return -EINTR;
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
			   mddev->pers ? "" : "in");
		if (mddev->pers) {
			if (mddev->ro)
				seq_printf(seq, " (read-only)");
			seq_printf(seq, " %s", mddev->pers->name);
		}

		size = 0;
		ITERATE_RDEV(mddev,rdev,tmp2) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				   bdevname(rdev->bdev,b), rdev->desc_nr);
			if (rdev->faulty) {
				seq_printf(seq, "(F)");
				continue;
			}
			size += rdev->size;
		}

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)mddev->array_size);
			else
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)size);
		}

		if (mddev->pers) {
			mddev->pers->status (seq, mddev);
			seq_printf(seq, "\n ");
			if (mddev->curr_resync > 2)
				status_resync (seq, mddev);
			else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
				seq_printf(seq, " resync=DELAYED");
		}

		seq_printf(seq, "\n");
	}
	mddev_unlock(mddev);

	return 0;
}

static struct seq_operations md_seq_ops = {
	.start	= md_seq_start,
	.next	= md_seq_next,
	.stop	= md_seq_stop,
	.show	= md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
	int error;

	error = seq_open(file, &md_seq_ops);
	return error;
}

static struct file_operations md_seq_fops = {
	.open		= md_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

int register_md_personality(int pnum, mdk_personality_t *p)
{
	if (pnum >= MAX_PERSONALITY) {
		printk(KERN_ERR
		       "md: tried to install personality %s as nr %d, but max is %lu\n",
		       p->name, pnum, MAX_PERSONALITY-1);
		return -EINVAL;
	}

	spin_lock(&pers_lock);
	if (pers[pnum]) {
		spin_unlock(&pers_lock);
		return -EBUSY;
	}

	pers[pnum] = p;
	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
	spin_unlock(&pers_lock);
	return 0;
}

int unregister_md_personality(int pnum)
{
	if (pnum >= MAX_PERSONALITY)
		return -EINVAL;

	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
	spin_lock(&pers_lock);
	pers[pnum] = NULL;
	spin_unlock(&pers_lock);
	return 0;
}

static int is_mddev_idle(mddev_t *mddev)
{
	mdk_rdev_t * rdev;
	struct list_head *tmp;
	int idle;
	unsigned long curr_events;

	idle = 1;
	ITERATE_RDEV(mddev,rdev,tmp) {
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
		curr_events = disk_stat_read(disk, read_sectors) +
				disk_stat_read(disk, write_sectors) -
				atomic_read(&disk->sync_io);
		/* Allow some slack between value of curr_events and last_events,
		 * as there are some uninteresting races.
		 * Note: the following is an unsigned comparison.
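		 * If last_events happens to run slightly ahead, the
		 * difference wraps to a huge unsigned value and is also
		 * treated as activity, which is the safe direction.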
		 */
		if ((curr_events - rdev->last_events + 32) > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	return idle;
}

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}


void md_write_start(mddev_t *mddev)
{
	if (!atomic_read(&mddev->writes_pending)) {
		mddev_lock_uninterruptible(mddev);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			del_timer(&mddev->safemode_timer);
			md_update_sb(mddev);
		}
		atomic_inc(&mddev->writes_pending);
		mddev_unlock(mddev);
	} else
		atomic_inc(&mddev->writes_pending);
}

void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}

static inline void md_enter_safemode(mddev_t *mddev)
{
	if (!mddev->safemode) return;
	if (mddev->safemode == 2 &&
	    (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
	     mddev->recovery_cp != MaxSector))
		return; /* avoid the lock */
	mddev_lock_uninterruptible(mddev);
	if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
	    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
		mddev->in_sync = 1;
		md_update_sb(mddev);
	}
	mddev_unlock(mddev);

	if (mddev->safemode == 1)
		mddev->safemode = 0;
}

void md_handle_safemode(mddev_t *mddev)
{
	if (signal_pending(current)) {
		printk(KERN_INFO "md: %s in immediate safe mode\n",
			mdname(mddev));
		mddev->safemode = 2;
		flush_signals(current);
	}
	md_enter_safemode(mddev);
}


DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
static void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j;
	unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours. When we find one that is the same or higher
	 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
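	 * Comparing mddev addresses gives a stable, if arbitrary, total
	 * order, so two conflicting arrays can never each end up yielding
	 * to the other.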
	 *
	 */

	do {
		mddev->curr_resync = 2;

	try_again:
		if (signal_pending(current)) {
			flush_signals(current);
			goto skip;
		}
		ITERATE_MDDEV(mddev2,tmp) {
			printk(".");
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync &&
			    match_mddev_units(mddev,mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!signal_pending(current)
				    && mddev2->curr_resync >= mddev->curr_resync) {
					printk(KERN_INFO "md: delaying resync of %s"
					       " until %s has finished resync (they"
					       " share one or more physical units)\n",
					       mdname(mddev), mdname(mddev2));
					mddev_put(mddev2);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		/* resync follows the size requested by the personality,
		 * which defaults to the physical size, but can be a virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
	else
		/* recovery follows the physical size of devices */
		max_sectors = mddev->size << 1;

	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		j = mddev->recovery_cp;
	else
		j = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = j;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
		window/2,(unsigned long long) max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;

	if (j>2) {
		printk(KERN_INFO
			"md: resuming recovery of %s from checkpoint.\n",
			mdname(mddev));
		mddev->curr_resync = j;
	}

	while (j < max_sectors) {
		int sectors;

		sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
		if (sectors < 0) {
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
			goto out;
		}
		atomic_add(sectors, &mddev->recovery_active);
		j += sectors;
		if (j>1) mddev->curr_resync = j;

		if (last_check + window > j || j == max_sectors)
			continue;

		last_check = j;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (signal_pending(current)) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
				"md: md_do_sync() got signal ... exiting\n");
			flush_signals(current);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only when we are slower than the
		 * 'hard' speed limit, or when the system was IO-idle
		 * for a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				msleep_interruptible(250);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
out:
	mddev->queue->unplug_fn(mddev->queue);

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync >= mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO
				"md: checkpointing recovery of %s.\n",
				mdname(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

	md_enter_safemode(mddev);
skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}


/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
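 *     (the sync thread started in 6/ runs md_do_sync() above)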
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	dprintk(KERN_INFO "md: recovery thread got woken up ...\n");

	if (mddev->ro)
		return;
	if ( ! (
		mddev->sb_dirty ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery)
		))
		return;
	if (mddev_trylock(mddev)==0) {
		int spares =0;
		if (mddev->sb_dirty)
			md_update_sb(mddev);
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev);
			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->recovery)
			/* probably just the RECOVERY_NEEDED flag */
			mddev->recovery = 0;

		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    (rdev->faulty || ! rdev->in_sync) &&
			    atomic_read(&rdev->nr_pending)==0) {
				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
					rdev->raid_disk = -1;
			}

		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !rdev->faulty) {
					if (mddev->pers->hot_add_disk(mddev,rdev))
						spares++;
					else
						break;
				}
		}

		if (!spares && (mddev->recovery_cp == MaxSector )) {
			/* nothing we can do ... */
			goto unlock;
		}
		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (!spares)
				set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"%s_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
					" thread...\n",
					mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else {
				md_wakeup_thread(mddev->sync_thread);
			}
		}
	unlock:
		mddev_unlock(mddev);
	}
}

int md_notify_reboot(struct notifier_block *this,
		     unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
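		 * so give such devices a moment to settle down before
		 * letting the reboot proceed (hence the delay below).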
3596 */ 3597 mdelay(1000*1); 3598 } 3599 return NOTIFY_DONE; 3600 } 3601 3602 struct notifier_block md_notifier = { 3603 .notifier_call = md_notify_reboot, 3604 .next = NULL, 3605 .priority = INT_MAX, /* before any real devices */ 3606 }; 3607 3608 static void md_geninit(void) 3609 { 3610 struct proc_dir_entry *p; 3611 3612 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3613 3614 p = create_proc_entry("mdstat", S_IRUGO, NULL); 3615 if (p) 3616 p->proc_fops = &md_seq_fops; 3617 } 3618 3619 int __init md_init(void) 3620 { 3621 int minor; 3622 3623 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 3624 " MD_SB_DISKS=%d\n", 3625 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3626 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3627 3628 if (register_blkdev(MAJOR_NR, "md")) 3629 return -1; 3630 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 3631 unregister_blkdev(MAJOR_NR, "md"); 3632 return -1; 3633 } 3634 devfs_mk_dir("md"); 3635 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 3636 md_probe, NULL, NULL); 3637 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 3638 md_probe, NULL, NULL); 3639 3640 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3641 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 3642 S_IFBLK|S_IRUSR|S_IWUSR, 3643 "md/%d", minor); 3644 3645 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3646 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 3647 S_IFBLK|S_IRUSR|S_IWUSR, 3648 "md/mdp%d", minor); 3649 3650 3651 register_reboot_notifier(&md_notifier); 3652 raid_table_header = register_sysctl_table(raid_root_table, 1); 3653 3654 md_geninit(); 3655 return (0); 3656 } 3657 3658 3659 #ifndef MODULE 3660 3661 /* 3662 * Searches all registered partitions for autorun RAID arrays 3663 * at boot time. 
3664 */ 3665 static dev_t detected_devices[128]; 3666 static int dev_cnt; 3667 3668 void md_autodetect_dev(dev_t dev) 3669 { 3670 if (dev_cnt >= 0 && dev_cnt < 127) 3671 detected_devices[dev_cnt++] = dev; 3672 } 3673 3674 3675 static void autostart_arrays(int part) 3676 { 3677 mdk_rdev_t *rdev; 3678 int i; 3679 3680 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 3681 3682 for (i = 0; i < dev_cnt; i++) { 3683 dev_t dev = detected_devices[i]; 3684 3685 rdev = md_import_device(dev,0, 0); 3686 if (IS_ERR(rdev)) 3687 continue; 3688 3689 if (rdev->faulty) { 3690 MD_BUG(); 3691 continue; 3692 } 3693 list_add(&rdev->same_set, &pending_raid_disks); 3694 } 3695 dev_cnt = 0; 3696 3697 autorun_devices(part); 3698 } 3699 3700 #endif 3701 3702 static __exit void md_exit(void) 3703 { 3704 mddev_t *mddev; 3705 struct list_head *tmp; 3706 int i; 3707 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 3708 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 3709 for (i=0; i < MAX_MD_DEVS; i++) 3710 devfs_remove("md/%d", i); 3711 for (i=0; i < MAX_MD_DEVS; i++) 3712 devfs_remove("md/d%d", i); 3713 3714 devfs_remove("md"); 3715 3716 unregister_blkdev(MAJOR_NR,"md"); 3717 unregister_blkdev(mdp_major, "mdp"); 3718 unregister_reboot_notifier(&md_notifier); 3719 unregister_sysctl_table(raid_table_header); 3720 remove_proc_entry("mdstat", NULL); 3721 ITERATE_MDDEV(mddev,tmp) { 3722 struct gendisk *disk = mddev->gendisk; 3723 if (!disk) 3724 continue; 3725 export_array(mddev); 3726 del_gendisk(disk); 3727 put_disk(disk); 3728 mddev->gendisk = NULL; 3729 mddev_put(mddev); 3730 } 3731 } 3732 3733 module_init(md_init) 3734 module_exit(md_exit) 3735 3736 EXPORT_SYMBOL(register_md_personality); 3737 EXPORT_SYMBOL(unregister_md_personality); 3738 EXPORT_SYMBOL(md_error); 3739 EXPORT_SYMBOL(md_done_sync); 3740 EXPORT_SYMBOL(md_write_start); 3741 EXPORT_SYMBOL(md_write_end); 3742 EXPORT_SYMBOL(md_handle_safemode); 3743 EXPORT_SYMBOL(md_register_thread); 3744 EXPORT_SYMBOL(md_unregister_thread); 3745 EXPORT_SYMBOL(md_wakeup_thread); 3746 EXPORT_SYMBOL(md_print_devices); 3747 EXPORT_SYMBOL(md_check_recovery); 3748 MODULE_LICENSE("GPL"); 3749