1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/devfs_fs_kernel.h> 43 #include <linux/buffer_head.h> /* for invalidate_bdev */ 44 #include <linux/suspend.h> 45 46 #include <linux/init.h> 47 48 #include <linux/file.h> 49 50 #ifdef CONFIG_KMOD 51 #include <linux/kmod.h> 52 #endif 53 54 #include <asm/unaligned.h> 55 56 #define MAJOR_NR MD_MAJOR 57 #define MD_DRIVER 58 59 /* 63 partitions with the alternate major number (mdp) */ 60 #define MdpMinorShift 6 61 62 #define DEBUG 0 63 #define dprintk(x...) ((void)(DEBUG && printk(x))) 64 65 66 #ifndef MODULE 67 static void autostart_arrays (int part); 68 #endif 69 70 static mdk_personality_t *pers[MAX_PERSONALITY]; 71 static DEFINE_SPINLOCK(pers_lock); 72 73 /* 74 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 75 * is 1000 KB/sec, so the extra system load does not show up that much. 76 * Increase it if you want to have more _guaranteed_ speed. Note that 77 * the RAID driver will use the maximum available bandwidth if the IO 78 * subsystem is idle. There is also an 'absolute maximum' reconstruction 79 * speed limit - in case reconstruction slows down your system despite 80 * idle IO detection. 81 * 82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 83 */ 84 85 static int sysctl_speed_limit_min = 1000; 86 static int sysctl_speed_limit_max = 200000; 87 88 static struct ctl_table_header *raid_table_header; 89 90 static ctl_table raid_table[] = { 91 { 92 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 93 .procname = "speed_limit_min", 94 .data = &sysctl_speed_limit_min, 95 .maxlen = sizeof(int), 96 .mode = 0644, 97 .proc_handler = &proc_dointvec, 98 }, 99 { 100 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 101 .procname = "speed_limit_max", 102 .data = &sysctl_speed_limit_max, 103 .maxlen = sizeof(int), 104 .mode = 0644, 105 .proc_handler = &proc_dointvec, 106 }, 107 { .ctl_name = 0 } 108 }; 109 110 static ctl_table raid_dir_table[] = { 111 { 112 .ctl_name = DEV_RAID, 113 .procname = "raid", 114 .maxlen = 0, 115 .mode = 0555, 116 .child = raid_table, 117 }, 118 { .ctl_name = 0 } 119 }; 120 121 static ctl_table raid_root_table[] = { 122 { 123 .ctl_name = CTL_DEV, 124 .procname = "dev", 125 .maxlen = 0, 126 .mode = 0555, 127 .child = raid_dir_table, 128 }, 129 { .ctl_name = 0 } 130 }; 131 132 static struct block_device_operations md_fops; 133 134 /* 135 * Enables to iterate over all existing md arrays 136 * all_mddevs_lock protects this list. 137 */ 138 static LIST_HEAD(all_mddevs); 139 static DEFINE_SPINLOCK(all_mddevs_lock); 140 141 142 /* 143 * iterates through all used mddevs in the system. 144 * We take care to grab the all_mddevs_lock whenever navigating 145 * the list, and to always hold a refcount when unlocked. 146 * Any code which breaks out of this loop while own 147 * a reference to the current mddev and must mddev_put it. 148 */ 149 #define ITERATE_MDDEV(mddev,tmp) \ 150 \ 151 for (({ spin_lock(&all_mddevs_lock); \ 152 tmp = all_mddevs.next; \ 153 mddev = NULL;}); \ 154 ({ if (tmp != &all_mddevs) \ 155 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 156 spin_unlock(&all_mddevs_lock); \ 157 if (mddev) mddev_put(mddev); \ 158 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 159 tmp != &all_mddevs;}); \ 160 ({ spin_lock(&all_mddevs_lock); \ 161 tmp = tmp->next;}) \ 162 ) 163 164 165 static int md_fail_request (request_queue_t *q, struct bio *bio) 166 { 167 bio_io_error(bio, bio->bi_size); 168 return 0; 169 } 170 171 static inline mddev_t *mddev_get(mddev_t *mddev) 172 { 173 atomic_inc(&mddev->active); 174 return mddev; 175 } 176 177 static void mddev_put(mddev_t *mddev) 178 { 179 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 180 return; 181 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 182 list_del(&mddev->all_mddevs); 183 blk_put_queue(mddev->queue); 184 kobject_unregister(&mddev->kobj); 185 } 186 spin_unlock(&all_mddevs_lock); 187 } 188 189 static mddev_t * mddev_find(dev_t unit) 190 { 191 mddev_t *mddev, *new = NULL; 192 193 retry: 194 spin_lock(&all_mddevs_lock); 195 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 196 if (mddev->unit == unit) { 197 mddev_get(mddev); 198 spin_unlock(&all_mddevs_lock); 199 kfree(new); 200 return mddev; 201 } 202 203 if (new) { 204 list_add(&new->all_mddevs, &all_mddevs); 205 spin_unlock(&all_mddevs_lock); 206 return new; 207 } 208 spin_unlock(&all_mddevs_lock); 209 210 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 211 if (!new) 212 return NULL; 213 214 memset(new, 0, sizeof(*new)); 215 216 new->unit = unit; 217 if (MAJOR(unit) == MD_MAJOR) 218 new->md_minor = MINOR(unit); 219 else 220 new->md_minor = MINOR(unit) >> MdpMinorShift; 221 222 init_MUTEX(&new->reconfig_sem); 223 INIT_LIST_HEAD(&new->disks); 224 INIT_LIST_HEAD(&new->all_mddevs); 225 init_timer(&new->safemode_timer); 226 atomic_set(&new->active, 1); 227 spin_lock_init(&new->write_lock); 228 init_waitqueue_head(&new->sb_wait); 229 230 new->queue = blk_alloc_queue(GFP_KERNEL); 231 if (!new->queue) { 232 kfree(new); 233 return NULL; 234 } 235 236 blk_queue_make_request(new->queue, md_fail_request); 237 238 goto retry; 239 } 240 241 static inline int mddev_lock(mddev_t * mddev) 242 { 243 return down_interruptible(&mddev->reconfig_sem); 244 } 245 246 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 247 { 248 down(&mddev->reconfig_sem); 249 } 250 251 static inline int mddev_trylock(mddev_t * mddev) 252 { 253 return down_trylock(&mddev->reconfig_sem); 254 } 255 256 static inline void mddev_unlock(mddev_t * mddev) 257 { 258 up(&mddev->reconfig_sem); 259 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 if (bio->bi_size) 334 return 1; 335 336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 md_error(rdev->mddev, rdev); 338 339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 wake_up(&rdev->mddev->sb_wait); 341 bio_put(bio); 342 return 0; 343 } 344 345 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 346 sector_t sector, int size, struct page *page) 347 { 348 /* write first size bytes of page to sector of rdev 349 * Increment mddev->pending_writes before returning 350 * and decrement it on completion, waking up sb_wait 351 * if zero is reached. 352 * If an error occurred, call md_error 353 */ 354 struct bio *bio = bio_alloc(GFP_NOIO, 1); 355 356 bio->bi_bdev = rdev->bdev; 357 bio->bi_sector = sector; 358 bio_add_page(bio, page, size, 0); 359 bio->bi_private = rdev; 360 bio->bi_end_io = super_written; 361 atomic_inc(&mddev->pending_writes); 362 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 363 } 364 365 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 366 { 367 if (bio->bi_size) 368 return 1; 369 370 complete((struct completion*)bio->bi_private); 371 return 0; 372 } 373 374 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 375 struct page *page, int rw) 376 { 377 struct bio *bio = bio_alloc(GFP_NOIO, 1); 378 struct completion event; 379 int ret; 380 381 rw |= (1 << BIO_RW_SYNC); 382 383 bio->bi_bdev = bdev; 384 bio->bi_sector = sector; 385 bio_add_page(bio, page, size, 0); 386 init_completion(&event); 387 bio->bi_private = &event; 388 bio->bi_end_io = bi_complete; 389 submit_bio(rw, bio); 390 wait_for_completion(&event); 391 392 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 393 bio_put(bio); 394 return ret; 395 } 396 397 static int read_disk_sb(mdk_rdev_t * rdev, int size) 398 { 399 char b[BDEVNAME_SIZE]; 400 if (!rdev->sb_page) { 401 MD_BUG(); 402 return -EINVAL; 403 } 404 if (rdev->sb_loaded) 405 return 0; 406 407 408 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 409 goto fail; 410 rdev->sb_loaded = 1; 411 return 0; 412 413 fail: 414 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 415 bdevname(rdev->bdev,b)); 416 return -EINVAL; 417 } 418 419 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 420 { 421 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 422 (sb1->set_uuid1 == sb2->set_uuid1) && 423 (sb1->set_uuid2 == sb2->set_uuid2) && 424 (sb1->set_uuid3 == sb2->set_uuid3)) 425 426 return 1; 427 428 return 0; 429 } 430 431 432 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 433 { 434 int ret; 435 mdp_super_t *tmp1, *tmp2; 436 437 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 438 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 439 440 if (!tmp1 || !tmp2) { 441 ret = 0; 442 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 443 goto abort; 444 } 445 446 *tmp1 = *sb1; 447 *tmp2 = *sb2; 448 449 /* 450 * nr_disks is not constant 451 */ 452 tmp1->nr_disks = 0; 453 tmp2->nr_disks = 0; 454 455 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 456 ret = 0; 457 else 458 ret = 1; 459 460 abort: 461 kfree(tmp1); 462 kfree(tmp2); 463 return ret; 464 } 465 466 static unsigned int calc_sb_csum(mdp_super_t * sb) 467 { 468 unsigned int disk_csum, csum; 469 470 disk_csum = sb->sb_csum; 471 sb->sb_csum = 0; 472 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 473 sb->sb_csum = disk_csum; 474 return csum; 475 } 476 477 478 /* 479 * Handle superblock details. 480 * We want to be able to handle multiple superblock formats 481 * so we have a common interface to them all, and an array of 482 * different handlers. 483 * We rely on user-space to write the initial superblock, and support 484 * reading and updating of superblocks. 485 * Interface methods are: 486 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 487 * loads and validates a superblock on dev. 488 * if refdev != NULL, compare superblocks on both devices 489 * Return: 490 * 0 - dev has a superblock that is compatible with refdev 491 * 1 - dev has a superblock that is compatible and newer than refdev 492 * so dev should be used as the refdev in future 493 * -EINVAL superblock incompatible or invalid 494 * -othererror e.g. -EIO 495 * 496 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 497 * Verify that dev is acceptable into mddev. 498 * The first time, mddev->raid_disks will be 0, and data from 499 * dev should be merged in. Subsequent calls check that dev 500 * is new enough. Return 0 or -EINVAL 501 * 502 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 503 * Update the superblock for rdev with data in mddev 504 * This does not write to disc. 505 * 506 */ 507 508 struct super_type { 509 char *name; 510 struct module *owner; 511 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 512 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 513 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 514 }; 515 516 /* 517 * load_super for 0.90.0 518 */ 519 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 520 { 521 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 522 mdp_super_t *sb; 523 int ret; 524 sector_t sb_offset; 525 526 /* 527 * Calculate the position of the superblock, 528 * it's at the end of the disk. 529 * 530 * It also happens to be a multiple of 4Kb. 531 */ 532 sb_offset = calc_dev_sboffset(rdev->bdev); 533 rdev->sb_offset = sb_offset; 534 535 ret = read_disk_sb(rdev, MD_SB_BYTES); 536 if (ret) return ret; 537 538 ret = -EINVAL; 539 540 bdevname(rdev->bdev, b); 541 sb = (mdp_super_t*)page_address(rdev->sb_page); 542 543 if (sb->md_magic != MD_SB_MAGIC) { 544 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 545 b); 546 goto abort; 547 } 548 549 if (sb->major_version != 0 || 550 sb->minor_version != 90) { 551 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 552 sb->major_version, sb->minor_version, 553 b); 554 goto abort; 555 } 556 557 if (sb->raid_disks <= 0) 558 goto abort; 559 560 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 561 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 562 b); 563 goto abort; 564 } 565 566 rdev->preferred_minor = sb->md_minor; 567 rdev->data_offset = 0; 568 rdev->sb_size = MD_SB_BYTES; 569 570 if (sb->level == LEVEL_MULTIPATH) 571 rdev->desc_nr = -1; 572 else 573 rdev->desc_nr = sb->this_disk.number; 574 575 if (refdev == 0) 576 ret = 1; 577 else { 578 __u64 ev1, ev2; 579 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 580 if (!uuid_equal(refsb, sb)) { 581 printk(KERN_WARNING "md: %s has different UUID to %s\n", 582 b, bdevname(refdev->bdev,b2)); 583 goto abort; 584 } 585 if (!sb_equal(refsb, sb)) { 586 printk(KERN_WARNING "md: %s has same UUID" 587 " but different superblock to %s\n", 588 b, bdevname(refdev->bdev, b2)); 589 goto abort; 590 } 591 ev1 = md_event(sb); 592 ev2 = md_event(refsb); 593 if (ev1 > ev2) 594 ret = 1; 595 else 596 ret = 0; 597 } 598 rdev->size = calc_dev_size(rdev, sb->chunk_size); 599 600 abort: 601 return ret; 602 } 603 604 /* 605 * validate_super for 0.90.0 606 */ 607 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 608 { 609 mdp_disk_t *desc; 610 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 611 612 rdev->raid_disk = -1; 613 rdev->flags = 0; 614 if (mddev->raid_disks == 0) { 615 mddev->major_version = 0; 616 mddev->minor_version = sb->minor_version; 617 mddev->patch_version = sb->patch_version; 618 mddev->persistent = ! sb->not_persistent; 619 mddev->chunk_size = sb->chunk_size; 620 mddev->ctime = sb->ctime; 621 mddev->utime = sb->utime; 622 mddev->level = sb->level; 623 mddev->layout = sb->layout; 624 mddev->raid_disks = sb->raid_disks; 625 mddev->size = sb->size; 626 mddev->events = md_event(sb); 627 mddev->bitmap_offset = 0; 628 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 629 630 if (sb->state & (1<<MD_SB_CLEAN)) 631 mddev->recovery_cp = MaxSector; 632 else { 633 if (sb->events_hi == sb->cp_events_hi && 634 sb->events_lo == sb->cp_events_lo) { 635 mddev->recovery_cp = sb->recovery_cp; 636 } else 637 mddev->recovery_cp = 0; 638 } 639 640 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 641 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 642 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 643 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 644 645 mddev->max_disks = MD_SB_DISKS; 646 647 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 648 mddev->bitmap_file == NULL) { 649 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { 650 /* FIXME use a better test */ 651 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 652 return -EINVAL; 653 } 654 mddev->bitmap_offset = mddev->default_bitmap_offset; 655 } 656 657 } else if (mddev->pers == NULL) { 658 /* Insist on good event counter while assembling */ 659 __u64 ev1 = md_event(sb); 660 ++ev1; 661 if (ev1 < mddev->events) 662 return -EINVAL; 663 } else if (mddev->bitmap) { 664 /* if adding to array with a bitmap, then we can accept an 665 * older device ... but not too old. 666 */ 667 __u64 ev1 = md_event(sb); 668 if (ev1 < mddev->bitmap->events_cleared) 669 return 0; 670 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 671 return 0; 672 673 if (mddev->level != LEVEL_MULTIPATH) { 674 desc = sb->disks + rdev->desc_nr; 675 676 if (desc->state & (1<<MD_DISK_FAULTY)) 677 set_bit(Faulty, &rdev->flags); 678 else if (desc->state & (1<<MD_DISK_SYNC) && 679 desc->raid_disk < mddev->raid_disks) { 680 set_bit(In_sync, &rdev->flags); 681 rdev->raid_disk = desc->raid_disk; 682 } 683 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 684 set_bit(WriteMostly, &rdev->flags); 685 } else /* MULTIPATH are always insync */ 686 set_bit(In_sync, &rdev->flags); 687 return 0; 688 } 689 690 /* 691 * sync_super for 0.90.0 692 */ 693 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 694 { 695 mdp_super_t *sb; 696 struct list_head *tmp; 697 mdk_rdev_t *rdev2; 698 int next_spare = mddev->raid_disks; 699 char nm[20]; 700 701 /* make rdev->sb match mddev data.. 702 * 703 * 1/ zero out disks 704 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 705 * 3/ any empty disks < next_spare become removed 706 * 707 * disks[0] gets initialised to REMOVED because 708 * we cannot be sure from other fields if it has 709 * been initialised or not. 710 */ 711 int i; 712 int active=0, working=0,failed=0,spare=0,nr_disks=0; 713 unsigned int fixdesc=0; 714 715 rdev->sb_size = MD_SB_BYTES; 716 717 sb = (mdp_super_t*)page_address(rdev->sb_page); 718 719 memset(sb, 0, sizeof(*sb)); 720 721 sb->md_magic = MD_SB_MAGIC; 722 sb->major_version = mddev->major_version; 723 sb->minor_version = mddev->minor_version; 724 sb->patch_version = mddev->patch_version; 725 sb->gvalid_words = 0; /* ignored */ 726 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 727 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 728 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 729 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 730 731 sb->ctime = mddev->ctime; 732 sb->level = mddev->level; 733 sb->size = mddev->size; 734 sb->raid_disks = mddev->raid_disks; 735 sb->md_minor = mddev->md_minor; 736 sb->not_persistent = !mddev->persistent; 737 sb->utime = mddev->utime; 738 sb->state = 0; 739 sb->events_hi = (mddev->events>>32); 740 sb->events_lo = (u32)mddev->events; 741 742 if (mddev->in_sync) 743 { 744 sb->recovery_cp = mddev->recovery_cp; 745 sb->cp_events_hi = (mddev->events>>32); 746 sb->cp_events_lo = (u32)mddev->events; 747 if (mddev->recovery_cp == MaxSector) 748 sb->state = (1<< MD_SB_CLEAN); 749 } else 750 sb->recovery_cp = 0; 751 752 sb->layout = mddev->layout; 753 sb->chunk_size = mddev->chunk_size; 754 755 if (mddev->bitmap && mddev->bitmap_file == NULL) 756 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 757 758 sb->disks[0].state = (1<<MD_DISK_REMOVED); 759 ITERATE_RDEV(mddev,rdev2,tmp) { 760 mdp_disk_t *d; 761 int desc_nr; 762 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 763 && !test_bit(Faulty, &rdev2->flags)) 764 desc_nr = rdev2->raid_disk; 765 else 766 desc_nr = next_spare++; 767 if (desc_nr != rdev2->desc_nr) { 768 fixdesc |= (1 << desc_nr); 769 rdev2->desc_nr = desc_nr; 770 if (rdev2->raid_disk >= 0) { 771 sprintf(nm, "rd%d", rdev2->raid_disk); 772 sysfs_remove_link(&mddev->kobj, nm); 773 } 774 sysfs_remove_link(&rdev2->kobj, "block"); 775 kobject_del(&rdev2->kobj); 776 } 777 d = &sb->disks[rdev2->desc_nr]; 778 nr_disks++; 779 d->number = rdev2->desc_nr; 780 d->major = MAJOR(rdev2->bdev->bd_dev); 781 d->minor = MINOR(rdev2->bdev->bd_dev); 782 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 783 && !test_bit(Faulty, &rdev2->flags)) 784 d->raid_disk = rdev2->raid_disk; 785 else 786 d->raid_disk = rdev2->desc_nr; /* compatibility */ 787 if (test_bit(Faulty, &rdev2->flags)) { 788 d->state = (1<<MD_DISK_FAULTY); 789 failed++; 790 } else if (test_bit(In_sync, &rdev2->flags)) { 791 d->state = (1<<MD_DISK_ACTIVE); 792 d->state |= (1<<MD_DISK_SYNC); 793 active++; 794 working++; 795 } else { 796 d->state = 0; 797 spare++; 798 working++; 799 } 800 if (test_bit(WriteMostly, &rdev2->flags)) 801 d->state |= (1<<MD_DISK_WRITEMOSTLY); 802 } 803 if (fixdesc) 804 ITERATE_RDEV(mddev,rdev2,tmp) 805 if (fixdesc & (1<<rdev2->desc_nr)) { 806 snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", 807 rdev2->desc_nr); 808 /* kobject_add gets a ref on the parent, so 809 * we have to drop the one we already have 810 */ 811 kobject_add(&rdev2->kobj); 812 kobject_put(rdev->kobj.parent); 813 sysfs_create_link(&rdev2->kobj, 814 &rdev2->bdev->bd_disk->kobj, 815 "block"); 816 if (rdev2->raid_disk >= 0) { 817 sprintf(nm, "rd%d", rdev2->raid_disk); 818 sysfs_create_link(&mddev->kobj, 819 &rdev2->kobj, nm); 820 } 821 } 822 /* now set the "removed" and "faulty" bits on any missing devices */ 823 for (i=0 ; i < mddev->raid_disks ; i++) { 824 mdp_disk_t *d = &sb->disks[i]; 825 if (d->state == 0 && d->number == 0) { 826 d->number = i; 827 d->raid_disk = i; 828 d->state = (1<<MD_DISK_REMOVED); 829 d->state |= (1<<MD_DISK_FAULTY); 830 failed++; 831 } 832 } 833 sb->nr_disks = nr_disks; 834 sb->active_disks = active; 835 sb->working_disks = working; 836 sb->failed_disks = failed; 837 sb->spare_disks = spare; 838 839 sb->this_disk = sb->disks[rdev->desc_nr]; 840 sb->sb_csum = calc_sb_csum(sb); 841 } 842 843 /* 844 * version 1 superblock 845 */ 846 847 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 848 { 849 unsigned int disk_csum, csum; 850 unsigned long long newcsum; 851 int size = 256 + le32_to_cpu(sb->max_dev)*2; 852 unsigned int *isuper = (unsigned int*)sb; 853 int i; 854 855 disk_csum = sb->sb_csum; 856 sb->sb_csum = 0; 857 newcsum = 0; 858 for (i=0; size>=4; size -= 4 ) 859 newcsum += le32_to_cpu(*isuper++); 860 861 if (size == 2) 862 newcsum += le16_to_cpu(*(unsigned short*) isuper); 863 864 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 865 sb->sb_csum = disk_csum; 866 return cpu_to_le32(csum); 867 } 868 869 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 870 { 871 struct mdp_superblock_1 *sb; 872 int ret; 873 sector_t sb_offset; 874 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 875 int bmask; 876 877 /* 878 * Calculate the position of the superblock. 879 * It is always aligned to a 4K boundary and 880 * depeding on minor_version, it can be: 881 * 0: At least 8K, but less than 12K, from end of device 882 * 1: At start of device 883 * 2: 4K from start of device. 884 */ 885 switch(minor_version) { 886 case 0: 887 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 888 sb_offset -= 8*2; 889 sb_offset &= ~(sector_t)(4*2-1); 890 /* convert from sectors to K */ 891 sb_offset /= 2; 892 break; 893 case 1: 894 sb_offset = 0; 895 break; 896 case 2: 897 sb_offset = 4; 898 break; 899 default: 900 return -EINVAL; 901 } 902 rdev->sb_offset = sb_offset; 903 904 /* superblock is rarely larger than 1K, but it can be larger, 905 * and it is safe to read 4k, so we do that 906 */ 907 ret = read_disk_sb(rdev, 4096); 908 if (ret) return ret; 909 910 911 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 912 913 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 914 sb->major_version != cpu_to_le32(1) || 915 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 916 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 917 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 918 return -EINVAL; 919 920 if (calc_sb_1_csum(sb) != sb->sb_csum) { 921 printk("md: invalid superblock checksum on %s\n", 922 bdevname(rdev->bdev,b)); 923 return -EINVAL; 924 } 925 if (le64_to_cpu(sb->data_size) < 10) { 926 printk("md: data_size too small on %s\n", 927 bdevname(rdev->bdev,b)); 928 return -EINVAL; 929 } 930 rdev->preferred_minor = 0xffff; 931 rdev->data_offset = le64_to_cpu(sb->data_offset); 932 933 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 934 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 935 if (rdev->sb_size & bmask) 936 rdev-> sb_size = (rdev->sb_size | bmask)+1; 937 938 if (refdev == 0) 939 return 1; 940 else { 941 __u64 ev1, ev2; 942 struct mdp_superblock_1 *refsb = 943 (struct mdp_superblock_1*)page_address(refdev->sb_page); 944 945 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 946 sb->level != refsb->level || 947 sb->layout != refsb->layout || 948 sb->chunksize != refsb->chunksize) { 949 printk(KERN_WARNING "md: %s has strangely different" 950 " superblock to %s\n", 951 bdevname(rdev->bdev,b), 952 bdevname(refdev->bdev,b2)); 953 return -EINVAL; 954 } 955 ev1 = le64_to_cpu(sb->events); 956 ev2 = le64_to_cpu(refsb->events); 957 958 if (ev1 > ev2) 959 return 1; 960 } 961 if (minor_version) 962 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 963 else 964 rdev->size = rdev->sb_offset; 965 if (rdev->size < le64_to_cpu(sb->data_size)/2) 966 return -EINVAL; 967 rdev->size = le64_to_cpu(sb->data_size)/2; 968 if (le32_to_cpu(sb->chunksize)) 969 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 970 return 0; 971 } 972 973 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 974 { 975 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 976 977 rdev->raid_disk = -1; 978 rdev->flags = 0; 979 if (mddev->raid_disks == 0) { 980 mddev->major_version = 1; 981 mddev->patch_version = 0; 982 mddev->persistent = 1; 983 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 984 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 985 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 986 mddev->level = le32_to_cpu(sb->level); 987 mddev->layout = le32_to_cpu(sb->layout); 988 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 989 mddev->size = le64_to_cpu(sb->size)/2; 990 mddev->events = le64_to_cpu(sb->events); 991 mddev->bitmap_offset = 0; 992 mddev->default_bitmap_offset = 0; 993 mddev->default_bitmap_offset = 1024; 994 995 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 996 memcpy(mddev->uuid, sb->set_uuid, 16); 997 998 mddev->max_disks = (4096-256)/2; 999 1000 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1001 mddev->bitmap_file == NULL ) { 1002 if (mddev->level != 1) { 1003 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 1004 return -EINVAL; 1005 } 1006 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1007 } 1008 } else if (mddev->pers == NULL) { 1009 /* Insist of good event counter while assembling */ 1010 __u64 ev1 = le64_to_cpu(sb->events); 1011 ++ev1; 1012 if (ev1 < mddev->events) 1013 return -EINVAL; 1014 } else if (mddev->bitmap) { 1015 /* If adding to array with a bitmap, then we can accept an 1016 * older device, but not too old. 1017 */ 1018 __u64 ev1 = le64_to_cpu(sb->events); 1019 if (ev1 < mddev->bitmap->events_cleared) 1020 return 0; 1021 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1022 return 0; 1023 1024 if (mddev->level != LEVEL_MULTIPATH) { 1025 int role; 1026 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1027 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1028 switch(role) { 1029 case 0xffff: /* spare */ 1030 break; 1031 case 0xfffe: /* faulty */ 1032 set_bit(Faulty, &rdev->flags); 1033 break; 1034 default: 1035 set_bit(In_sync, &rdev->flags); 1036 rdev->raid_disk = role; 1037 break; 1038 } 1039 if (sb->devflags & WriteMostly1) 1040 set_bit(WriteMostly, &rdev->flags); 1041 } else /* MULTIPATH are always insync */ 1042 set_bit(In_sync, &rdev->flags); 1043 1044 return 0; 1045 } 1046 1047 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1048 { 1049 struct mdp_superblock_1 *sb; 1050 struct list_head *tmp; 1051 mdk_rdev_t *rdev2; 1052 int max_dev, i; 1053 /* make rdev->sb match mddev and rdev data. */ 1054 1055 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1056 1057 sb->feature_map = 0; 1058 sb->pad0 = 0; 1059 memset(sb->pad1, 0, sizeof(sb->pad1)); 1060 memset(sb->pad2, 0, sizeof(sb->pad2)); 1061 memset(sb->pad3, 0, sizeof(sb->pad3)); 1062 1063 sb->utime = cpu_to_le64((__u64)mddev->utime); 1064 sb->events = cpu_to_le64(mddev->events); 1065 if (mddev->in_sync) 1066 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1067 else 1068 sb->resync_offset = cpu_to_le64(0); 1069 1070 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1071 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1072 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1073 } 1074 1075 max_dev = 0; 1076 ITERATE_RDEV(mddev,rdev2,tmp) 1077 if (rdev2->desc_nr+1 > max_dev) 1078 max_dev = rdev2->desc_nr+1; 1079 1080 sb->max_dev = cpu_to_le32(max_dev); 1081 for (i=0; i<max_dev;i++) 1082 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1083 1084 ITERATE_RDEV(mddev,rdev2,tmp) { 1085 i = rdev2->desc_nr; 1086 if (test_bit(Faulty, &rdev2->flags)) 1087 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1088 else if (test_bit(In_sync, &rdev2->flags)) 1089 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1090 else 1091 sb->dev_roles[i] = cpu_to_le16(0xffff); 1092 } 1093 1094 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1095 sb->sb_csum = calc_sb_1_csum(sb); 1096 } 1097 1098 1099 static struct super_type super_types[] = { 1100 [0] = { 1101 .name = "0.90.0", 1102 .owner = THIS_MODULE, 1103 .load_super = super_90_load, 1104 .validate_super = super_90_validate, 1105 .sync_super = super_90_sync, 1106 }, 1107 [1] = { 1108 .name = "md-1", 1109 .owner = THIS_MODULE, 1110 .load_super = super_1_load, 1111 .validate_super = super_1_validate, 1112 .sync_super = super_1_sync, 1113 }, 1114 }; 1115 1116 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1117 { 1118 struct list_head *tmp; 1119 mdk_rdev_t *rdev; 1120 1121 ITERATE_RDEV(mddev,rdev,tmp) 1122 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1123 return rdev; 1124 1125 return NULL; 1126 } 1127 1128 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1129 { 1130 struct list_head *tmp; 1131 mdk_rdev_t *rdev; 1132 1133 ITERATE_RDEV(mddev1,rdev,tmp) 1134 if (match_dev_unit(mddev2, rdev)) 1135 return 1; 1136 1137 return 0; 1138 } 1139 1140 static LIST_HEAD(pending_raid_disks); 1141 1142 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1143 { 1144 mdk_rdev_t *same_pdev; 1145 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1146 1147 if (rdev->mddev) { 1148 MD_BUG(); 1149 return -EINVAL; 1150 } 1151 same_pdev = match_dev_unit(mddev, rdev); 1152 if (same_pdev) 1153 printk(KERN_WARNING 1154 "%s: WARNING: %s appears to be on the same physical" 1155 " disk as %s. True\n protection against single-disk" 1156 " failure might be compromised.\n", 1157 mdname(mddev), bdevname(rdev->bdev,b), 1158 bdevname(same_pdev->bdev,b2)); 1159 1160 /* Verify rdev->desc_nr is unique. 1161 * If it is -1, assign a free number, else 1162 * check number is not in use 1163 */ 1164 if (rdev->desc_nr < 0) { 1165 int choice = 0; 1166 if (mddev->pers) choice = mddev->raid_disks; 1167 while (find_rdev_nr(mddev, choice)) 1168 choice++; 1169 rdev->desc_nr = choice; 1170 } else { 1171 if (find_rdev_nr(mddev, rdev->desc_nr)) 1172 return -EBUSY; 1173 } 1174 1175 list_add(&rdev->same_set, &mddev->disks); 1176 rdev->mddev = mddev; 1177 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1178 1179 rdev->kobj.k_name = NULL; 1180 snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); 1181 rdev->kobj.parent = &mddev->kobj; 1182 kobject_add(&rdev->kobj); 1183 1184 sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); 1185 return 0; 1186 } 1187 1188 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1189 { 1190 char b[BDEVNAME_SIZE]; 1191 if (!rdev->mddev) { 1192 MD_BUG(); 1193 return; 1194 } 1195 list_del_init(&rdev->same_set); 1196 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1197 rdev->mddev = NULL; 1198 sysfs_remove_link(&rdev->kobj, "block"); 1199 kobject_del(&rdev->kobj); 1200 } 1201 1202 /* 1203 * prevent the device from being mounted, repartitioned or 1204 * otherwise reused by a RAID array (or any other kernel 1205 * subsystem), by bd_claiming the device. 1206 */ 1207 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1208 { 1209 int err = 0; 1210 struct block_device *bdev; 1211 char b[BDEVNAME_SIZE]; 1212 1213 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1214 if (IS_ERR(bdev)) { 1215 printk(KERN_ERR "md: could not open %s.\n", 1216 __bdevname(dev, b)); 1217 return PTR_ERR(bdev); 1218 } 1219 err = bd_claim(bdev, rdev); 1220 if (err) { 1221 printk(KERN_ERR "md: could not bd_claim %s.\n", 1222 bdevname(bdev, b)); 1223 blkdev_put(bdev); 1224 return err; 1225 } 1226 rdev->bdev = bdev; 1227 return err; 1228 } 1229 1230 static void unlock_rdev(mdk_rdev_t *rdev) 1231 { 1232 struct block_device *bdev = rdev->bdev; 1233 rdev->bdev = NULL; 1234 if (!bdev) 1235 MD_BUG(); 1236 bd_release(bdev); 1237 blkdev_put(bdev); 1238 } 1239 1240 void md_autodetect_dev(dev_t dev); 1241 1242 static void export_rdev(mdk_rdev_t * rdev) 1243 { 1244 char b[BDEVNAME_SIZE]; 1245 printk(KERN_INFO "md: export_rdev(%s)\n", 1246 bdevname(rdev->bdev,b)); 1247 if (rdev->mddev) 1248 MD_BUG(); 1249 free_disk_sb(rdev); 1250 list_del_init(&rdev->same_set); 1251 #ifndef MODULE 1252 md_autodetect_dev(rdev->bdev->bd_dev); 1253 #endif 1254 unlock_rdev(rdev); 1255 kobject_put(&rdev->kobj); 1256 } 1257 1258 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1259 { 1260 unbind_rdev_from_array(rdev); 1261 export_rdev(rdev); 1262 } 1263 1264 static void export_array(mddev_t *mddev) 1265 { 1266 struct list_head *tmp; 1267 mdk_rdev_t *rdev; 1268 1269 ITERATE_RDEV(mddev,rdev,tmp) { 1270 if (!rdev->mddev) { 1271 MD_BUG(); 1272 continue; 1273 } 1274 kick_rdev_from_array(rdev); 1275 } 1276 if (!list_empty(&mddev->disks)) 1277 MD_BUG(); 1278 mddev->raid_disks = 0; 1279 mddev->major_version = 0; 1280 } 1281 1282 static void print_desc(mdp_disk_t *desc) 1283 { 1284 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1285 desc->major,desc->minor,desc->raid_disk,desc->state); 1286 } 1287 1288 static void print_sb(mdp_super_t *sb) 1289 { 1290 int i; 1291 1292 printk(KERN_INFO 1293 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1294 sb->major_version, sb->minor_version, sb->patch_version, 1295 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1296 sb->ctime); 1297 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1298 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1299 sb->md_minor, sb->layout, sb->chunk_size); 1300 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1301 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1302 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1303 sb->failed_disks, sb->spare_disks, 1304 sb->sb_csum, (unsigned long)sb->events_lo); 1305 1306 printk(KERN_INFO); 1307 for (i = 0; i < MD_SB_DISKS; i++) { 1308 mdp_disk_t *desc; 1309 1310 desc = sb->disks + i; 1311 if (desc->number || desc->major || desc->minor || 1312 desc->raid_disk || (desc->state && (desc->state != 4))) { 1313 printk(" D %2d: ", i); 1314 print_desc(desc); 1315 } 1316 } 1317 printk(KERN_INFO "md: THIS: "); 1318 print_desc(&sb->this_disk); 1319 1320 } 1321 1322 static void print_rdev(mdk_rdev_t *rdev) 1323 { 1324 char b[BDEVNAME_SIZE]; 1325 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1326 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1327 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1328 rdev->desc_nr); 1329 if (rdev->sb_loaded) { 1330 printk(KERN_INFO "md: rdev superblock:\n"); 1331 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1332 } else 1333 printk(KERN_INFO "md: no rdev superblock!\n"); 1334 } 1335 1336 void md_print_devices(void) 1337 { 1338 struct list_head *tmp, *tmp2; 1339 mdk_rdev_t *rdev; 1340 mddev_t *mddev; 1341 char b[BDEVNAME_SIZE]; 1342 1343 printk("\n"); 1344 printk("md: **********************************\n"); 1345 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1346 printk("md: **********************************\n"); 1347 ITERATE_MDDEV(mddev,tmp) { 1348 1349 if (mddev->bitmap) 1350 bitmap_print_sb(mddev->bitmap); 1351 else 1352 printk("%s: ", mdname(mddev)); 1353 ITERATE_RDEV(mddev,rdev,tmp2) 1354 printk("<%s>", bdevname(rdev->bdev,b)); 1355 printk("\n"); 1356 1357 ITERATE_RDEV(mddev,rdev,tmp2) 1358 print_rdev(rdev); 1359 } 1360 printk("md: **********************************\n"); 1361 printk("\n"); 1362 } 1363 1364 1365 static void sync_sbs(mddev_t * mddev) 1366 { 1367 mdk_rdev_t *rdev; 1368 struct list_head *tmp; 1369 1370 ITERATE_RDEV(mddev,rdev,tmp) { 1371 super_types[mddev->major_version]. 1372 sync_super(mddev, rdev); 1373 rdev->sb_loaded = 1; 1374 } 1375 } 1376 1377 static void md_update_sb(mddev_t * mddev) 1378 { 1379 int err; 1380 struct list_head *tmp; 1381 mdk_rdev_t *rdev; 1382 int sync_req; 1383 1384 repeat: 1385 spin_lock(&mddev->write_lock); 1386 sync_req = mddev->in_sync; 1387 mddev->utime = get_seconds(); 1388 mddev->events ++; 1389 1390 if (!mddev->events) { 1391 /* 1392 * oops, this 64-bit counter should never wrap. 1393 * Either we are in around ~1 trillion A.C., assuming 1394 * 1 reboot per second, or we have a bug: 1395 */ 1396 MD_BUG(); 1397 mddev->events --; 1398 } 1399 mddev->sb_dirty = 2; 1400 sync_sbs(mddev); 1401 1402 /* 1403 * do not write anything to disk if using 1404 * nonpersistent superblocks 1405 */ 1406 if (!mddev->persistent) { 1407 mddev->sb_dirty = 0; 1408 spin_unlock(&mddev->write_lock); 1409 wake_up(&mddev->sb_wait); 1410 return; 1411 } 1412 spin_unlock(&mddev->write_lock); 1413 1414 dprintk(KERN_INFO 1415 "md: updating %s RAID superblock on device (in sync %d)\n", 1416 mdname(mddev),mddev->in_sync); 1417 1418 err = bitmap_update_sb(mddev->bitmap); 1419 ITERATE_RDEV(mddev,rdev,tmp) { 1420 char b[BDEVNAME_SIZE]; 1421 dprintk(KERN_INFO "md: "); 1422 if (test_bit(Faulty, &rdev->flags)) 1423 dprintk("(skipping faulty "); 1424 1425 dprintk("%s ", bdevname(rdev->bdev,b)); 1426 if (!test_bit(Faulty, &rdev->flags)) { 1427 md_super_write(mddev,rdev, 1428 rdev->sb_offset<<1, rdev->sb_size, 1429 rdev->sb_page); 1430 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1431 bdevname(rdev->bdev,b), 1432 (unsigned long long)rdev->sb_offset); 1433 1434 } else 1435 dprintk(")\n"); 1436 if (mddev->level == LEVEL_MULTIPATH) 1437 /* only need to write one superblock... */ 1438 break; 1439 } 1440 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1441 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1442 1443 spin_lock(&mddev->write_lock); 1444 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1445 /* have to write it out again */ 1446 spin_unlock(&mddev->write_lock); 1447 goto repeat; 1448 } 1449 mddev->sb_dirty = 0; 1450 spin_unlock(&mddev->write_lock); 1451 wake_up(&mddev->sb_wait); 1452 1453 } 1454 1455 struct rdev_sysfs_entry { 1456 struct attribute attr; 1457 ssize_t (*show)(mdk_rdev_t *, char *); 1458 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1459 }; 1460 1461 static ssize_t 1462 rdev_show_state(mdk_rdev_t *rdev, char *page) 1463 { 1464 char *sep = ""; 1465 int len=0; 1466 1467 if (test_bit(Faulty, &rdev->flags)) { 1468 len+= sprintf(page+len, "%sfaulty",sep); 1469 sep = ","; 1470 } 1471 if (test_bit(In_sync, &rdev->flags)) { 1472 len += sprintf(page+len, "%sin_sync",sep); 1473 sep = ","; 1474 } 1475 if (!test_bit(Faulty, &rdev->flags) && 1476 !test_bit(In_sync, &rdev->flags)) { 1477 len += sprintf(page+len, "%sspare", sep); 1478 sep = ","; 1479 } 1480 return len+sprintf(page+len, "\n"); 1481 } 1482 1483 static struct rdev_sysfs_entry rdev_state = { 1484 .attr = {.name = "state", .mode = S_IRUGO }, 1485 .show = rdev_show_state, 1486 }; 1487 1488 static ssize_t 1489 rdev_show_super(mdk_rdev_t *rdev, char *page) 1490 { 1491 if (rdev->sb_loaded && rdev->sb_size) { 1492 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1493 return rdev->sb_size; 1494 } else 1495 return 0; 1496 } 1497 static struct rdev_sysfs_entry rdev_super = { 1498 .attr = {.name = "super", .mode = S_IRUGO }, 1499 .show = rdev_show_super, 1500 }; 1501 static struct attribute *rdev_default_attrs[] = { 1502 &rdev_state.attr, 1503 &rdev_super.attr, 1504 NULL, 1505 }; 1506 static ssize_t 1507 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1508 { 1509 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1510 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1511 1512 if (!entry->show) 1513 return -EIO; 1514 return entry->show(rdev, page); 1515 } 1516 1517 static ssize_t 1518 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1519 const char *page, size_t length) 1520 { 1521 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1522 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1523 1524 if (!entry->store) 1525 return -EIO; 1526 return entry->store(rdev, page, length); 1527 } 1528 1529 static void rdev_free(struct kobject *ko) 1530 { 1531 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1532 kfree(rdev); 1533 } 1534 static struct sysfs_ops rdev_sysfs_ops = { 1535 .show = rdev_attr_show, 1536 .store = rdev_attr_store, 1537 }; 1538 static struct kobj_type rdev_ktype = { 1539 .release = rdev_free, 1540 .sysfs_ops = &rdev_sysfs_ops, 1541 .default_attrs = rdev_default_attrs, 1542 }; 1543 1544 /* 1545 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1546 * 1547 * mark the device faulty if: 1548 * 1549 * - the device is nonexistent (zero size) 1550 * - the device has no valid superblock 1551 * 1552 * a faulty rdev _never_ has rdev->sb set. 1553 */ 1554 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1555 { 1556 char b[BDEVNAME_SIZE]; 1557 int err; 1558 mdk_rdev_t *rdev; 1559 sector_t size; 1560 1561 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1562 if (!rdev) { 1563 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1564 return ERR_PTR(-ENOMEM); 1565 } 1566 memset(rdev, 0, sizeof(*rdev)); 1567 1568 if ((err = alloc_disk_sb(rdev))) 1569 goto abort_free; 1570 1571 err = lock_rdev(rdev, newdev); 1572 if (err) 1573 goto abort_free; 1574 1575 rdev->kobj.parent = NULL; 1576 rdev->kobj.ktype = &rdev_ktype; 1577 kobject_init(&rdev->kobj); 1578 1579 rdev->desc_nr = -1; 1580 rdev->flags = 0; 1581 rdev->data_offset = 0; 1582 atomic_set(&rdev->nr_pending, 0); 1583 atomic_set(&rdev->read_errors, 0); 1584 1585 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1586 if (!size) { 1587 printk(KERN_WARNING 1588 "md: %s has zero or unknown size, marking faulty!\n", 1589 bdevname(rdev->bdev,b)); 1590 err = -EINVAL; 1591 goto abort_free; 1592 } 1593 1594 if (super_format >= 0) { 1595 err = super_types[super_format]. 1596 load_super(rdev, NULL, super_minor); 1597 if (err == -EINVAL) { 1598 printk(KERN_WARNING 1599 "md: %s has invalid sb, not importing!\n", 1600 bdevname(rdev->bdev,b)); 1601 goto abort_free; 1602 } 1603 if (err < 0) { 1604 printk(KERN_WARNING 1605 "md: could not read %s's sb, not importing!\n", 1606 bdevname(rdev->bdev,b)); 1607 goto abort_free; 1608 } 1609 } 1610 INIT_LIST_HEAD(&rdev->same_set); 1611 1612 return rdev; 1613 1614 abort_free: 1615 if (rdev->sb_page) { 1616 if (rdev->bdev) 1617 unlock_rdev(rdev); 1618 free_disk_sb(rdev); 1619 } 1620 kfree(rdev); 1621 return ERR_PTR(err); 1622 } 1623 1624 /* 1625 * Check a full RAID array for plausibility 1626 */ 1627 1628 1629 static void analyze_sbs(mddev_t * mddev) 1630 { 1631 int i; 1632 struct list_head *tmp; 1633 mdk_rdev_t *rdev, *freshest; 1634 char b[BDEVNAME_SIZE]; 1635 1636 freshest = NULL; 1637 ITERATE_RDEV(mddev,rdev,tmp) 1638 switch (super_types[mddev->major_version]. 1639 load_super(rdev, freshest, mddev->minor_version)) { 1640 case 1: 1641 freshest = rdev; 1642 break; 1643 case 0: 1644 break; 1645 default: 1646 printk( KERN_ERR \ 1647 "md: fatal superblock inconsistency in %s" 1648 " -- removing from array\n", 1649 bdevname(rdev->bdev,b)); 1650 kick_rdev_from_array(rdev); 1651 } 1652 1653 1654 super_types[mddev->major_version]. 1655 validate_super(mddev, freshest); 1656 1657 i = 0; 1658 ITERATE_RDEV(mddev,rdev,tmp) { 1659 if (rdev != freshest) 1660 if (super_types[mddev->major_version]. 1661 validate_super(mddev, rdev)) { 1662 printk(KERN_WARNING "md: kicking non-fresh %s" 1663 " from array!\n", 1664 bdevname(rdev->bdev,b)); 1665 kick_rdev_from_array(rdev); 1666 continue; 1667 } 1668 if (mddev->level == LEVEL_MULTIPATH) { 1669 rdev->desc_nr = i++; 1670 rdev->raid_disk = rdev->desc_nr; 1671 set_bit(In_sync, &rdev->flags); 1672 } 1673 } 1674 1675 1676 1677 if (mddev->recovery_cp != MaxSector && 1678 mddev->level >= 1) 1679 printk(KERN_ERR "md: %s: raid array is not clean" 1680 " -- starting background reconstruction\n", 1681 mdname(mddev)); 1682 1683 } 1684 1685 static ssize_t 1686 md_show_level(mddev_t *mddev, char *page) 1687 { 1688 mdk_personality_t *p = mddev->pers; 1689 if (p == NULL) 1690 return 0; 1691 if (mddev->level >= 0) 1692 return sprintf(page, "RAID-%d\n", mddev->level); 1693 else 1694 return sprintf(page, "%s\n", p->name); 1695 } 1696 1697 static struct md_sysfs_entry md_level = { 1698 .attr = {.name = "level", .mode = S_IRUGO }, 1699 .show = md_show_level, 1700 }; 1701 1702 static ssize_t 1703 md_show_rdisks(mddev_t *mddev, char *page) 1704 { 1705 return sprintf(page, "%d\n", mddev->raid_disks); 1706 } 1707 1708 static struct md_sysfs_entry md_raid_disks = { 1709 .attr = {.name = "raid_disks", .mode = S_IRUGO }, 1710 .show = md_show_rdisks, 1711 }; 1712 1713 static ssize_t 1714 md_show_scan(mddev_t *mddev, char *page) 1715 { 1716 char *type = "none"; 1717 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1718 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 1719 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1720 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1721 type = "resync"; 1722 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1723 type = "check"; 1724 else 1725 type = "repair"; 1726 } else 1727 type = "recover"; 1728 } 1729 return sprintf(page, "%s\n", type); 1730 } 1731 1732 static ssize_t 1733 md_store_scan(mddev_t *mddev, const char *page, size_t len) 1734 { 1735 int canscan=0; 1736 1737 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1738 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 1739 return -EBUSY; 1740 down(&mddev->reconfig_sem); 1741 if (mddev->pers && mddev->pers->sync_request) 1742 canscan=1; 1743 up(&mddev->reconfig_sem); 1744 if (!canscan) 1745 return -EINVAL; 1746 1747 if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) 1748 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 1749 else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) 1750 return -EINVAL; 1751 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 1752 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 1753 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1754 md_wakeup_thread(mddev->thread); 1755 return len; 1756 } 1757 1758 static ssize_t 1759 md_show_mismatch(mddev_t *mddev, char *page) 1760 { 1761 return sprintf(page, "%llu\n", 1762 (unsigned long long) mddev->resync_mismatches); 1763 } 1764 1765 static struct md_sysfs_entry md_scan_mode = { 1766 .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, 1767 .show = md_show_scan, 1768 .store = md_store_scan, 1769 }; 1770 1771 static struct md_sysfs_entry md_mismatches = { 1772 .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, 1773 .show = md_show_mismatch, 1774 }; 1775 1776 static struct attribute *md_default_attrs[] = { 1777 &md_level.attr, 1778 &md_raid_disks.attr, 1779 &md_scan_mode.attr, 1780 &md_mismatches.attr, 1781 NULL, 1782 }; 1783 1784 static ssize_t 1785 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1786 { 1787 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1788 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 1789 1790 if (!entry->show) 1791 return -EIO; 1792 return entry->show(mddev, page); 1793 } 1794 1795 static ssize_t 1796 md_attr_store(struct kobject *kobj, struct attribute *attr, 1797 const char *page, size_t length) 1798 { 1799 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1800 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 1801 1802 if (!entry->store) 1803 return -EIO; 1804 return entry->store(mddev, page, length); 1805 } 1806 1807 static void md_free(struct kobject *ko) 1808 { 1809 mddev_t *mddev = container_of(ko, mddev_t, kobj); 1810 kfree(mddev); 1811 } 1812 1813 static struct sysfs_ops md_sysfs_ops = { 1814 .show = md_attr_show, 1815 .store = md_attr_store, 1816 }; 1817 static struct kobj_type md_ktype = { 1818 .release = md_free, 1819 .sysfs_ops = &md_sysfs_ops, 1820 .default_attrs = md_default_attrs, 1821 }; 1822 1823 int mdp_major = 0; 1824 1825 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1826 { 1827 static DECLARE_MUTEX(disks_sem); 1828 mddev_t *mddev = mddev_find(dev); 1829 struct gendisk *disk; 1830 int partitioned = (MAJOR(dev) != MD_MAJOR); 1831 int shift = partitioned ? MdpMinorShift : 0; 1832 int unit = MINOR(dev) >> shift; 1833 1834 if (!mddev) 1835 return NULL; 1836 1837 down(&disks_sem); 1838 if (mddev->gendisk) { 1839 up(&disks_sem); 1840 mddev_put(mddev); 1841 return NULL; 1842 } 1843 disk = alloc_disk(1 << shift); 1844 if (!disk) { 1845 up(&disks_sem); 1846 mddev_put(mddev); 1847 return NULL; 1848 } 1849 disk->major = MAJOR(dev); 1850 disk->first_minor = unit << shift; 1851 if (partitioned) { 1852 sprintf(disk->disk_name, "md_d%d", unit); 1853 sprintf(disk->devfs_name, "md/d%d", unit); 1854 } else { 1855 sprintf(disk->disk_name, "md%d", unit); 1856 sprintf(disk->devfs_name, "md/%d", unit); 1857 } 1858 disk->fops = &md_fops; 1859 disk->private_data = mddev; 1860 disk->queue = mddev->queue; 1861 add_disk(disk); 1862 mddev->gendisk = disk; 1863 up(&disks_sem); 1864 mddev->kobj.parent = &disk->kobj; 1865 mddev->kobj.k_name = NULL; 1866 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 1867 mddev->kobj.ktype = &md_ktype; 1868 kobject_register(&mddev->kobj); 1869 return NULL; 1870 } 1871 1872 void md_wakeup_thread(mdk_thread_t *thread); 1873 1874 static void md_safemode_timeout(unsigned long data) 1875 { 1876 mddev_t *mddev = (mddev_t *) data; 1877 1878 mddev->safemode = 1; 1879 md_wakeup_thread(mddev->thread); 1880 } 1881 1882 1883 static int do_md_run(mddev_t * mddev) 1884 { 1885 int pnum, err; 1886 int chunk_size; 1887 struct list_head *tmp; 1888 mdk_rdev_t *rdev; 1889 struct gendisk *disk; 1890 char b[BDEVNAME_SIZE]; 1891 1892 if (list_empty(&mddev->disks)) 1893 /* cannot run an array with no devices.. */ 1894 return -EINVAL; 1895 1896 if (mddev->pers) 1897 return -EBUSY; 1898 1899 /* 1900 * Analyze all RAID superblock(s) 1901 */ 1902 if (!mddev->raid_disks) 1903 analyze_sbs(mddev); 1904 1905 chunk_size = mddev->chunk_size; 1906 pnum = level_to_pers(mddev->level); 1907 1908 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1909 if (!chunk_size) { 1910 /* 1911 * 'default chunksize' in the old md code used to 1912 * be PAGE_SIZE, baaad. 1913 * we abort here to be on the safe side. We don't 1914 * want to continue the bad practice. 1915 */ 1916 printk(KERN_ERR 1917 "no chunksize specified, see 'man raidtab'\n"); 1918 return -EINVAL; 1919 } 1920 if (chunk_size > MAX_CHUNK_SIZE) { 1921 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1922 chunk_size, MAX_CHUNK_SIZE); 1923 return -EINVAL; 1924 } 1925 /* 1926 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1927 */ 1928 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1929 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1930 return -EINVAL; 1931 } 1932 if (chunk_size < PAGE_SIZE) { 1933 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1934 chunk_size, PAGE_SIZE); 1935 return -EINVAL; 1936 } 1937 1938 /* devices must have minimum size of one chunk */ 1939 ITERATE_RDEV(mddev,rdev,tmp) { 1940 if (test_bit(Faulty, &rdev->flags)) 1941 continue; 1942 if (rdev->size < chunk_size / 1024) { 1943 printk(KERN_WARNING 1944 "md: Dev %s smaller than chunk_size:" 1945 " %lluk < %dk\n", 1946 bdevname(rdev->bdev,b), 1947 (unsigned long long)rdev->size, 1948 chunk_size / 1024); 1949 return -EINVAL; 1950 } 1951 } 1952 } 1953 1954 #ifdef CONFIG_KMOD 1955 if (!pers[pnum]) 1956 { 1957 request_module("md-personality-%d", pnum); 1958 } 1959 #endif 1960 1961 /* 1962 * Drop all container device buffers, from now on 1963 * the only valid external interface is through the md 1964 * device. 1965 * Also find largest hardsector size 1966 */ 1967 ITERATE_RDEV(mddev,rdev,tmp) { 1968 if (test_bit(Faulty, &rdev->flags)) 1969 continue; 1970 sync_blockdev(rdev->bdev); 1971 invalidate_bdev(rdev->bdev, 0); 1972 } 1973 1974 md_probe(mddev->unit, NULL, NULL); 1975 disk = mddev->gendisk; 1976 if (!disk) 1977 return -ENOMEM; 1978 1979 spin_lock(&pers_lock); 1980 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1981 spin_unlock(&pers_lock); 1982 printk(KERN_WARNING "md: personality %d is not loaded!\n", 1983 pnum); 1984 return -EINVAL; 1985 } 1986 1987 mddev->pers = pers[pnum]; 1988 spin_unlock(&pers_lock); 1989 1990 mddev->recovery = 0; 1991 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1992 1993 /* before we start the array running, initialise the bitmap */ 1994 err = bitmap_create(mddev); 1995 if (err) 1996 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 1997 mdname(mddev), err); 1998 else 1999 err = mddev->pers->run(mddev); 2000 if (err) { 2001 printk(KERN_ERR "md: pers->run() failed ...\n"); 2002 module_put(mddev->pers->owner); 2003 mddev->pers = NULL; 2004 bitmap_destroy(mddev); 2005 return err; 2006 } 2007 atomic_set(&mddev->writes_pending,0); 2008 mddev->safemode = 0; 2009 mddev->safemode_timer.function = md_safemode_timeout; 2010 mddev->safemode_timer.data = (unsigned long) mddev; 2011 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 2012 mddev->in_sync = 1; 2013 2014 ITERATE_RDEV(mddev,rdev,tmp) 2015 if (rdev->raid_disk >= 0) { 2016 char nm[20]; 2017 sprintf(nm, "rd%d", rdev->raid_disk); 2018 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2019 } 2020 2021 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2022 md_wakeup_thread(mddev->thread); 2023 2024 if (mddev->sb_dirty) 2025 md_update_sb(mddev); 2026 2027 set_capacity(disk, mddev->array_size<<1); 2028 2029 /* If we call blk_queue_make_request here, it will 2030 * re-initialise max_sectors etc which may have been 2031 * refined inside -> run. So just set the bits we need to set. 2032 * Most initialisation happended when we called 2033 * blk_queue_make_request(..., md_fail_request) 2034 * earlier. 2035 */ 2036 mddev->queue->queuedata = mddev; 2037 mddev->queue->make_request_fn = mddev->pers->make_request; 2038 2039 mddev->changed = 1; 2040 return 0; 2041 } 2042 2043 static int restart_array(mddev_t *mddev) 2044 { 2045 struct gendisk *disk = mddev->gendisk; 2046 int err; 2047 2048 /* 2049 * Complain if it has no devices 2050 */ 2051 err = -ENXIO; 2052 if (list_empty(&mddev->disks)) 2053 goto out; 2054 2055 if (mddev->pers) { 2056 err = -EBUSY; 2057 if (!mddev->ro) 2058 goto out; 2059 2060 mddev->safemode = 0; 2061 mddev->ro = 0; 2062 set_disk_ro(disk, 0); 2063 2064 printk(KERN_INFO "md: %s switched to read-write mode.\n", 2065 mdname(mddev)); 2066 /* 2067 * Kick recovery or resync if necessary 2068 */ 2069 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2070 md_wakeup_thread(mddev->thread); 2071 err = 0; 2072 } else { 2073 printk(KERN_ERR "md: %s has no personality assigned.\n", 2074 mdname(mddev)); 2075 err = -EINVAL; 2076 } 2077 2078 out: 2079 return err; 2080 } 2081 2082 static int do_md_stop(mddev_t * mddev, int ro) 2083 { 2084 int err = 0; 2085 struct gendisk *disk = mddev->gendisk; 2086 2087 if (mddev->pers) { 2088 if (atomic_read(&mddev->active)>2) { 2089 printk("md: %s still in use.\n",mdname(mddev)); 2090 return -EBUSY; 2091 } 2092 2093 if (mddev->sync_thread) { 2094 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2095 md_unregister_thread(mddev->sync_thread); 2096 mddev->sync_thread = NULL; 2097 } 2098 2099 del_timer_sync(&mddev->safemode_timer); 2100 2101 invalidate_partition(disk, 0); 2102 2103 if (ro) { 2104 err = -ENXIO; 2105 if (mddev->ro) 2106 goto out; 2107 mddev->ro = 1; 2108 } else { 2109 bitmap_flush(mddev); 2110 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 2111 if (mddev->ro) 2112 set_disk_ro(disk, 0); 2113 blk_queue_make_request(mddev->queue, md_fail_request); 2114 mddev->pers->stop(mddev); 2115 module_put(mddev->pers->owner); 2116 mddev->pers = NULL; 2117 if (mddev->ro) 2118 mddev->ro = 0; 2119 } 2120 if (!mddev->in_sync) { 2121 /* mark array as shutdown cleanly */ 2122 mddev->in_sync = 1; 2123 md_update_sb(mddev); 2124 } 2125 if (ro) 2126 set_disk_ro(disk, 1); 2127 } 2128 2129 bitmap_destroy(mddev); 2130 if (mddev->bitmap_file) { 2131 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2132 fput(mddev->bitmap_file); 2133 mddev->bitmap_file = NULL; 2134 } 2135 mddev->bitmap_offset = 0; 2136 2137 /* 2138 * Free resources if final stop 2139 */ 2140 if (!ro) { 2141 mdk_rdev_t *rdev; 2142 struct list_head *tmp; 2143 struct gendisk *disk; 2144 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2145 2146 ITERATE_RDEV(mddev,rdev,tmp) 2147 if (rdev->raid_disk >= 0) { 2148 char nm[20]; 2149 sprintf(nm, "rd%d", rdev->raid_disk); 2150 sysfs_remove_link(&mddev->kobj, nm); 2151 } 2152 2153 export_array(mddev); 2154 2155 mddev->array_size = 0; 2156 disk = mddev->gendisk; 2157 if (disk) 2158 set_capacity(disk, 0); 2159 mddev->changed = 1; 2160 } else 2161 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2162 mdname(mddev)); 2163 err = 0; 2164 out: 2165 return err; 2166 } 2167 2168 static void autorun_array(mddev_t *mddev) 2169 { 2170 mdk_rdev_t *rdev; 2171 struct list_head *tmp; 2172 int err; 2173 2174 if (list_empty(&mddev->disks)) 2175 return; 2176 2177 printk(KERN_INFO "md: running: "); 2178 2179 ITERATE_RDEV(mddev,rdev,tmp) { 2180 char b[BDEVNAME_SIZE]; 2181 printk("<%s>", bdevname(rdev->bdev,b)); 2182 } 2183 printk("\n"); 2184 2185 err = do_md_run (mddev); 2186 if (err) { 2187 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2188 do_md_stop (mddev, 0); 2189 } 2190 } 2191 2192 /* 2193 * lets try to run arrays based on all disks that have arrived 2194 * until now. (those are in pending_raid_disks) 2195 * 2196 * the method: pick the first pending disk, collect all disks with 2197 * the same UUID, remove all from the pending list and put them into 2198 * the 'same_array' list. Then order this list based on superblock 2199 * update time (freshest comes first), kick out 'old' disks and 2200 * compare superblocks. If everything's fine then run it. 2201 * 2202 * If "unit" is allocated, then bump its reference count 2203 */ 2204 static void autorun_devices(int part) 2205 { 2206 struct list_head candidates; 2207 struct list_head *tmp; 2208 mdk_rdev_t *rdev0, *rdev; 2209 mddev_t *mddev; 2210 char b[BDEVNAME_SIZE]; 2211 2212 printk(KERN_INFO "md: autorun ...\n"); 2213 while (!list_empty(&pending_raid_disks)) { 2214 dev_t dev; 2215 rdev0 = list_entry(pending_raid_disks.next, 2216 mdk_rdev_t, same_set); 2217 2218 printk(KERN_INFO "md: considering %s ...\n", 2219 bdevname(rdev0->bdev,b)); 2220 INIT_LIST_HEAD(&candidates); 2221 ITERATE_RDEV_PENDING(rdev,tmp) 2222 if (super_90_load(rdev, rdev0, 0) >= 0) { 2223 printk(KERN_INFO "md: adding %s ...\n", 2224 bdevname(rdev->bdev,b)); 2225 list_move(&rdev->same_set, &candidates); 2226 } 2227 /* 2228 * now we have a set of devices, with all of them having 2229 * mostly sane superblocks. It's time to allocate the 2230 * mddev. 2231 */ 2232 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2233 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2234 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2235 break; 2236 } 2237 if (part) 2238 dev = MKDEV(mdp_major, 2239 rdev0->preferred_minor << MdpMinorShift); 2240 else 2241 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2242 2243 md_probe(dev, NULL, NULL); 2244 mddev = mddev_find(dev); 2245 if (!mddev) { 2246 printk(KERN_ERR 2247 "md: cannot allocate memory for md drive.\n"); 2248 break; 2249 } 2250 if (mddev_lock(mddev)) 2251 printk(KERN_WARNING "md: %s locked, cannot run\n", 2252 mdname(mddev)); 2253 else if (mddev->raid_disks || mddev->major_version 2254 || !list_empty(&mddev->disks)) { 2255 printk(KERN_WARNING 2256 "md: %s already running, cannot run %s\n", 2257 mdname(mddev), bdevname(rdev0->bdev,b)); 2258 mddev_unlock(mddev); 2259 } else { 2260 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2261 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2262 list_del_init(&rdev->same_set); 2263 if (bind_rdev_to_array(rdev, mddev)) 2264 export_rdev(rdev); 2265 } 2266 autorun_array(mddev); 2267 mddev_unlock(mddev); 2268 } 2269 /* on success, candidates will be empty, on error 2270 * it won't... 2271 */ 2272 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2273 export_rdev(rdev); 2274 mddev_put(mddev); 2275 } 2276 printk(KERN_INFO "md: ... autorun DONE.\n"); 2277 } 2278 2279 /* 2280 * import RAID devices based on one partition 2281 * if possible, the array gets run as well. 2282 */ 2283 2284 static int autostart_array(dev_t startdev) 2285 { 2286 char b[BDEVNAME_SIZE]; 2287 int err = -EINVAL, i; 2288 mdp_super_t *sb = NULL; 2289 mdk_rdev_t *start_rdev = NULL, *rdev; 2290 2291 start_rdev = md_import_device(startdev, 0, 0); 2292 if (IS_ERR(start_rdev)) 2293 return err; 2294 2295 2296 /* NOTE: this can only work for 0.90.0 superblocks */ 2297 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2298 if (sb->major_version != 0 || 2299 sb->minor_version != 90 ) { 2300 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2301 export_rdev(start_rdev); 2302 return err; 2303 } 2304 2305 if (test_bit(Faulty, &start_rdev->flags)) { 2306 printk(KERN_WARNING 2307 "md: can not autostart based on faulty %s!\n", 2308 bdevname(start_rdev->bdev,b)); 2309 export_rdev(start_rdev); 2310 return err; 2311 } 2312 list_add(&start_rdev->same_set, &pending_raid_disks); 2313 2314 for (i = 0; i < MD_SB_DISKS; i++) { 2315 mdp_disk_t *desc = sb->disks + i; 2316 dev_t dev = MKDEV(desc->major, desc->minor); 2317 2318 if (!dev) 2319 continue; 2320 if (dev == startdev) 2321 continue; 2322 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2323 continue; 2324 rdev = md_import_device(dev, 0, 0); 2325 if (IS_ERR(rdev)) 2326 continue; 2327 2328 list_add(&rdev->same_set, &pending_raid_disks); 2329 } 2330 2331 /* 2332 * possibly return codes 2333 */ 2334 autorun_devices(0); 2335 return 0; 2336 2337 } 2338 2339 2340 static int get_version(void __user * arg) 2341 { 2342 mdu_version_t ver; 2343 2344 ver.major = MD_MAJOR_VERSION; 2345 ver.minor = MD_MINOR_VERSION; 2346 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2347 2348 if (copy_to_user(arg, &ver, sizeof(ver))) 2349 return -EFAULT; 2350 2351 return 0; 2352 } 2353 2354 static int get_array_info(mddev_t * mddev, void __user * arg) 2355 { 2356 mdu_array_info_t info; 2357 int nr,working,active,failed,spare; 2358 mdk_rdev_t *rdev; 2359 struct list_head *tmp; 2360 2361 nr=working=active=failed=spare=0; 2362 ITERATE_RDEV(mddev,rdev,tmp) { 2363 nr++; 2364 if (test_bit(Faulty, &rdev->flags)) 2365 failed++; 2366 else { 2367 working++; 2368 if (test_bit(In_sync, &rdev->flags)) 2369 active++; 2370 else 2371 spare++; 2372 } 2373 } 2374 2375 info.major_version = mddev->major_version; 2376 info.minor_version = mddev->minor_version; 2377 info.patch_version = MD_PATCHLEVEL_VERSION; 2378 info.ctime = mddev->ctime; 2379 info.level = mddev->level; 2380 info.size = mddev->size; 2381 info.nr_disks = nr; 2382 info.raid_disks = mddev->raid_disks; 2383 info.md_minor = mddev->md_minor; 2384 info.not_persistent= !mddev->persistent; 2385 2386 info.utime = mddev->utime; 2387 info.state = 0; 2388 if (mddev->in_sync) 2389 info.state = (1<<MD_SB_CLEAN); 2390 if (mddev->bitmap && mddev->bitmap_offset) 2391 info.state = (1<<MD_SB_BITMAP_PRESENT); 2392 info.active_disks = active; 2393 info.working_disks = working; 2394 info.failed_disks = failed; 2395 info.spare_disks = spare; 2396 2397 info.layout = mddev->layout; 2398 info.chunk_size = mddev->chunk_size; 2399 2400 if (copy_to_user(arg, &info, sizeof(info))) 2401 return -EFAULT; 2402 2403 return 0; 2404 } 2405 2406 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2407 { 2408 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2409 char *ptr, *buf = NULL; 2410 int err = -ENOMEM; 2411 2412 file = kmalloc(sizeof(*file), GFP_KERNEL); 2413 if (!file) 2414 goto out; 2415 2416 /* bitmap disabled, zero the first byte and copy out */ 2417 if (!mddev->bitmap || !mddev->bitmap->file) { 2418 file->pathname[0] = '\0'; 2419 goto copy_out; 2420 } 2421 2422 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2423 if (!buf) 2424 goto out; 2425 2426 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2427 if (!ptr) 2428 goto out; 2429 2430 strcpy(file->pathname, ptr); 2431 2432 copy_out: 2433 err = 0; 2434 if (copy_to_user(arg, file, sizeof(*file))) 2435 err = -EFAULT; 2436 out: 2437 kfree(buf); 2438 kfree(file); 2439 return err; 2440 } 2441 2442 static int get_disk_info(mddev_t * mddev, void __user * arg) 2443 { 2444 mdu_disk_info_t info; 2445 unsigned int nr; 2446 mdk_rdev_t *rdev; 2447 2448 if (copy_from_user(&info, arg, sizeof(info))) 2449 return -EFAULT; 2450 2451 nr = info.number; 2452 2453 rdev = find_rdev_nr(mddev, nr); 2454 if (rdev) { 2455 info.major = MAJOR(rdev->bdev->bd_dev); 2456 info.minor = MINOR(rdev->bdev->bd_dev); 2457 info.raid_disk = rdev->raid_disk; 2458 info.state = 0; 2459 if (test_bit(Faulty, &rdev->flags)) 2460 info.state |= (1<<MD_DISK_FAULTY); 2461 else if (test_bit(In_sync, &rdev->flags)) { 2462 info.state |= (1<<MD_DISK_ACTIVE); 2463 info.state |= (1<<MD_DISK_SYNC); 2464 } 2465 if (test_bit(WriteMostly, &rdev->flags)) 2466 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2467 } else { 2468 info.major = info.minor = 0; 2469 info.raid_disk = -1; 2470 info.state = (1<<MD_DISK_REMOVED); 2471 } 2472 2473 if (copy_to_user(arg, &info, sizeof(info))) 2474 return -EFAULT; 2475 2476 return 0; 2477 } 2478 2479 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2480 { 2481 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2482 mdk_rdev_t *rdev; 2483 dev_t dev = MKDEV(info->major,info->minor); 2484 2485 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2486 return -EOVERFLOW; 2487 2488 if (!mddev->raid_disks) { 2489 int err; 2490 /* expecting a device which has a superblock */ 2491 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2492 if (IS_ERR(rdev)) { 2493 printk(KERN_WARNING 2494 "md: md_import_device returned %ld\n", 2495 PTR_ERR(rdev)); 2496 return PTR_ERR(rdev); 2497 } 2498 if (!list_empty(&mddev->disks)) { 2499 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2500 mdk_rdev_t, same_set); 2501 int err = super_types[mddev->major_version] 2502 .load_super(rdev, rdev0, mddev->minor_version); 2503 if (err < 0) { 2504 printk(KERN_WARNING 2505 "md: %s has different UUID to %s\n", 2506 bdevname(rdev->bdev,b), 2507 bdevname(rdev0->bdev,b2)); 2508 export_rdev(rdev); 2509 return -EINVAL; 2510 } 2511 } 2512 err = bind_rdev_to_array(rdev, mddev); 2513 if (err) 2514 export_rdev(rdev); 2515 return err; 2516 } 2517 2518 /* 2519 * add_new_disk can be used once the array is assembled 2520 * to add "hot spares". They must already have a superblock 2521 * written 2522 */ 2523 if (mddev->pers) { 2524 int err; 2525 if (!mddev->pers->hot_add_disk) { 2526 printk(KERN_WARNING 2527 "%s: personality does not support diskops!\n", 2528 mdname(mddev)); 2529 return -EINVAL; 2530 } 2531 if (mddev->persistent) 2532 rdev = md_import_device(dev, mddev->major_version, 2533 mddev->minor_version); 2534 else 2535 rdev = md_import_device(dev, -1, -1); 2536 if (IS_ERR(rdev)) { 2537 printk(KERN_WARNING 2538 "md: md_import_device returned %ld\n", 2539 PTR_ERR(rdev)); 2540 return PTR_ERR(rdev); 2541 } 2542 /* set save_raid_disk if appropriate */ 2543 if (!mddev->persistent) { 2544 if (info->state & (1<<MD_DISK_SYNC) && 2545 info->raid_disk < mddev->raid_disks) 2546 rdev->raid_disk = info->raid_disk; 2547 else 2548 rdev->raid_disk = -1; 2549 } else 2550 super_types[mddev->major_version]. 2551 validate_super(mddev, rdev); 2552 rdev->saved_raid_disk = rdev->raid_disk; 2553 2554 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 2555 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2556 set_bit(WriteMostly, &rdev->flags); 2557 2558 rdev->raid_disk = -1; 2559 err = bind_rdev_to_array(rdev, mddev); 2560 if (err) 2561 export_rdev(rdev); 2562 2563 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2564 md_wakeup_thread(mddev->thread); 2565 return err; 2566 } 2567 2568 /* otherwise, add_new_disk is only allowed 2569 * for major_version==0 superblocks 2570 */ 2571 if (mddev->major_version != 0) { 2572 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2573 mdname(mddev)); 2574 return -EINVAL; 2575 } 2576 2577 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2578 int err; 2579 rdev = md_import_device (dev, -1, 0); 2580 if (IS_ERR(rdev)) { 2581 printk(KERN_WARNING 2582 "md: error, md_import_device() returned %ld\n", 2583 PTR_ERR(rdev)); 2584 return PTR_ERR(rdev); 2585 } 2586 rdev->desc_nr = info->number; 2587 if (info->raid_disk < mddev->raid_disks) 2588 rdev->raid_disk = info->raid_disk; 2589 else 2590 rdev->raid_disk = -1; 2591 2592 rdev->flags = 0; 2593 2594 if (rdev->raid_disk < mddev->raid_disks) 2595 if (info->state & (1<<MD_DISK_SYNC)) 2596 set_bit(In_sync, &rdev->flags); 2597 2598 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2599 set_bit(WriteMostly, &rdev->flags); 2600 2601 err = bind_rdev_to_array(rdev, mddev); 2602 if (err) { 2603 export_rdev(rdev); 2604 return err; 2605 } 2606 2607 if (!mddev->persistent) { 2608 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2609 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2610 } else 2611 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2612 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2613 2614 if (!mddev->size || (mddev->size > rdev->size)) 2615 mddev->size = rdev->size; 2616 } 2617 2618 return 0; 2619 } 2620 2621 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2622 { 2623 char b[BDEVNAME_SIZE]; 2624 mdk_rdev_t *rdev; 2625 2626 if (!mddev->pers) 2627 return -ENODEV; 2628 2629 rdev = find_rdev(mddev, dev); 2630 if (!rdev) 2631 return -ENXIO; 2632 2633 if (rdev->raid_disk >= 0) 2634 goto busy; 2635 2636 kick_rdev_from_array(rdev); 2637 md_update_sb(mddev); 2638 2639 return 0; 2640 busy: 2641 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 2642 bdevname(rdev->bdev,b), mdname(mddev)); 2643 return -EBUSY; 2644 } 2645 2646 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2647 { 2648 char b[BDEVNAME_SIZE]; 2649 int err; 2650 unsigned int size; 2651 mdk_rdev_t *rdev; 2652 2653 if (!mddev->pers) 2654 return -ENODEV; 2655 2656 if (mddev->major_version != 0) { 2657 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2658 " version-0 superblocks.\n", 2659 mdname(mddev)); 2660 return -EINVAL; 2661 } 2662 if (!mddev->pers->hot_add_disk) { 2663 printk(KERN_WARNING 2664 "%s: personality does not support diskops!\n", 2665 mdname(mddev)); 2666 return -EINVAL; 2667 } 2668 2669 rdev = md_import_device (dev, -1, 0); 2670 if (IS_ERR(rdev)) { 2671 printk(KERN_WARNING 2672 "md: error, md_import_device() returned %ld\n", 2673 PTR_ERR(rdev)); 2674 return -EINVAL; 2675 } 2676 2677 if (mddev->persistent) 2678 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2679 else 2680 rdev->sb_offset = 2681 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2682 2683 size = calc_dev_size(rdev, mddev->chunk_size); 2684 rdev->size = size; 2685 2686 if (size < mddev->size) { 2687 printk(KERN_WARNING 2688 "%s: disk size %llu blocks < array size %llu\n", 2689 mdname(mddev), (unsigned long long)size, 2690 (unsigned long long)mddev->size); 2691 err = -ENOSPC; 2692 goto abort_export; 2693 } 2694 2695 if (test_bit(Faulty, &rdev->flags)) { 2696 printk(KERN_WARNING 2697 "md: can not hot-add faulty %s disk to %s!\n", 2698 bdevname(rdev->bdev,b), mdname(mddev)); 2699 err = -EINVAL; 2700 goto abort_export; 2701 } 2702 clear_bit(In_sync, &rdev->flags); 2703 rdev->desc_nr = -1; 2704 bind_rdev_to_array(rdev, mddev); 2705 2706 /* 2707 * The rest should better be atomic, we can have disk failures 2708 * noticed in interrupt contexts ... 2709 */ 2710 2711 if (rdev->desc_nr == mddev->max_disks) { 2712 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2713 mdname(mddev)); 2714 err = -EBUSY; 2715 goto abort_unbind_export; 2716 } 2717 2718 rdev->raid_disk = -1; 2719 2720 md_update_sb(mddev); 2721 2722 /* 2723 * Kick recovery, maybe this spare has to be added to the 2724 * array immediately. 2725 */ 2726 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2727 md_wakeup_thread(mddev->thread); 2728 2729 return 0; 2730 2731 abort_unbind_export: 2732 unbind_rdev_from_array(rdev); 2733 2734 abort_export: 2735 export_rdev(rdev); 2736 return err; 2737 } 2738 2739 /* similar to deny_write_access, but accounts for our holding a reference 2740 * to the file ourselves */ 2741 static int deny_bitmap_write_access(struct file * file) 2742 { 2743 struct inode *inode = file->f_mapping->host; 2744 2745 spin_lock(&inode->i_lock); 2746 if (atomic_read(&inode->i_writecount) > 1) { 2747 spin_unlock(&inode->i_lock); 2748 return -ETXTBSY; 2749 } 2750 atomic_set(&inode->i_writecount, -1); 2751 spin_unlock(&inode->i_lock); 2752 2753 return 0; 2754 } 2755 2756 static int set_bitmap_file(mddev_t *mddev, int fd) 2757 { 2758 int err; 2759 2760 if (mddev->pers) { 2761 if (!mddev->pers->quiesce) 2762 return -EBUSY; 2763 if (mddev->recovery || mddev->sync_thread) 2764 return -EBUSY; 2765 /* we should be able to change the bitmap.. */ 2766 } 2767 2768 2769 if (fd >= 0) { 2770 if (mddev->bitmap) 2771 return -EEXIST; /* cannot add when bitmap is present */ 2772 mddev->bitmap_file = fget(fd); 2773 2774 if (mddev->bitmap_file == NULL) { 2775 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2776 mdname(mddev)); 2777 return -EBADF; 2778 } 2779 2780 err = deny_bitmap_write_access(mddev->bitmap_file); 2781 if (err) { 2782 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2783 mdname(mddev)); 2784 fput(mddev->bitmap_file); 2785 mddev->bitmap_file = NULL; 2786 return err; 2787 } 2788 mddev->bitmap_offset = 0; /* file overrides offset */ 2789 } else if (mddev->bitmap == NULL) 2790 return -ENOENT; /* cannot remove what isn't there */ 2791 err = 0; 2792 if (mddev->pers) { 2793 mddev->pers->quiesce(mddev, 1); 2794 if (fd >= 0) 2795 err = bitmap_create(mddev); 2796 if (fd < 0 || err) 2797 bitmap_destroy(mddev); 2798 mddev->pers->quiesce(mddev, 0); 2799 } else if (fd < 0) { 2800 if (mddev->bitmap_file) 2801 fput(mddev->bitmap_file); 2802 mddev->bitmap_file = NULL; 2803 } 2804 2805 return err; 2806 } 2807 2808 /* 2809 * set_array_info is used two different ways 2810 * The original usage is when creating a new array. 2811 * In this usage, raid_disks is > 0 and it together with 2812 * level, size, not_persistent,layout,chunksize determine the 2813 * shape of the array. 2814 * This will always create an array with a type-0.90.0 superblock. 2815 * The newer usage is when assembling an array. 2816 * In this case raid_disks will be 0, and the major_version field is 2817 * use to determine which style super-blocks are to be found on the devices. 2818 * The minor and patch _version numbers are also kept incase the 2819 * super_block handler wishes to interpret them. 2820 */ 2821 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2822 { 2823 2824 if (info->raid_disks == 0) { 2825 /* just setting version number for superblock loading */ 2826 if (info->major_version < 0 || 2827 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2828 super_types[info->major_version].name == NULL) { 2829 /* maybe try to auto-load a module? */ 2830 printk(KERN_INFO 2831 "md: superblock version %d not known\n", 2832 info->major_version); 2833 return -EINVAL; 2834 } 2835 mddev->major_version = info->major_version; 2836 mddev->minor_version = info->minor_version; 2837 mddev->patch_version = info->patch_version; 2838 return 0; 2839 } 2840 mddev->major_version = MD_MAJOR_VERSION; 2841 mddev->minor_version = MD_MINOR_VERSION; 2842 mddev->patch_version = MD_PATCHLEVEL_VERSION; 2843 mddev->ctime = get_seconds(); 2844 2845 mddev->level = info->level; 2846 mddev->size = info->size; 2847 mddev->raid_disks = info->raid_disks; 2848 /* don't set md_minor, it is determined by which /dev/md* was 2849 * openned 2850 */ 2851 if (info->state & (1<<MD_SB_CLEAN)) 2852 mddev->recovery_cp = MaxSector; 2853 else 2854 mddev->recovery_cp = 0; 2855 mddev->persistent = ! info->not_persistent; 2856 2857 mddev->layout = info->layout; 2858 mddev->chunk_size = info->chunk_size; 2859 2860 mddev->max_disks = MD_SB_DISKS; 2861 2862 mddev->sb_dirty = 1; 2863 2864 /* 2865 * Generate a 128 bit UUID 2866 */ 2867 get_random_bytes(mddev->uuid, 16); 2868 2869 return 0; 2870 } 2871 2872 /* 2873 * update_array_info is used to change the configuration of an 2874 * on-line array. 2875 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 2876 * fields in the info are checked against the array. 2877 * Any differences that cannot be handled will cause an error. 2878 * Normally, only one change can be managed at a time. 2879 */ 2880 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2881 { 2882 int rv = 0; 2883 int cnt = 0; 2884 int state = 0; 2885 2886 /* calculate expected state,ignoring low bits */ 2887 if (mddev->bitmap && mddev->bitmap_offset) 2888 state |= (1 << MD_SB_BITMAP_PRESENT); 2889 2890 if (mddev->major_version != info->major_version || 2891 mddev->minor_version != info->minor_version || 2892 /* mddev->patch_version != info->patch_version || */ 2893 mddev->ctime != info->ctime || 2894 mddev->level != info->level || 2895 /* mddev->layout != info->layout || */ 2896 !mddev->persistent != info->not_persistent|| 2897 mddev->chunk_size != info->chunk_size || 2898 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 2899 ((state^info->state) & 0xfffffe00) 2900 ) 2901 return -EINVAL; 2902 /* Check there is only one change */ 2903 if (mddev->size != info->size) cnt++; 2904 if (mddev->raid_disks != info->raid_disks) cnt++; 2905 if (mddev->layout != info->layout) cnt++; 2906 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 2907 if (cnt == 0) return 0; 2908 if (cnt > 1) return -EINVAL; 2909 2910 if (mddev->layout != info->layout) { 2911 /* Change layout 2912 * we don't need to do anything at the md level, the 2913 * personality will take care of it all. 2914 */ 2915 if (mddev->pers->reconfig == NULL) 2916 return -EINVAL; 2917 else 2918 return mddev->pers->reconfig(mddev, info->layout, -1); 2919 } 2920 if (mddev->size != info->size) { 2921 mdk_rdev_t * rdev; 2922 struct list_head *tmp; 2923 if (mddev->pers->resize == NULL) 2924 return -EINVAL; 2925 /* The "size" is the amount of each device that is used. 2926 * This can only make sense for arrays with redundancy. 2927 * linear and raid0 always use whatever space is available 2928 * We can only consider changing the size if no resync 2929 * or reconstruction is happening, and if the new size 2930 * is acceptable. It must fit before the sb_offset or, 2931 * if that is <data_offset, it must fit before the 2932 * size of each device. 2933 * If size is zero, we find the largest size that fits. 2934 */ 2935 if (mddev->sync_thread) 2936 return -EBUSY; 2937 ITERATE_RDEV(mddev,rdev,tmp) { 2938 sector_t avail; 2939 int fit = (info->size == 0); 2940 if (rdev->sb_offset > rdev->data_offset) 2941 avail = (rdev->sb_offset*2) - rdev->data_offset; 2942 else 2943 avail = get_capacity(rdev->bdev->bd_disk) 2944 - rdev->data_offset; 2945 if (fit && (info->size == 0 || info->size > avail/2)) 2946 info->size = avail/2; 2947 if (avail < ((sector_t)info->size << 1)) 2948 return -ENOSPC; 2949 } 2950 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 2951 if (!rv) { 2952 struct block_device *bdev; 2953 2954 bdev = bdget_disk(mddev->gendisk, 0); 2955 if (bdev) { 2956 down(&bdev->bd_inode->i_sem); 2957 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2958 up(&bdev->bd_inode->i_sem); 2959 bdput(bdev); 2960 } 2961 } 2962 } 2963 if (mddev->raid_disks != info->raid_disks) { 2964 /* change the number of raid disks */ 2965 if (mddev->pers->reshape == NULL) 2966 return -EINVAL; 2967 if (info->raid_disks <= 0 || 2968 info->raid_disks >= mddev->max_disks) 2969 return -EINVAL; 2970 if (mddev->sync_thread) 2971 return -EBUSY; 2972 rv = mddev->pers->reshape(mddev, info->raid_disks); 2973 if (!rv) { 2974 struct block_device *bdev; 2975 2976 bdev = bdget_disk(mddev->gendisk, 0); 2977 if (bdev) { 2978 down(&bdev->bd_inode->i_sem); 2979 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2980 up(&bdev->bd_inode->i_sem); 2981 bdput(bdev); 2982 } 2983 } 2984 } 2985 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 2986 if (mddev->pers->quiesce == NULL) 2987 return -EINVAL; 2988 if (mddev->recovery || mddev->sync_thread) 2989 return -EBUSY; 2990 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 2991 /* add the bitmap */ 2992 if (mddev->bitmap) 2993 return -EEXIST; 2994 if (mddev->default_bitmap_offset == 0) 2995 return -EINVAL; 2996 mddev->bitmap_offset = mddev->default_bitmap_offset; 2997 mddev->pers->quiesce(mddev, 1); 2998 rv = bitmap_create(mddev); 2999 if (rv) 3000 bitmap_destroy(mddev); 3001 mddev->pers->quiesce(mddev, 0); 3002 } else { 3003 /* remove the bitmap */ 3004 if (!mddev->bitmap) 3005 return -ENOENT; 3006 if (mddev->bitmap->file) 3007 return -EINVAL; 3008 mddev->pers->quiesce(mddev, 1); 3009 bitmap_destroy(mddev); 3010 mddev->pers->quiesce(mddev, 0); 3011 mddev->bitmap_offset = 0; 3012 } 3013 } 3014 md_update_sb(mddev); 3015 return rv; 3016 } 3017 3018 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3019 { 3020 mdk_rdev_t *rdev; 3021 3022 if (mddev->pers == NULL) 3023 return -ENODEV; 3024 3025 rdev = find_rdev(mddev, dev); 3026 if (!rdev) 3027 return -ENODEV; 3028 3029 md_error(mddev, rdev); 3030 return 0; 3031 } 3032 3033 static int md_ioctl(struct inode *inode, struct file *file, 3034 unsigned int cmd, unsigned long arg) 3035 { 3036 int err = 0; 3037 void __user *argp = (void __user *)arg; 3038 struct hd_geometry __user *loc = argp; 3039 mddev_t *mddev = NULL; 3040 3041 if (!capable(CAP_SYS_ADMIN)) 3042 return -EACCES; 3043 3044 /* 3045 * Commands dealing with the RAID driver but not any 3046 * particular array: 3047 */ 3048 switch (cmd) 3049 { 3050 case RAID_VERSION: 3051 err = get_version(argp); 3052 goto done; 3053 3054 case PRINT_RAID_DEBUG: 3055 err = 0; 3056 md_print_devices(); 3057 goto done; 3058 3059 #ifndef MODULE 3060 case RAID_AUTORUN: 3061 err = 0; 3062 autostart_arrays(arg); 3063 goto done; 3064 #endif 3065 default:; 3066 } 3067 3068 /* 3069 * Commands creating/starting a new array: 3070 */ 3071 3072 mddev = inode->i_bdev->bd_disk->private_data; 3073 3074 if (!mddev) { 3075 BUG(); 3076 goto abort; 3077 } 3078 3079 3080 if (cmd == START_ARRAY) { 3081 /* START_ARRAY doesn't need to lock the array as autostart_array 3082 * does the locking, and it could even be a different array 3083 */ 3084 static int cnt = 3; 3085 if (cnt > 0 ) { 3086 printk(KERN_WARNING 3087 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3088 "This will not be supported beyond 2.6\n", 3089 current->comm, current->pid); 3090 cnt--; 3091 } 3092 err = autostart_array(new_decode_dev(arg)); 3093 if (err) { 3094 printk(KERN_WARNING "md: autostart failed!\n"); 3095 goto abort; 3096 } 3097 goto done; 3098 } 3099 3100 err = mddev_lock(mddev); 3101 if (err) { 3102 printk(KERN_INFO 3103 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3104 err, cmd); 3105 goto abort; 3106 } 3107 3108 switch (cmd) 3109 { 3110 case SET_ARRAY_INFO: 3111 { 3112 mdu_array_info_t info; 3113 if (!arg) 3114 memset(&info, 0, sizeof(info)); 3115 else if (copy_from_user(&info, argp, sizeof(info))) { 3116 err = -EFAULT; 3117 goto abort_unlock; 3118 } 3119 if (mddev->pers) { 3120 err = update_array_info(mddev, &info); 3121 if (err) { 3122 printk(KERN_WARNING "md: couldn't update" 3123 " array info. %d\n", err); 3124 goto abort_unlock; 3125 } 3126 goto done_unlock; 3127 } 3128 if (!list_empty(&mddev->disks)) { 3129 printk(KERN_WARNING 3130 "md: array %s already has disks!\n", 3131 mdname(mddev)); 3132 err = -EBUSY; 3133 goto abort_unlock; 3134 } 3135 if (mddev->raid_disks) { 3136 printk(KERN_WARNING 3137 "md: array %s already initialised!\n", 3138 mdname(mddev)); 3139 err = -EBUSY; 3140 goto abort_unlock; 3141 } 3142 err = set_array_info(mddev, &info); 3143 if (err) { 3144 printk(KERN_WARNING "md: couldn't set" 3145 " array info. %d\n", err); 3146 goto abort_unlock; 3147 } 3148 } 3149 goto done_unlock; 3150 3151 default:; 3152 } 3153 3154 /* 3155 * Commands querying/configuring an existing array: 3156 */ 3157 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3158 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3159 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3160 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3161 err = -ENODEV; 3162 goto abort_unlock; 3163 } 3164 3165 /* 3166 * Commands even a read-only array can execute: 3167 */ 3168 switch (cmd) 3169 { 3170 case GET_ARRAY_INFO: 3171 err = get_array_info(mddev, argp); 3172 goto done_unlock; 3173 3174 case GET_BITMAP_FILE: 3175 err = get_bitmap_file(mddev, argp); 3176 goto done_unlock; 3177 3178 case GET_DISK_INFO: 3179 err = get_disk_info(mddev, argp); 3180 goto done_unlock; 3181 3182 case RESTART_ARRAY_RW: 3183 err = restart_array(mddev); 3184 goto done_unlock; 3185 3186 case STOP_ARRAY: 3187 err = do_md_stop (mddev, 0); 3188 goto done_unlock; 3189 3190 case STOP_ARRAY_RO: 3191 err = do_md_stop (mddev, 1); 3192 goto done_unlock; 3193 3194 /* 3195 * We have a problem here : there is no easy way to give a CHS 3196 * virtual geometry. We currently pretend that we have a 2 heads 3197 * 4 sectors (with a BIG number of cylinders...). This drives 3198 * dosfs just mad... ;-) 3199 */ 3200 case HDIO_GETGEO: 3201 if (!loc) { 3202 err = -EINVAL; 3203 goto abort_unlock; 3204 } 3205 err = put_user (2, (char __user *) &loc->heads); 3206 if (err) 3207 goto abort_unlock; 3208 err = put_user (4, (char __user *) &loc->sectors); 3209 if (err) 3210 goto abort_unlock; 3211 err = put_user(get_capacity(mddev->gendisk)/8, 3212 (short __user *) &loc->cylinders); 3213 if (err) 3214 goto abort_unlock; 3215 err = put_user (get_start_sect(inode->i_bdev), 3216 (long __user *) &loc->start); 3217 goto done_unlock; 3218 } 3219 3220 /* 3221 * The remaining ioctls are changing the state of the 3222 * superblock, so we do not allow read-only arrays 3223 * here: 3224 */ 3225 if (mddev->ro) { 3226 err = -EROFS; 3227 goto abort_unlock; 3228 } 3229 3230 switch (cmd) 3231 { 3232 case ADD_NEW_DISK: 3233 { 3234 mdu_disk_info_t info; 3235 if (copy_from_user(&info, argp, sizeof(info))) 3236 err = -EFAULT; 3237 else 3238 err = add_new_disk(mddev, &info); 3239 goto done_unlock; 3240 } 3241 3242 case HOT_REMOVE_DISK: 3243 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3244 goto done_unlock; 3245 3246 case HOT_ADD_DISK: 3247 err = hot_add_disk(mddev, new_decode_dev(arg)); 3248 goto done_unlock; 3249 3250 case SET_DISK_FAULTY: 3251 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3252 goto done_unlock; 3253 3254 case RUN_ARRAY: 3255 err = do_md_run (mddev); 3256 goto done_unlock; 3257 3258 case SET_BITMAP_FILE: 3259 err = set_bitmap_file(mddev, (int)arg); 3260 goto done_unlock; 3261 3262 default: 3263 if (_IOC_TYPE(cmd) == MD_MAJOR) 3264 printk(KERN_WARNING "md: %s(pid %d) used" 3265 " obsolete MD ioctl, upgrade your" 3266 " software to use new ictls.\n", 3267 current->comm, current->pid); 3268 err = -EINVAL; 3269 goto abort_unlock; 3270 } 3271 3272 done_unlock: 3273 abort_unlock: 3274 mddev_unlock(mddev); 3275 3276 return err; 3277 done: 3278 if (err) 3279 MD_BUG(); 3280 abort: 3281 return err; 3282 } 3283 3284 static int md_open(struct inode *inode, struct file *file) 3285 { 3286 /* 3287 * Succeed if we can lock the mddev, which confirms that 3288 * it isn't being stopped right now. 3289 */ 3290 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3291 int err; 3292 3293 if ((err = mddev_lock(mddev))) 3294 goto out; 3295 3296 err = 0; 3297 mddev_get(mddev); 3298 mddev_unlock(mddev); 3299 3300 check_disk_change(inode->i_bdev); 3301 out: 3302 return err; 3303 } 3304 3305 static int md_release(struct inode *inode, struct file * file) 3306 { 3307 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3308 3309 if (!mddev) 3310 BUG(); 3311 mddev_put(mddev); 3312 3313 return 0; 3314 } 3315 3316 static int md_media_changed(struct gendisk *disk) 3317 { 3318 mddev_t *mddev = disk->private_data; 3319 3320 return mddev->changed; 3321 } 3322 3323 static int md_revalidate(struct gendisk *disk) 3324 { 3325 mddev_t *mddev = disk->private_data; 3326 3327 mddev->changed = 0; 3328 return 0; 3329 } 3330 static struct block_device_operations md_fops = 3331 { 3332 .owner = THIS_MODULE, 3333 .open = md_open, 3334 .release = md_release, 3335 .ioctl = md_ioctl, 3336 .media_changed = md_media_changed, 3337 .revalidate_disk= md_revalidate, 3338 }; 3339 3340 static int md_thread(void * arg) 3341 { 3342 mdk_thread_t *thread = arg; 3343 3344 /* 3345 * md_thread is a 'system-thread', it's priority should be very 3346 * high. We avoid resource deadlocks individually in each 3347 * raid personality. (RAID5 does preallocation) We also use RR and 3348 * the very same RT priority as kswapd, thus we will never get 3349 * into a priority inversion deadlock. 3350 * 3351 * we definitely have to have equal or higher priority than 3352 * bdflush, otherwise bdflush will deadlock if there are too 3353 * many dirty RAID5 blocks. 3354 */ 3355 3356 allow_signal(SIGKILL); 3357 complete(thread->event); 3358 while (!kthread_should_stop()) { 3359 void (*run)(mddev_t *); 3360 3361 wait_event_interruptible_timeout(thread->wqueue, 3362 test_bit(THREAD_WAKEUP, &thread->flags) 3363 || kthread_should_stop(), 3364 thread->timeout); 3365 try_to_freeze(); 3366 3367 clear_bit(THREAD_WAKEUP, &thread->flags); 3368 3369 run = thread->run; 3370 if (run) 3371 run(thread->mddev); 3372 } 3373 3374 return 0; 3375 } 3376 3377 void md_wakeup_thread(mdk_thread_t *thread) 3378 { 3379 if (thread) { 3380 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3381 set_bit(THREAD_WAKEUP, &thread->flags); 3382 wake_up(&thread->wqueue); 3383 } 3384 } 3385 3386 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3387 const char *name) 3388 { 3389 mdk_thread_t *thread; 3390 struct completion event; 3391 3392 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3393 if (!thread) 3394 return NULL; 3395 3396 memset(thread, 0, sizeof(mdk_thread_t)); 3397 init_waitqueue_head(&thread->wqueue); 3398 3399 init_completion(&event); 3400 thread->event = &event; 3401 thread->run = run; 3402 thread->mddev = mddev; 3403 thread->name = name; 3404 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3405 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3406 if (IS_ERR(thread->tsk)) { 3407 kfree(thread); 3408 return NULL; 3409 } 3410 wait_for_completion(&event); 3411 return thread; 3412 } 3413 3414 void md_unregister_thread(mdk_thread_t *thread) 3415 { 3416 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3417 3418 kthread_stop(thread->tsk); 3419 kfree(thread); 3420 } 3421 3422 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3423 { 3424 if (!mddev) { 3425 MD_BUG(); 3426 return; 3427 } 3428 3429 if (!rdev || test_bit(Faulty, &rdev->flags)) 3430 return; 3431 /* 3432 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3433 mdname(mddev), 3434 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3435 __builtin_return_address(0),__builtin_return_address(1), 3436 __builtin_return_address(2),__builtin_return_address(3)); 3437 */ 3438 if (!mddev->pers->error_handler) 3439 return; 3440 mddev->pers->error_handler(mddev,rdev); 3441 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3442 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3443 md_wakeup_thread(mddev->thread); 3444 } 3445 3446 /* seq_file implementation /proc/mdstat */ 3447 3448 static void status_unused(struct seq_file *seq) 3449 { 3450 int i = 0; 3451 mdk_rdev_t *rdev; 3452 struct list_head *tmp; 3453 3454 seq_printf(seq, "unused devices: "); 3455 3456 ITERATE_RDEV_PENDING(rdev,tmp) { 3457 char b[BDEVNAME_SIZE]; 3458 i++; 3459 seq_printf(seq, "%s ", 3460 bdevname(rdev->bdev,b)); 3461 } 3462 if (!i) 3463 seq_printf(seq, "<none>"); 3464 3465 seq_printf(seq, "\n"); 3466 } 3467 3468 3469 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3470 { 3471 unsigned long max_blocks, resync, res, dt, db, rt; 3472 3473 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3474 3475 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3476 max_blocks = mddev->resync_max_sectors >> 1; 3477 else 3478 max_blocks = mddev->size; 3479 3480 /* 3481 * Should not happen. 3482 */ 3483 if (!max_blocks) { 3484 MD_BUG(); 3485 return; 3486 } 3487 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3488 { 3489 int i, x = res/50, y = 20-x; 3490 seq_printf(seq, "["); 3491 for (i = 0; i < x; i++) 3492 seq_printf(seq, "="); 3493 seq_printf(seq, ">"); 3494 for (i = 0; i < y; i++) 3495 seq_printf(seq, "."); 3496 seq_printf(seq, "] "); 3497 } 3498 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3499 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3500 "resync" : "recovery"), 3501 res/10, res % 10, resync, max_blocks); 3502 3503 /* 3504 * We do not want to overflow, so the order of operands and 3505 * the * 100 / 100 trick are important. We do a +1 to be 3506 * safe against division by zero. We only estimate anyway. 3507 * 3508 * dt: time from mark until now 3509 * db: blocks written from mark until now 3510 * rt: remaining time 3511 */ 3512 dt = ((jiffies - mddev->resync_mark) / HZ); 3513 if (!dt) dt++; 3514 db = resync - (mddev->resync_mark_cnt/2); 3515 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3516 3517 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3518 3519 seq_printf(seq, " speed=%ldK/sec", db/dt); 3520 } 3521 3522 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3523 { 3524 struct list_head *tmp; 3525 loff_t l = *pos; 3526 mddev_t *mddev; 3527 3528 if (l >= 0x10000) 3529 return NULL; 3530 if (!l--) 3531 /* header */ 3532 return (void*)1; 3533 3534 spin_lock(&all_mddevs_lock); 3535 list_for_each(tmp,&all_mddevs) 3536 if (!l--) { 3537 mddev = list_entry(tmp, mddev_t, all_mddevs); 3538 mddev_get(mddev); 3539 spin_unlock(&all_mddevs_lock); 3540 return mddev; 3541 } 3542 spin_unlock(&all_mddevs_lock); 3543 if (!l--) 3544 return (void*)2;/* tail */ 3545 return NULL; 3546 } 3547 3548 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3549 { 3550 struct list_head *tmp; 3551 mddev_t *next_mddev, *mddev = v; 3552 3553 ++*pos; 3554 if (v == (void*)2) 3555 return NULL; 3556 3557 spin_lock(&all_mddevs_lock); 3558 if (v == (void*)1) 3559 tmp = all_mddevs.next; 3560 else 3561 tmp = mddev->all_mddevs.next; 3562 if (tmp != &all_mddevs) 3563 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3564 else { 3565 next_mddev = (void*)2; 3566 *pos = 0x10000; 3567 } 3568 spin_unlock(&all_mddevs_lock); 3569 3570 if (v != (void*)1) 3571 mddev_put(mddev); 3572 return next_mddev; 3573 3574 } 3575 3576 static void md_seq_stop(struct seq_file *seq, void *v) 3577 { 3578 mddev_t *mddev = v; 3579 3580 if (mddev && v != (void*)1 && v != (void*)2) 3581 mddev_put(mddev); 3582 } 3583 3584 static int md_seq_show(struct seq_file *seq, void *v) 3585 { 3586 mddev_t *mddev = v; 3587 sector_t size; 3588 struct list_head *tmp2; 3589 mdk_rdev_t *rdev; 3590 int i; 3591 struct bitmap *bitmap; 3592 3593 if (v == (void*)1) { 3594 seq_printf(seq, "Personalities : "); 3595 spin_lock(&pers_lock); 3596 for (i = 0; i < MAX_PERSONALITY; i++) 3597 if (pers[i]) 3598 seq_printf(seq, "[%s] ", pers[i]->name); 3599 3600 spin_unlock(&pers_lock); 3601 seq_printf(seq, "\n"); 3602 return 0; 3603 } 3604 if (v == (void*)2) { 3605 status_unused(seq); 3606 return 0; 3607 } 3608 3609 if (mddev_lock(mddev)!=0) 3610 return -EINTR; 3611 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3612 seq_printf(seq, "%s : %sactive", mdname(mddev), 3613 mddev->pers ? "" : "in"); 3614 if (mddev->pers) { 3615 if (mddev->ro) 3616 seq_printf(seq, " (read-only)"); 3617 seq_printf(seq, " %s", mddev->pers->name); 3618 } 3619 3620 size = 0; 3621 ITERATE_RDEV(mddev,rdev,tmp2) { 3622 char b[BDEVNAME_SIZE]; 3623 seq_printf(seq, " %s[%d]", 3624 bdevname(rdev->bdev,b), rdev->desc_nr); 3625 if (test_bit(WriteMostly, &rdev->flags)) 3626 seq_printf(seq, "(W)"); 3627 if (test_bit(Faulty, &rdev->flags)) { 3628 seq_printf(seq, "(F)"); 3629 continue; 3630 } else if (rdev->raid_disk < 0) 3631 seq_printf(seq, "(S)"); /* spare */ 3632 size += rdev->size; 3633 } 3634 3635 if (!list_empty(&mddev->disks)) { 3636 if (mddev->pers) 3637 seq_printf(seq, "\n %llu blocks", 3638 (unsigned long long)mddev->array_size); 3639 else 3640 seq_printf(seq, "\n %llu blocks", 3641 (unsigned long long)size); 3642 } 3643 if (mddev->persistent) { 3644 if (mddev->major_version != 0 || 3645 mddev->minor_version != 90) { 3646 seq_printf(seq," super %d.%d", 3647 mddev->major_version, 3648 mddev->minor_version); 3649 } 3650 } else 3651 seq_printf(seq, " super non-persistent"); 3652 3653 if (mddev->pers) { 3654 mddev->pers->status (seq, mddev); 3655 seq_printf(seq, "\n "); 3656 if (mddev->curr_resync > 2) { 3657 status_resync (seq, mddev); 3658 seq_printf(seq, "\n "); 3659 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3660 seq_printf(seq, " resync=DELAYED\n "); 3661 } else 3662 seq_printf(seq, "\n "); 3663 3664 if ((bitmap = mddev->bitmap)) { 3665 unsigned long chunk_kb; 3666 unsigned long flags; 3667 spin_lock_irqsave(&bitmap->lock, flags); 3668 chunk_kb = bitmap->chunksize >> 10; 3669 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3670 "%lu%s chunk", 3671 bitmap->pages - bitmap->missing_pages, 3672 bitmap->pages, 3673 (bitmap->pages - bitmap->missing_pages) 3674 << (PAGE_SHIFT - 10), 3675 chunk_kb ? chunk_kb : bitmap->chunksize, 3676 chunk_kb ? "KB" : "B"); 3677 if (bitmap->file) { 3678 seq_printf(seq, ", file: "); 3679 seq_path(seq, bitmap->file->f_vfsmnt, 3680 bitmap->file->f_dentry," \t\n"); 3681 } 3682 3683 seq_printf(seq, "\n"); 3684 spin_unlock_irqrestore(&bitmap->lock, flags); 3685 } 3686 3687 seq_printf(seq, "\n"); 3688 } 3689 mddev_unlock(mddev); 3690 3691 return 0; 3692 } 3693 3694 static struct seq_operations md_seq_ops = { 3695 .start = md_seq_start, 3696 .next = md_seq_next, 3697 .stop = md_seq_stop, 3698 .show = md_seq_show, 3699 }; 3700 3701 static int md_seq_open(struct inode *inode, struct file *file) 3702 { 3703 int error; 3704 3705 error = seq_open(file, &md_seq_ops); 3706 return error; 3707 } 3708 3709 static struct file_operations md_seq_fops = { 3710 .open = md_seq_open, 3711 .read = seq_read, 3712 .llseek = seq_lseek, 3713 .release = seq_release, 3714 }; 3715 3716 int register_md_personality(int pnum, mdk_personality_t *p) 3717 { 3718 if (pnum >= MAX_PERSONALITY) { 3719 printk(KERN_ERR 3720 "md: tried to install personality %s as nr %d, but max is %lu\n", 3721 p->name, pnum, MAX_PERSONALITY-1); 3722 return -EINVAL; 3723 } 3724 3725 spin_lock(&pers_lock); 3726 if (pers[pnum]) { 3727 spin_unlock(&pers_lock); 3728 return -EBUSY; 3729 } 3730 3731 pers[pnum] = p; 3732 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3733 spin_unlock(&pers_lock); 3734 return 0; 3735 } 3736 3737 int unregister_md_personality(int pnum) 3738 { 3739 if (pnum >= MAX_PERSONALITY) 3740 return -EINVAL; 3741 3742 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3743 spin_lock(&pers_lock); 3744 pers[pnum] = NULL; 3745 spin_unlock(&pers_lock); 3746 return 0; 3747 } 3748 3749 static int is_mddev_idle(mddev_t *mddev) 3750 { 3751 mdk_rdev_t * rdev; 3752 struct list_head *tmp; 3753 int idle; 3754 unsigned long curr_events; 3755 3756 idle = 1; 3757 ITERATE_RDEV(mddev,rdev,tmp) { 3758 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3759 curr_events = disk_stat_read(disk, sectors[0]) + 3760 disk_stat_read(disk, sectors[1]) - 3761 atomic_read(&disk->sync_io); 3762 /* Allow some slack between valud of curr_events and last_events, 3763 * as there are some uninteresting races. 3764 * Note: the following is an unsigned comparison. 3765 */ 3766 if ((curr_events - rdev->last_events + 32) > 64) { 3767 rdev->last_events = curr_events; 3768 idle = 0; 3769 } 3770 } 3771 return idle; 3772 } 3773 3774 void md_done_sync(mddev_t *mddev, int blocks, int ok) 3775 { 3776 /* another "blocks" (512byte) blocks have been synced */ 3777 atomic_sub(blocks, &mddev->recovery_active); 3778 wake_up(&mddev->recovery_wait); 3779 if (!ok) { 3780 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3781 md_wakeup_thread(mddev->thread); 3782 // stop recovery, signal do_sync .... 3783 } 3784 } 3785 3786 3787 /* md_write_start(mddev, bi) 3788 * If we need to update some array metadata (e.g. 'active' flag 3789 * in superblock) before writing, schedule a superblock update 3790 * and wait for it to complete. 3791 */ 3792 void md_write_start(mddev_t *mddev, struct bio *bi) 3793 { 3794 if (bio_data_dir(bi) != WRITE) 3795 return; 3796 3797 atomic_inc(&mddev->writes_pending); 3798 if (mddev->in_sync) { 3799 spin_lock(&mddev->write_lock); 3800 if (mddev->in_sync) { 3801 mddev->in_sync = 0; 3802 mddev->sb_dirty = 1; 3803 md_wakeup_thread(mddev->thread); 3804 } 3805 spin_unlock(&mddev->write_lock); 3806 } 3807 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 3808 } 3809 3810 void md_write_end(mddev_t *mddev) 3811 { 3812 if (atomic_dec_and_test(&mddev->writes_pending)) { 3813 if (mddev->safemode == 2) 3814 md_wakeup_thread(mddev->thread); 3815 else 3816 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 3817 } 3818 } 3819 3820 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3821 3822 #define SYNC_MARKS 10 3823 #define SYNC_MARK_STEP (3*HZ) 3824 static void md_do_sync(mddev_t *mddev) 3825 { 3826 mddev_t *mddev2; 3827 unsigned int currspeed = 0, 3828 window; 3829 sector_t max_sectors,j, io_sectors; 3830 unsigned long mark[SYNC_MARKS]; 3831 sector_t mark_cnt[SYNC_MARKS]; 3832 int last_mark,m; 3833 struct list_head *tmp; 3834 sector_t last_check; 3835 int skipped = 0; 3836 3837 /* just incase thread restarts... */ 3838 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3839 return; 3840 3841 /* we overload curr_resync somewhat here. 3842 * 0 == not engaged in resync at all 3843 * 2 == checking that there is no conflict with another sync 3844 * 1 == like 2, but have yielded to allow conflicting resync to 3845 * commense 3846 * other == active in resync - this many blocks 3847 * 3848 * Before starting a resync we must have set curr_resync to 3849 * 2, and then checked that every "conflicting" array has curr_resync 3850 * less than ours. When we find one that is the same or higher 3851 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 3852 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 3853 * This will mean we have to start checking from the beginning again. 3854 * 3855 */ 3856 3857 do { 3858 mddev->curr_resync = 2; 3859 3860 try_again: 3861 if (signal_pending(current) || 3862 kthread_should_stop()) { 3863 flush_signals(current); 3864 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3865 goto skip; 3866 } 3867 ITERATE_MDDEV(mddev2,tmp) { 3868 if (mddev2 == mddev) 3869 continue; 3870 if (mddev2->curr_resync && 3871 match_mddev_units(mddev,mddev2)) { 3872 DEFINE_WAIT(wq); 3873 if (mddev < mddev2 && mddev->curr_resync == 2) { 3874 /* arbitrarily yield */ 3875 mddev->curr_resync = 1; 3876 wake_up(&resync_wait); 3877 } 3878 if (mddev > mddev2 && mddev->curr_resync == 1) 3879 /* no need to wait here, we can wait the next 3880 * time 'round when curr_resync == 2 3881 */ 3882 continue; 3883 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 3884 if (!signal_pending(current) && 3885 !kthread_should_stop() && 3886 mddev2->curr_resync >= mddev->curr_resync) { 3887 printk(KERN_INFO "md: delaying resync of %s" 3888 " until %s has finished resync (they" 3889 " share one or more physical units)\n", 3890 mdname(mddev), mdname(mddev2)); 3891 mddev_put(mddev2); 3892 schedule(); 3893 finish_wait(&resync_wait, &wq); 3894 goto try_again; 3895 } 3896 finish_wait(&resync_wait, &wq); 3897 } 3898 } 3899 } while (mddev->curr_resync < 2); 3900 3901 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3902 /* resync follows the size requested by the personality, 3903 * which defaults to physical size, but can be virtual size 3904 */ 3905 max_sectors = mddev->resync_max_sectors; 3906 mddev->resync_mismatches = 0; 3907 } else 3908 /* recovery follows the physical size of devices */ 3909 max_sectors = mddev->size << 1; 3910 3911 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3912 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3913 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3914 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 3915 "(but not more than %d KB/sec) for reconstruction.\n", 3916 sysctl_speed_limit_max); 3917 3918 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3919 /* we don't use the checkpoint if there's a bitmap */ 3920 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap 3921 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3922 j = mddev->recovery_cp; 3923 else 3924 j = 0; 3925 io_sectors = 0; 3926 for (m = 0; m < SYNC_MARKS; m++) { 3927 mark[m] = jiffies; 3928 mark_cnt[m] = io_sectors; 3929 } 3930 last_mark = 0; 3931 mddev->resync_mark = mark[last_mark]; 3932 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3933 3934 /* 3935 * Tune reconstruction: 3936 */ 3937 window = 32*(PAGE_SIZE/512); 3938 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 3939 window/2,(unsigned long long) max_sectors/2); 3940 3941 atomic_set(&mddev->recovery_active, 0); 3942 init_waitqueue_head(&mddev->recovery_wait); 3943 last_check = 0; 3944 3945 if (j>2) { 3946 printk(KERN_INFO 3947 "md: resuming recovery of %s from checkpoint.\n", 3948 mdname(mddev)); 3949 mddev->curr_resync = j; 3950 } 3951 3952 while (j < max_sectors) { 3953 sector_t sectors; 3954 3955 skipped = 0; 3956 sectors = mddev->pers->sync_request(mddev, j, &skipped, 3957 currspeed < sysctl_speed_limit_min); 3958 if (sectors == 0) { 3959 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3960 goto out; 3961 } 3962 3963 if (!skipped) { /* actual IO requested */ 3964 io_sectors += sectors; 3965 atomic_add(sectors, &mddev->recovery_active); 3966 } 3967 3968 j += sectors; 3969 if (j>1) mddev->curr_resync = j; 3970 3971 3972 if (last_check + window > io_sectors || j == max_sectors) 3973 continue; 3974 3975 last_check = io_sectors; 3976 3977 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3978 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3979 break; 3980 3981 repeat: 3982 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 3983 /* step marks */ 3984 int next = (last_mark+1) % SYNC_MARKS; 3985 3986 mddev->resync_mark = mark[next]; 3987 mddev->resync_mark_cnt = mark_cnt[next]; 3988 mark[next] = jiffies; 3989 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 3990 last_mark = next; 3991 } 3992 3993 3994 if (signal_pending(current) || kthread_should_stop()) { 3995 /* 3996 * got a signal, exit. 3997 */ 3998 printk(KERN_INFO 3999 "md: md_do_sync() got signal ... exiting\n"); 4000 flush_signals(current); 4001 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4002 goto out; 4003 } 4004 4005 /* 4006 * this loop exits only if either when we are slower than 4007 * the 'hard' speed limit, or the system was IO-idle for 4008 * a jiffy. 4009 * the system might be non-idle CPU-wise, but we only care 4010 * about not overloading the IO subsystem. (things like an 4011 * e2fsck being done on the RAID array should execute fast) 4012 */ 4013 mddev->queue->unplug_fn(mddev->queue); 4014 cond_resched(); 4015 4016 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 4017 /((jiffies-mddev->resync_mark)/HZ +1) +1; 4018 4019 if (currspeed > sysctl_speed_limit_min) { 4020 if ((currspeed > sysctl_speed_limit_max) || 4021 !is_mddev_idle(mddev)) { 4022 msleep_interruptible(250); 4023 goto repeat; 4024 } 4025 } 4026 } 4027 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 4028 /* 4029 * this also signals 'finished resyncing' to md_stop 4030 */ 4031 out: 4032 mddev->queue->unplug_fn(mddev->queue); 4033 4034 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 4035 4036 /* tell personality that we are finished */ 4037 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 4038 4039 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4040 mddev->curr_resync > 2 && 4041 mddev->curr_resync >= mddev->recovery_cp) { 4042 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4043 printk(KERN_INFO 4044 "md: checkpointing recovery of %s.\n", 4045 mdname(mddev)); 4046 mddev->recovery_cp = mddev->curr_resync; 4047 } else 4048 mddev->recovery_cp = MaxSector; 4049 } 4050 4051 skip: 4052 mddev->curr_resync = 0; 4053 wake_up(&resync_wait); 4054 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 4055 md_wakeup_thread(mddev->thread); 4056 } 4057 4058 4059 /* 4060 * This routine is regularly called by all per-raid-array threads to 4061 * deal with generic issues like resync and super-block update. 4062 * Raid personalities that don't have a thread (linear/raid0) do not 4063 * need this as they never do any recovery or update the superblock. 4064 * 4065 * It does not do any resync itself, but rather "forks" off other threads 4066 * to do that as needed. 4067 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 4068 * "->recovery" and create a thread at ->sync_thread. 4069 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 4070 * and wakeups up this thread which will reap the thread and finish up. 4071 * This thread also removes any faulty devices (with nr_pending == 0). 4072 * 4073 * The overall approach is: 4074 * 1/ if the superblock needs updating, update it. 4075 * 2/ If a recovery thread is running, don't do anything else. 4076 * 3/ If recovery has finished, clean up, possibly marking spares active. 4077 * 4/ If there are any faulty devices, remove them. 4078 * 5/ If array is degraded, try to add spares devices 4079 * 6/ If array has spares or is not in-sync, start a resync thread. 4080 */ 4081 void md_check_recovery(mddev_t *mddev) 4082 { 4083 mdk_rdev_t *rdev; 4084 struct list_head *rtmp; 4085 4086 4087 if (mddev->bitmap) 4088 bitmap_daemon_work(mddev->bitmap); 4089 4090 if (mddev->ro) 4091 return; 4092 4093 if (signal_pending(current)) { 4094 if (mddev->pers->sync_request) { 4095 printk(KERN_INFO "md: %s in immediate safe mode\n", 4096 mdname(mddev)); 4097 mddev->safemode = 2; 4098 } 4099 flush_signals(current); 4100 } 4101 4102 if ( ! ( 4103 mddev->sb_dirty || 4104 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 4105 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 4106 (mddev->safemode == 1) || 4107 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 4108 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 4109 )) 4110 return; 4111 4112 if (mddev_trylock(mddev)==0) { 4113 int spares =0; 4114 4115 spin_lock(&mddev->write_lock); 4116 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 4117 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 4118 mddev->in_sync = 1; 4119 mddev->sb_dirty = 1; 4120 } 4121 if (mddev->safemode == 1) 4122 mddev->safemode = 0; 4123 spin_unlock(&mddev->write_lock); 4124 4125 if (mddev->sb_dirty) 4126 md_update_sb(mddev); 4127 4128 4129 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4130 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 4131 /* resync/recovery still happening */ 4132 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4133 goto unlock; 4134 } 4135 if (mddev->sync_thread) { 4136 /* resync has finished, collect result */ 4137 md_unregister_thread(mddev->sync_thread); 4138 mddev->sync_thread = NULL; 4139 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4140 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4141 /* success...*/ 4142 /* activate any spares */ 4143 mddev->pers->spare_active(mddev); 4144 } 4145 md_update_sb(mddev); 4146 4147 /* if array is no-longer degraded, then any saved_raid_disk 4148 * information must be scrapped 4149 */ 4150 if (!mddev->degraded) 4151 ITERATE_RDEV(mddev,rdev,rtmp) 4152 rdev->saved_raid_disk = -1; 4153 4154 mddev->recovery = 0; 4155 /* flag recovery needed just to double check */ 4156 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4157 goto unlock; 4158 } 4159 /* Clear some bits that don't mean anything, but 4160 * might be left set 4161 */ 4162 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4163 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 4164 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 4165 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4166 4167 /* no recovery is running. 4168 * remove any failed drives, then 4169 * add spares if possible. 4170 * Spare are also removed and re-added, to allow 4171 * the personality to fail the re-add. 4172 */ 4173 ITERATE_RDEV(mddev,rdev,rtmp) 4174 if (rdev->raid_disk >= 0 && 4175 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 4176 atomic_read(&rdev->nr_pending)==0) { 4177 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 4178 char nm[20]; 4179 sprintf(nm,"rd%d", rdev->raid_disk); 4180 sysfs_remove_link(&mddev->kobj, nm); 4181 rdev->raid_disk = -1; 4182 } 4183 } 4184 4185 if (mddev->degraded) { 4186 ITERATE_RDEV(mddev,rdev,rtmp) 4187 if (rdev->raid_disk < 0 4188 && !test_bit(Faulty, &rdev->flags)) { 4189 if (mddev->pers->hot_add_disk(mddev,rdev)) { 4190 char nm[20]; 4191 sprintf(nm, "rd%d", rdev->raid_disk); 4192 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 4193 spares++; 4194 } else 4195 break; 4196 } 4197 } 4198 4199 if (spares) { 4200 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4201 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4202 } else if (mddev->recovery_cp < MaxSector) { 4203 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4204 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4205 /* nothing to be done ... */ 4206 goto unlock; 4207 4208 if (mddev->pers->sync_request) { 4209 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4210 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 4211 /* We are adding a device or devices to an array 4212 * which has the bitmap stored on all devices. 4213 * So make sure all bitmap pages get written 4214 */ 4215 bitmap_write_all(mddev->bitmap); 4216 } 4217 mddev->sync_thread = md_register_thread(md_do_sync, 4218 mddev, 4219 "%s_resync"); 4220 if (!mddev->sync_thread) { 4221 printk(KERN_ERR "%s: could not start resync" 4222 " thread...\n", 4223 mdname(mddev)); 4224 /* leave the spares where they are, it shouldn't hurt */ 4225 mddev->recovery = 0; 4226 } else { 4227 md_wakeup_thread(mddev->sync_thread); 4228 } 4229 } 4230 unlock: 4231 mddev_unlock(mddev); 4232 } 4233 } 4234 4235 static int md_notify_reboot(struct notifier_block *this, 4236 unsigned long code, void *x) 4237 { 4238 struct list_head *tmp; 4239 mddev_t *mddev; 4240 4241 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 4242 4243 printk(KERN_INFO "md: stopping all md devices.\n"); 4244 4245 ITERATE_MDDEV(mddev,tmp) 4246 if (mddev_trylock(mddev)==0) 4247 do_md_stop (mddev, 1); 4248 /* 4249 * certain more exotic SCSI devices are known to be 4250 * volatile wrt too early system reboots. While the 4251 * right place to handle this issue is the given 4252 * driver, we do want to have a safe RAID driver ... 4253 */ 4254 mdelay(1000*1); 4255 } 4256 return NOTIFY_DONE; 4257 } 4258 4259 static struct notifier_block md_notifier = { 4260 .notifier_call = md_notify_reboot, 4261 .next = NULL, 4262 .priority = INT_MAX, /* before any real devices */ 4263 }; 4264 4265 static void md_geninit(void) 4266 { 4267 struct proc_dir_entry *p; 4268 4269 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 4270 4271 p = create_proc_entry("mdstat", S_IRUGO, NULL); 4272 if (p) 4273 p->proc_fops = &md_seq_fops; 4274 } 4275 4276 static int __init md_init(void) 4277 { 4278 int minor; 4279 4280 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 4281 " MD_SB_DISKS=%d\n", 4282 MD_MAJOR_VERSION, MD_MINOR_VERSION, 4283 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 4284 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 4285 BITMAP_MINOR); 4286 4287 if (register_blkdev(MAJOR_NR, "md")) 4288 return -1; 4289 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 4290 unregister_blkdev(MAJOR_NR, "md"); 4291 return -1; 4292 } 4293 devfs_mk_dir("md"); 4294 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 4295 md_probe, NULL, NULL); 4296 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 4297 md_probe, NULL, NULL); 4298 4299 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4300 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 4301 S_IFBLK|S_IRUSR|S_IWUSR, 4302 "md/%d", minor); 4303 4304 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4305 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 4306 S_IFBLK|S_IRUSR|S_IWUSR, 4307 "md/mdp%d", minor); 4308 4309 4310 register_reboot_notifier(&md_notifier); 4311 raid_table_header = register_sysctl_table(raid_root_table, 1); 4312 4313 md_geninit(); 4314 return (0); 4315 } 4316 4317 4318 #ifndef MODULE 4319 4320 /* 4321 * Searches all registered partitions for autorun RAID arrays 4322 * at boot time. 4323 */ 4324 static dev_t detected_devices[128]; 4325 static int dev_cnt; 4326 4327 void md_autodetect_dev(dev_t dev) 4328 { 4329 if (dev_cnt >= 0 && dev_cnt < 127) 4330 detected_devices[dev_cnt++] = dev; 4331 } 4332 4333 4334 static void autostart_arrays(int part) 4335 { 4336 mdk_rdev_t *rdev; 4337 int i; 4338 4339 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4340 4341 for (i = 0; i < dev_cnt; i++) { 4342 dev_t dev = detected_devices[i]; 4343 4344 rdev = md_import_device(dev,0, 0); 4345 if (IS_ERR(rdev)) 4346 continue; 4347 4348 if (test_bit(Faulty, &rdev->flags)) { 4349 MD_BUG(); 4350 continue; 4351 } 4352 list_add(&rdev->same_set, &pending_raid_disks); 4353 } 4354 dev_cnt = 0; 4355 4356 autorun_devices(part); 4357 } 4358 4359 #endif 4360 4361 static __exit void md_exit(void) 4362 { 4363 mddev_t *mddev; 4364 struct list_head *tmp; 4365 int i; 4366 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4367 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4368 for (i=0; i < MAX_MD_DEVS; i++) 4369 devfs_remove("md/%d", i); 4370 for (i=0; i < MAX_MD_DEVS; i++) 4371 devfs_remove("md/d%d", i); 4372 4373 devfs_remove("md"); 4374 4375 unregister_blkdev(MAJOR_NR,"md"); 4376 unregister_blkdev(mdp_major, "mdp"); 4377 unregister_reboot_notifier(&md_notifier); 4378 unregister_sysctl_table(raid_table_header); 4379 remove_proc_entry("mdstat", NULL); 4380 ITERATE_MDDEV(mddev,tmp) { 4381 struct gendisk *disk = mddev->gendisk; 4382 if (!disk) 4383 continue; 4384 export_array(mddev); 4385 del_gendisk(disk); 4386 put_disk(disk); 4387 mddev->gendisk = NULL; 4388 mddev_put(mddev); 4389 } 4390 } 4391 4392 module_init(md_init) 4393 module_exit(md_exit) 4394 4395 EXPORT_SYMBOL(register_md_personality); 4396 EXPORT_SYMBOL(unregister_md_personality); 4397 EXPORT_SYMBOL(md_error); 4398 EXPORT_SYMBOL(md_done_sync); 4399 EXPORT_SYMBOL(md_write_start); 4400 EXPORT_SYMBOL(md_write_end); 4401 EXPORT_SYMBOL(md_register_thread); 4402 EXPORT_SYMBOL(md_unregister_thread); 4403 EXPORT_SYMBOL(md_wakeup_thread); 4404 EXPORT_SYMBOL(md_print_devices); 4405 EXPORT_SYMBOL(md_check_recovery); 4406 MODULE_LICENSE("GPL"); 4407 MODULE_ALIAS("md"); 4408 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4409