1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>
#include <linux/poll.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

/*
 * Debug output: with DEBUG defined as 0 the '&&' short-circuits, so
 * printk() is never called, but the arguments are still compiled
 * (and therefore still type-checked).
 */
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

/*
 * List of personalities; find_pers() walks it to look one up by
 * level or by name.
 * NOTE(review): pers_lock presumably protects pers_list - confirm
 * against the callers of find_pers().
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
84 */ 85 86 static int sysctl_speed_limit_min = 1000; 87 static int sysctl_speed_limit_max = 200000; 88 89 static struct ctl_table_header *raid_table_header; 90 91 static ctl_table raid_table[] = { 92 { 93 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 94 .procname = "speed_limit_min", 95 .data = &sysctl_speed_limit_min, 96 .maxlen = sizeof(int), 97 .mode = 0644, 98 .proc_handler = &proc_dointvec, 99 }, 100 { 101 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 102 .procname = "speed_limit_max", 103 .data = &sysctl_speed_limit_max, 104 .maxlen = sizeof(int), 105 .mode = 0644, 106 .proc_handler = &proc_dointvec, 107 }, 108 { .ctl_name = 0 } 109 }; 110 111 static ctl_table raid_dir_table[] = { 112 { 113 .ctl_name = DEV_RAID, 114 .procname = "raid", 115 .maxlen = 0, 116 .mode = 0555, 117 .child = raid_table, 118 }, 119 { .ctl_name = 0 } 120 }; 121 122 static ctl_table raid_root_table[] = { 123 { 124 .ctl_name = CTL_DEV, 125 .procname = "dev", 126 .maxlen = 0, 127 .mode = 0555, 128 .child = raid_dir_table, 129 }, 130 { .ctl_name = 0 } 131 }; 132 133 static struct block_device_operations md_fops; 134 135 static int start_readonly; 136 137 /* 138 * We have a system wide 'event count' that is incremented 139 * on any 'interesting' event, and readers of /proc/mdstat 140 * can use 'poll' or 'select' to find out when the event 141 * count increases. 142 * 143 * Events are: 144 * start array, stop array, error, add device, remove device, 145 * start build, activate spare 146 */ 147 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 148 static atomic_t md_event_count; 149 static void md_new_event(mddev_t *mddev) 150 { 151 atomic_inc(&md_event_count); 152 wake_up(&md_event_waiters); 153 } 154 155 /* 156 * Enables to iterate over all existing md arrays 157 * all_mddevs_lock protects this list. 158 */ 159 static LIST_HEAD(all_mddevs); 160 static DEFINE_SPINLOCK(all_mddevs_lock); 161 162 163 /* 164 * iterates through all used mddevs in the system. 
165 * We take care to grab the all_mddevs_lock whenever navigating 166 * the list, and to always hold a refcount when unlocked. 167 * Any code which breaks out of this loop while own 168 * a reference to the current mddev and must mddev_put it. 169 */ 170 #define ITERATE_MDDEV(mddev,tmp) \ 171 \ 172 for (({ spin_lock(&all_mddevs_lock); \ 173 tmp = all_mddevs.next; \ 174 mddev = NULL;}); \ 175 ({ if (tmp != &all_mddevs) \ 176 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 177 spin_unlock(&all_mddevs_lock); \ 178 if (mddev) mddev_put(mddev); \ 179 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 180 tmp != &all_mddevs;}); \ 181 ({ spin_lock(&all_mddevs_lock); \ 182 tmp = tmp->next;}) \ 183 ) 184 185 186 static int md_fail_request (request_queue_t *q, struct bio *bio) 187 { 188 bio_io_error(bio, bio->bi_size); 189 return 0; 190 } 191 192 static inline mddev_t *mddev_get(mddev_t *mddev) 193 { 194 atomic_inc(&mddev->active); 195 return mddev; 196 } 197 198 static void mddev_put(mddev_t *mddev) 199 { 200 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 201 return; 202 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 203 list_del(&mddev->all_mddevs); 204 blk_put_queue(mddev->queue); 205 kobject_unregister(&mddev->kobj); 206 } 207 spin_unlock(&all_mddevs_lock); 208 } 209 210 static mddev_t * mddev_find(dev_t unit) 211 { 212 mddev_t *mddev, *new = NULL; 213 214 retry: 215 spin_lock(&all_mddevs_lock); 216 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 217 if (mddev->unit == unit) { 218 mddev_get(mddev); 219 spin_unlock(&all_mddevs_lock); 220 kfree(new); 221 return mddev; 222 } 223 224 if (new) { 225 list_add(&new->all_mddevs, &all_mddevs); 226 spin_unlock(&all_mddevs_lock); 227 return new; 228 } 229 spin_unlock(&all_mddevs_lock); 230 231 new = kzalloc(sizeof(*new), GFP_KERNEL); 232 if (!new) 233 return NULL; 234 235 new->unit = unit; 236 if (MAJOR(unit) == MD_MAJOR) 237 new->md_minor = MINOR(unit); 238 else 239 new->md_minor = MINOR(unit) >> 
MdpMinorShift; 240 241 init_MUTEX(&new->reconfig_sem); 242 INIT_LIST_HEAD(&new->disks); 243 INIT_LIST_HEAD(&new->all_mddevs); 244 init_timer(&new->safemode_timer); 245 atomic_set(&new->active, 1); 246 spin_lock_init(&new->write_lock); 247 init_waitqueue_head(&new->sb_wait); 248 249 new->queue = blk_alloc_queue(GFP_KERNEL); 250 if (!new->queue) { 251 kfree(new); 252 return NULL; 253 } 254 255 blk_queue_make_request(new->queue, md_fail_request); 256 257 goto retry; 258 } 259 260 static inline int mddev_lock(mddev_t * mddev) 261 { 262 return down_interruptible(&mddev->reconfig_sem); 263 } 264 265 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 266 { 267 down(&mddev->reconfig_sem); 268 } 269 270 static inline int mddev_trylock(mddev_t * mddev) 271 { 272 return down_trylock(&mddev->reconfig_sem); 273 } 274 275 static inline void mddev_unlock(mddev_t * mddev) 276 { 277 up(&mddev->reconfig_sem); 278 279 md_wakeup_thread(mddev->thread); 280 } 281 282 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 283 { 284 mdk_rdev_t * rdev; 285 struct list_head *tmp; 286 287 ITERATE_RDEV(mddev,rdev,tmp) { 288 if (rdev->desc_nr == nr) 289 return rdev; 290 } 291 return NULL; 292 } 293 294 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 295 { 296 struct list_head *tmp; 297 mdk_rdev_t *rdev; 298 299 ITERATE_RDEV(mddev,rdev,tmp) { 300 if (rdev->bdev->bd_dev == dev) 301 return rdev; 302 } 303 return NULL; 304 } 305 306 static struct mdk_personality *find_pers(int level, char *clevel) 307 { 308 struct mdk_personality *pers; 309 list_for_each_entry(pers, &pers_list, list) { 310 if (level != LEVEL_NONE && pers->level == level) 311 return pers; 312 if (strcmp(pers->name, clevel)==0) 313 return pers; 314 } 315 return NULL; 316 } 317 318 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 319 { 320 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 321 return MD_NEW_SIZE_BLOCKS(size); 322 } 323 324 static sector_t calc_dev_size(mdk_rdev_t 
*rdev, unsigned chunk_size) 325 { 326 sector_t size; 327 328 size = rdev->sb_offset; 329 330 if (chunk_size) 331 size &= ~((sector_t)chunk_size/1024 - 1); 332 return size; 333 } 334 335 static int alloc_disk_sb(mdk_rdev_t * rdev) 336 { 337 if (rdev->sb_page) 338 MD_BUG(); 339 340 rdev->sb_page = alloc_page(GFP_KERNEL); 341 if (!rdev->sb_page) { 342 printk(KERN_ALERT "md: out of memory.\n"); 343 return -EINVAL; 344 } 345 346 return 0; 347 } 348 349 static void free_disk_sb(mdk_rdev_t * rdev) 350 { 351 if (rdev->sb_page) { 352 put_page(rdev->sb_page); 353 rdev->sb_loaded = 0; 354 rdev->sb_page = NULL; 355 rdev->sb_offset = 0; 356 rdev->size = 0; 357 } 358 } 359 360 361 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 362 { 363 mdk_rdev_t *rdev = bio->bi_private; 364 mddev_t *mddev = rdev->mddev; 365 if (bio->bi_size) 366 return 1; 367 368 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 369 md_error(mddev, rdev); 370 371 if (atomic_dec_and_test(&mddev->pending_writes)) 372 wake_up(&mddev->sb_wait); 373 bio_put(bio); 374 return 0; 375 } 376 377 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 378 { 379 struct bio *bio2 = bio->bi_private; 380 mdk_rdev_t *rdev = bio2->bi_private; 381 mddev_t *mddev = rdev->mddev; 382 if (bio->bi_size) 383 return 1; 384 385 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 386 error == -EOPNOTSUPP) { 387 unsigned long flags; 388 /* barriers don't appear to be supported :-( */ 389 set_bit(BarriersNotsupp, &rdev->flags); 390 mddev->barriers_work = 0; 391 spin_lock_irqsave(&mddev->write_lock, flags); 392 bio2->bi_next = mddev->biolist; 393 mddev->biolist = bio2; 394 spin_unlock_irqrestore(&mddev->write_lock, flags); 395 wake_up(&mddev->sb_wait); 396 bio_put(bio); 397 return 0; 398 } 399 bio_put(bio2); 400 bio->bi_private = rdev; 401 return super_written(bio, bytes_done, error); 402 } 403 404 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 405 sector_t sector, int 
size, struct page *page) 406 { 407 /* write first size bytes of page to sector of rdev 408 * Increment mddev->pending_writes before returning 409 * and decrement it on completion, waking up sb_wait 410 * if zero is reached. 411 * If an error occurred, call md_error 412 * 413 * As we might need to resubmit the request if BIO_RW_BARRIER 414 * causes ENOTSUPP, we allocate a spare bio... 415 */ 416 struct bio *bio = bio_alloc(GFP_NOIO, 1); 417 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 418 419 bio->bi_bdev = rdev->bdev; 420 bio->bi_sector = sector; 421 bio_add_page(bio, page, size, 0); 422 bio->bi_private = rdev; 423 bio->bi_end_io = super_written; 424 bio->bi_rw = rw; 425 426 atomic_inc(&mddev->pending_writes); 427 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 428 struct bio *rbio; 429 rw |= (1<<BIO_RW_BARRIER); 430 rbio = bio_clone(bio, GFP_NOIO); 431 rbio->bi_private = bio; 432 rbio->bi_end_io = super_written_barrier; 433 submit_bio(rw, rbio); 434 } else 435 submit_bio(rw, bio); 436 } 437 438 void md_super_wait(mddev_t *mddev) 439 { 440 /* wait for all superblock writes that were scheduled to complete. 
441 * if any had to be retried (due to BARRIER problems), retry them 442 */ 443 DEFINE_WAIT(wq); 444 for(;;) { 445 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 446 if (atomic_read(&mddev->pending_writes)==0) 447 break; 448 while (mddev->biolist) { 449 struct bio *bio; 450 spin_lock_irq(&mddev->write_lock); 451 bio = mddev->biolist; 452 mddev->biolist = bio->bi_next ; 453 bio->bi_next = NULL; 454 spin_unlock_irq(&mddev->write_lock); 455 submit_bio(bio->bi_rw, bio); 456 } 457 schedule(); 458 } 459 finish_wait(&mddev->sb_wait, &wq); 460 } 461 462 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 463 { 464 if (bio->bi_size) 465 return 1; 466 467 complete((struct completion*)bio->bi_private); 468 return 0; 469 } 470 471 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 472 struct page *page, int rw) 473 { 474 struct bio *bio = bio_alloc(GFP_NOIO, 1); 475 struct completion event; 476 int ret; 477 478 rw |= (1 << BIO_RW_SYNC); 479 480 bio->bi_bdev = bdev; 481 bio->bi_sector = sector; 482 bio_add_page(bio, page, size, 0); 483 init_completion(&event); 484 bio->bi_private = &event; 485 bio->bi_end_io = bi_complete; 486 submit_bio(rw, bio); 487 wait_for_completion(&event); 488 489 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 490 bio_put(bio); 491 return ret; 492 } 493 EXPORT_SYMBOL_GPL(sync_page_io); 494 495 static int read_disk_sb(mdk_rdev_t * rdev, int size) 496 { 497 char b[BDEVNAME_SIZE]; 498 if (!rdev->sb_page) { 499 MD_BUG(); 500 return -EINVAL; 501 } 502 if (rdev->sb_loaded) 503 return 0; 504 505 506 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 507 goto fail; 508 rdev->sb_loaded = 1; 509 return 0; 510 511 fail: 512 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 513 bdevname(rdev->bdev,b)); 514 return -EINVAL; 515 } 516 517 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 518 { 519 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 520 
(sb1->set_uuid1 == sb2->set_uuid1) && 521 (sb1->set_uuid2 == sb2->set_uuid2) && 522 (sb1->set_uuid3 == sb2->set_uuid3)) 523 524 return 1; 525 526 return 0; 527 } 528 529 530 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 531 { 532 int ret; 533 mdp_super_t *tmp1, *tmp2; 534 535 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 536 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 537 538 if (!tmp1 || !tmp2) { 539 ret = 0; 540 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 541 goto abort; 542 } 543 544 *tmp1 = *sb1; 545 *tmp2 = *sb2; 546 547 /* 548 * nr_disks is not constant 549 */ 550 tmp1->nr_disks = 0; 551 tmp2->nr_disks = 0; 552 553 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 554 ret = 0; 555 else 556 ret = 1; 557 558 abort: 559 kfree(tmp1); 560 kfree(tmp2); 561 return ret; 562 } 563 564 static unsigned int calc_sb_csum(mdp_super_t * sb) 565 { 566 unsigned int disk_csum, csum; 567 568 disk_csum = sb->sb_csum; 569 sb->sb_csum = 0; 570 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 571 sb->sb_csum = disk_csum; 572 return csum; 573 } 574 575 576 /* 577 * Handle superblock details. 578 * We want to be able to handle multiple superblock formats 579 * so we have a common interface to them all, and an array of 580 * different handlers. 581 * We rely on user-space to write the initial superblock, and support 582 * reading and updating of superblocks. 583 * Interface methods are: 584 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 585 * loads and validates a superblock on dev. 586 * if refdev != NULL, compare superblocks on both devices 587 * Return: 588 * 0 - dev has a superblock that is compatible with refdev 589 * 1 - dev has a superblock that is compatible and newer than refdev 590 * so dev should be used as the refdev in future 591 * -EINVAL superblock incompatible or invalid 592 * -othererror e.g. -EIO 593 * 594 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 595 * Verify that dev is acceptable into mddev. 
596 * The first time, mddev->raid_disks will be 0, and data from 597 * dev should be merged in. Subsequent calls check that dev 598 * is new enough. Return 0 or -EINVAL 599 * 600 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 601 * Update the superblock for rdev with data in mddev 602 * This does not write to disc. 603 * 604 */ 605 606 struct super_type { 607 char *name; 608 struct module *owner; 609 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 610 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 611 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 612 }; 613 614 /* 615 * load_super for 0.90.0 616 */ 617 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 618 { 619 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 620 mdp_super_t *sb; 621 int ret; 622 sector_t sb_offset; 623 624 /* 625 * Calculate the position of the superblock, 626 * it's at the end of the disk. 627 * 628 * It also happens to be a multiple of 4Kb. 629 */ 630 sb_offset = calc_dev_sboffset(rdev->bdev); 631 rdev->sb_offset = sb_offset; 632 633 ret = read_disk_sb(rdev, MD_SB_BYTES); 634 if (ret) return ret; 635 636 ret = -EINVAL; 637 638 bdevname(rdev->bdev, b); 639 sb = (mdp_super_t*)page_address(rdev->sb_page); 640 641 if (sb->md_magic != MD_SB_MAGIC) { 642 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 643 b); 644 goto abort; 645 } 646 647 if (sb->major_version != 0 || 648 sb->minor_version != 90) { 649 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 650 sb->major_version, sb->minor_version, 651 b); 652 goto abort; 653 } 654 655 if (sb->raid_disks <= 0) 656 goto abort; 657 658 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 659 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 660 b); 661 goto abort; 662 } 663 664 rdev->preferred_minor = sb->md_minor; 665 rdev->data_offset = 0; 666 rdev->sb_size = MD_SB_BYTES; 667 668 if (sb->level == LEVEL_MULTIPATH) 669 rdev->desc_nr = -1; 670 
else 671 rdev->desc_nr = sb->this_disk.number; 672 673 if (refdev == 0) 674 ret = 1; 675 else { 676 __u64 ev1, ev2; 677 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 678 if (!uuid_equal(refsb, sb)) { 679 printk(KERN_WARNING "md: %s has different UUID to %s\n", 680 b, bdevname(refdev->bdev,b2)); 681 goto abort; 682 } 683 if (!sb_equal(refsb, sb)) { 684 printk(KERN_WARNING "md: %s has same UUID" 685 " but different superblock to %s\n", 686 b, bdevname(refdev->bdev, b2)); 687 goto abort; 688 } 689 ev1 = md_event(sb); 690 ev2 = md_event(refsb); 691 if (ev1 > ev2) 692 ret = 1; 693 else 694 ret = 0; 695 } 696 rdev->size = calc_dev_size(rdev, sb->chunk_size); 697 698 if (rdev->size < sb->size && sb->level > 1) 699 /* "this cannot possibly happen" ... */ 700 ret = -EINVAL; 701 702 abort: 703 return ret; 704 } 705 706 /* 707 * validate_super for 0.90.0 708 */ 709 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 710 { 711 mdp_disk_t *desc; 712 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 713 714 rdev->raid_disk = -1; 715 rdev->flags = 0; 716 if (mddev->raid_disks == 0) { 717 mddev->major_version = 0; 718 mddev->minor_version = sb->minor_version; 719 mddev->patch_version = sb->patch_version; 720 mddev->persistent = ! 
sb->not_persistent; 721 mddev->chunk_size = sb->chunk_size; 722 mddev->ctime = sb->ctime; 723 mddev->utime = sb->utime; 724 mddev->level = sb->level; 725 mddev->clevel[0] = 0; 726 mddev->layout = sb->layout; 727 mddev->raid_disks = sb->raid_disks; 728 mddev->size = sb->size; 729 mddev->events = md_event(sb); 730 mddev->bitmap_offset = 0; 731 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 732 733 if (sb->state & (1<<MD_SB_CLEAN)) 734 mddev->recovery_cp = MaxSector; 735 else { 736 if (sb->events_hi == sb->cp_events_hi && 737 sb->events_lo == sb->cp_events_lo) { 738 mddev->recovery_cp = sb->recovery_cp; 739 } else 740 mddev->recovery_cp = 0; 741 } 742 743 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 744 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 745 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 746 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 747 748 mddev->max_disks = MD_SB_DISKS; 749 750 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 751 mddev->bitmap_file == NULL) { 752 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 753 && mddev->level != 10) { 754 /* FIXME use a better test */ 755 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 756 return -EINVAL; 757 } 758 mddev->bitmap_offset = mddev->default_bitmap_offset; 759 } 760 761 } else if (mddev->pers == NULL) { 762 /* Insist on good event counter while assembling */ 763 __u64 ev1 = md_event(sb); 764 ++ev1; 765 if (ev1 < mddev->events) 766 return -EINVAL; 767 } else if (mddev->bitmap) { 768 /* if adding to array with a bitmap, then we can accept an 769 * older device ... but not too old. 
770 */ 771 __u64 ev1 = md_event(sb); 772 if (ev1 < mddev->bitmap->events_cleared) 773 return 0; 774 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 775 return 0; 776 777 if (mddev->level != LEVEL_MULTIPATH) { 778 desc = sb->disks + rdev->desc_nr; 779 780 if (desc->state & (1<<MD_DISK_FAULTY)) 781 set_bit(Faulty, &rdev->flags); 782 else if (desc->state & (1<<MD_DISK_SYNC) && 783 desc->raid_disk < mddev->raid_disks) { 784 set_bit(In_sync, &rdev->flags); 785 rdev->raid_disk = desc->raid_disk; 786 } 787 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 788 set_bit(WriteMostly, &rdev->flags); 789 } else /* MULTIPATH are always insync */ 790 set_bit(In_sync, &rdev->flags); 791 return 0; 792 } 793 794 /* 795 * sync_super for 0.90.0 796 */ 797 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 798 { 799 mdp_super_t *sb; 800 struct list_head *tmp; 801 mdk_rdev_t *rdev2; 802 int next_spare = mddev->raid_disks; 803 804 805 /* make rdev->sb match mddev data.. 806 * 807 * 1/ zero out disks 808 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 809 * 3/ any empty disks < next_spare become removed 810 * 811 * disks[0] gets initialised to REMOVED because 812 * we cannot be sure from other fields if it has 813 * been initialised or not. 
814 */ 815 int i; 816 int active=0, working=0,failed=0,spare=0,nr_disks=0; 817 818 rdev->sb_size = MD_SB_BYTES; 819 820 sb = (mdp_super_t*)page_address(rdev->sb_page); 821 822 memset(sb, 0, sizeof(*sb)); 823 824 sb->md_magic = MD_SB_MAGIC; 825 sb->major_version = mddev->major_version; 826 sb->minor_version = mddev->minor_version; 827 sb->patch_version = mddev->patch_version; 828 sb->gvalid_words = 0; /* ignored */ 829 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 830 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 831 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 832 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 833 834 sb->ctime = mddev->ctime; 835 sb->level = mddev->level; 836 sb->size = mddev->size; 837 sb->raid_disks = mddev->raid_disks; 838 sb->md_minor = mddev->md_minor; 839 sb->not_persistent = !mddev->persistent; 840 sb->utime = mddev->utime; 841 sb->state = 0; 842 sb->events_hi = (mddev->events>>32); 843 sb->events_lo = (u32)mddev->events; 844 845 if (mddev->in_sync) 846 { 847 sb->recovery_cp = mddev->recovery_cp; 848 sb->cp_events_hi = (mddev->events>>32); 849 sb->cp_events_lo = (u32)mddev->events; 850 if (mddev->recovery_cp == MaxSector) 851 sb->state = (1<< MD_SB_CLEAN); 852 } else 853 sb->recovery_cp = 0; 854 855 sb->layout = mddev->layout; 856 sb->chunk_size = mddev->chunk_size; 857 858 if (mddev->bitmap && mddev->bitmap_file == NULL) 859 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 860 861 sb->disks[0].state = (1<<MD_DISK_REMOVED); 862 ITERATE_RDEV(mddev,rdev2,tmp) { 863 mdp_disk_t *d; 864 int desc_nr; 865 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 866 && !test_bit(Faulty, &rdev2->flags)) 867 desc_nr = rdev2->raid_disk; 868 else 869 desc_nr = next_spare++; 870 rdev2->desc_nr = desc_nr; 871 d = &sb->disks[rdev2->desc_nr]; 872 nr_disks++; 873 d->number = rdev2->desc_nr; 874 d->major = MAJOR(rdev2->bdev->bd_dev); 875 d->minor = MINOR(rdev2->bdev->bd_dev); 876 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 877 && !test_bit(Faulty, 
&rdev2->flags)) 878 d->raid_disk = rdev2->raid_disk; 879 else 880 d->raid_disk = rdev2->desc_nr; /* compatibility */ 881 if (test_bit(Faulty, &rdev2->flags)) { 882 d->state = (1<<MD_DISK_FAULTY); 883 failed++; 884 } else if (test_bit(In_sync, &rdev2->flags)) { 885 d->state = (1<<MD_DISK_ACTIVE); 886 d->state |= (1<<MD_DISK_SYNC); 887 active++; 888 working++; 889 } else { 890 d->state = 0; 891 spare++; 892 working++; 893 } 894 if (test_bit(WriteMostly, &rdev2->flags)) 895 d->state |= (1<<MD_DISK_WRITEMOSTLY); 896 } 897 /* now set the "removed" and "faulty" bits on any missing devices */ 898 for (i=0 ; i < mddev->raid_disks ; i++) { 899 mdp_disk_t *d = &sb->disks[i]; 900 if (d->state == 0 && d->number == 0) { 901 d->number = i; 902 d->raid_disk = i; 903 d->state = (1<<MD_DISK_REMOVED); 904 d->state |= (1<<MD_DISK_FAULTY); 905 failed++; 906 } 907 } 908 sb->nr_disks = nr_disks; 909 sb->active_disks = active; 910 sb->working_disks = working; 911 sb->failed_disks = failed; 912 sb->spare_disks = spare; 913 914 sb->this_disk = sb->disks[rdev->desc_nr]; 915 sb->sb_csum = calc_sb_csum(sb); 916 } 917 918 /* 919 * version 1 superblock 920 */ 921 922 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 923 { 924 unsigned int disk_csum, csum; 925 unsigned long long newcsum; 926 int size = 256 + le32_to_cpu(sb->max_dev)*2; 927 unsigned int *isuper = (unsigned int*)sb; 928 int i; 929 930 disk_csum = sb->sb_csum; 931 sb->sb_csum = 0; 932 newcsum = 0; 933 for (i=0; size>=4; size -= 4 ) 934 newcsum += le32_to_cpu(*isuper++); 935 936 if (size == 2) 937 newcsum += le16_to_cpu(*(unsigned short*) isuper); 938 939 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 940 sb->sb_csum = disk_csum; 941 return cpu_to_le32(csum); 942 } 943 944 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 945 { 946 struct mdp_superblock_1 *sb; 947 int ret; 948 sector_t sb_offset; 949 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 950 int bmask; 951 952 /* 953 * Calculate 
the position of the superblock. 954 * It is always aligned to a 4K boundary and 955 * depeding on minor_version, it can be: 956 * 0: At least 8K, but less than 12K, from end of device 957 * 1: At start of device 958 * 2: 4K from start of device. 959 */ 960 switch(minor_version) { 961 case 0: 962 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 963 sb_offset -= 8*2; 964 sb_offset &= ~(sector_t)(4*2-1); 965 /* convert from sectors to K */ 966 sb_offset /= 2; 967 break; 968 case 1: 969 sb_offset = 0; 970 break; 971 case 2: 972 sb_offset = 4; 973 break; 974 default: 975 return -EINVAL; 976 } 977 rdev->sb_offset = sb_offset; 978 979 /* superblock is rarely larger than 1K, but it can be larger, 980 * and it is safe to read 4k, so we do that 981 */ 982 ret = read_disk_sb(rdev, 4096); 983 if (ret) return ret; 984 985 986 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 987 988 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 989 sb->major_version != cpu_to_le32(1) || 990 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 991 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 992 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 993 return -EINVAL; 994 995 if (calc_sb_1_csum(sb) != sb->sb_csum) { 996 printk("md: invalid superblock checksum on %s\n", 997 bdevname(rdev->bdev,b)); 998 return -EINVAL; 999 } 1000 if (le64_to_cpu(sb->data_size) < 10) { 1001 printk("md: data_size too small on %s\n", 1002 bdevname(rdev->bdev,b)); 1003 return -EINVAL; 1004 } 1005 rdev->preferred_minor = 0xffff; 1006 rdev->data_offset = le64_to_cpu(sb->data_offset); 1007 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1008 1009 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1010 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1011 if (rdev->sb_size & bmask) 1012 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1013 1014 if (refdev == 0) 1015 return 1; 1016 else { 1017 __u64 ev1, ev2; 1018 struct mdp_superblock_1 *refsb = 1019 (struct 
mdp_superblock_1*)page_address(refdev->sb_page); 1020 1021 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1022 sb->level != refsb->level || 1023 sb->layout != refsb->layout || 1024 sb->chunksize != refsb->chunksize) { 1025 printk(KERN_WARNING "md: %s has strangely different" 1026 " superblock to %s\n", 1027 bdevname(rdev->bdev,b), 1028 bdevname(refdev->bdev,b2)); 1029 return -EINVAL; 1030 } 1031 ev1 = le64_to_cpu(sb->events); 1032 ev2 = le64_to_cpu(refsb->events); 1033 1034 if (ev1 > ev2) 1035 return 1; 1036 } 1037 if (minor_version) 1038 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1039 else 1040 rdev->size = rdev->sb_offset; 1041 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1042 return -EINVAL; 1043 rdev->size = le64_to_cpu(sb->data_size)/2; 1044 if (le32_to_cpu(sb->chunksize)) 1045 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1046 1047 if (le32_to_cpu(sb->size) > rdev->size*2) 1048 return -EINVAL; 1049 return 0; 1050 } 1051 1052 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1053 { 1054 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1055 1056 rdev->raid_disk = -1; 1057 rdev->flags = 0; 1058 if (mddev->raid_disks == 0) { 1059 mddev->major_version = 1; 1060 mddev->patch_version = 0; 1061 mddev->persistent = 1; 1062 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1063 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1064 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1065 mddev->level = le32_to_cpu(sb->level); 1066 mddev->clevel[0] = 0; 1067 mddev->layout = le32_to_cpu(sb->layout); 1068 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1069 mddev->size = le64_to_cpu(sb->size)/2; 1070 mddev->events = le64_to_cpu(sb->events); 1071 mddev->bitmap_offset = 0; 1072 mddev->default_bitmap_offset = 1024; 1073 1074 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1075 memcpy(mddev->uuid, sb->set_uuid, 16); 1076 1077 
mddev->max_disks = (4096-256)/2; 1078 1079 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1080 mddev->bitmap_file == NULL ) { 1081 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1082 && mddev->level != 10) { 1083 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1084 return -EINVAL; 1085 } 1086 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1087 } 1088 } else if (mddev->pers == NULL) { 1089 /* Insist of good event counter while assembling */ 1090 __u64 ev1 = le64_to_cpu(sb->events); 1091 ++ev1; 1092 if (ev1 < mddev->events) 1093 return -EINVAL; 1094 } else if (mddev->bitmap) { 1095 /* If adding to array with a bitmap, then we can accept an 1096 * older device, but not too old. 1097 */ 1098 __u64 ev1 = le64_to_cpu(sb->events); 1099 if (ev1 < mddev->bitmap->events_cleared) 1100 return 0; 1101 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1102 return 0; 1103 1104 if (mddev->level != LEVEL_MULTIPATH) { 1105 int role; 1106 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1107 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1108 switch(role) { 1109 case 0xffff: /* spare */ 1110 break; 1111 case 0xfffe: /* faulty */ 1112 set_bit(Faulty, &rdev->flags); 1113 break; 1114 default: 1115 set_bit(In_sync, &rdev->flags); 1116 rdev->raid_disk = role; 1117 break; 1118 } 1119 if (sb->devflags & WriteMostly1) 1120 set_bit(WriteMostly, &rdev->flags); 1121 } else /* MULTIPATH are always insync */ 1122 set_bit(In_sync, &rdev->flags); 1123 1124 return 0; 1125 } 1126 1127 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1128 { 1129 struct mdp_superblock_1 *sb; 1130 struct list_head *tmp; 1131 mdk_rdev_t *rdev2; 1132 int max_dev, i; 1133 /* make rdev->sb match mddev and rdev data. 
*/ 1134 1135 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1136 1137 sb->feature_map = 0; 1138 sb->pad0 = 0; 1139 memset(sb->pad1, 0, sizeof(sb->pad1)); 1140 memset(sb->pad2, 0, sizeof(sb->pad2)); 1141 memset(sb->pad3, 0, sizeof(sb->pad3)); 1142 1143 sb->utime = cpu_to_le64((__u64)mddev->utime); 1144 sb->events = cpu_to_le64(mddev->events); 1145 if (mddev->in_sync) 1146 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1147 else 1148 sb->resync_offset = cpu_to_le64(0); 1149 1150 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); 1151 1152 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1153 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1154 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1155 } 1156 1157 max_dev = 0; 1158 ITERATE_RDEV(mddev,rdev2,tmp) 1159 if (rdev2->desc_nr+1 > max_dev) 1160 max_dev = rdev2->desc_nr+1; 1161 1162 sb->max_dev = cpu_to_le32(max_dev); 1163 for (i=0; i<max_dev;i++) 1164 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1165 1166 ITERATE_RDEV(mddev,rdev2,tmp) { 1167 i = rdev2->desc_nr; 1168 if (test_bit(Faulty, &rdev2->flags)) 1169 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1170 else if (test_bit(In_sync, &rdev2->flags)) 1171 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1172 else 1173 sb->dev_roles[i] = cpu_to_le16(0xffff); 1174 } 1175 1176 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1177 sb->sb_csum = calc_sb_1_csum(sb); 1178 } 1179 1180 1181 static struct super_type super_types[] = { 1182 [0] = { 1183 .name = "0.90.0", 1184 .owner = THIS_MODULE, 1185 .load_super = super_90_load, 1186 .validate_super = super_90_validate, 1187 .sync_super = super_90_sync, 1188 }, 1189 [1] = { 1190 .name = "md-1", 1191 .owner = THIS_MODULE, 1192 .load_super = super_1_load, 1193 .validate_super = super_1_validate, 1194 .sync_super = super_1_sync, 1195 }, 1196 }; 1197 1198 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1199 { 1200 struct list_head *tmp; 1201 mdk_rdev_t 
*rdev; 1202 1203 ITERATE_RDEV(mddev,rdev,tmp) 1204 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1205 return rdev; 1206 1207 return NULL; 1208 } 1209 1210 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1211 { 1212 struct list_head *tmp; 1213 mdk_rdev_t *rdev; 1214 1215 ITERATE_RDEV(mddev1,rdev,tmp) 1216 if (match_dev_unit(mddev2, rdev)) 1217 return 1; 1218 1219 return 0; 1220 } 1221 1222 static LIST_HEAD(pending_raid_disks); 1223 1224 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1225 { 1226 mdk_rdev_t *same_pdev; 1227 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1228 struct kobject *ko; 1229 1230 if (rdev->mddev) { 1231 MD_BUG(); 1232 return -EINVAL; 1233 } 1234 /* make sure rdev->size exceeds mddev->size */ 1235 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1236 if (mddev->pers) 1237 /* Cannot change size, so fail */ 1238 return -ENOSPC; 1239 else 1240 mddev->size = rdev->size; 1241 } 1242 same_pdev = match_dev_unit(mddev, rdev); 1243 if (same_pdev) 1244 printk(KERN_WARNING 1245 "%s: WARNING: %s appears to be on the same physical" 1246 " disk as %s. True\n protection against single-disk" 1247 " failure might be compromised.\n", 1248 mdname(mddev), bdevname(rdev->bdev,b), 1249 bdevname(same_pdev->bdev,b2)); 1250 1251 /* Verify rdev->desc_nr is unique. 
1252 * If it is -1, assign a free number, else 1253 * check number is not in use 1254 */ 1255 if (rdev->desc_nr < 0) { 1256 int choice = 0; 1257 if (mddev->pers) choice = mddev->raid_disks; 1258 while (find_rdev_nr(mddev, choice)) 1259 choice++; 1260 rdev->desc_nr = choice; 1261 } else { 1262 if (find_rdev_nr(mddev, rdev->desc_nr)) 1263 return -EBUSY; 1264 } 1265 bdevname(rdev->bdev,b); 1266 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1267 return -ENOMEM; 1268 1269 list_add(&rdev->same_set, &mddev->disks); 1270 rdev->mddev = mddev; 1271 printk(KERN_INFO "md: bind<%s>\n", b); 1272 1273 rdev->kobj.parent = &mddev->kobj; 1274 kobject_add(&rdev->kobj); 1275 1276 if (rdev->bdev->bd_part) 1277 ko = &rdev->bdev->bd_part->kobj; 1278 else 1279 ko = &rdev->bdev->bd_disk->kobj; 1280 sysfs_create_link(&rdev->kobj, ko, "block"); 1281 return 0; 1282 } 1283 1284 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1285 { 1286 char b[BDEVNAME_SIZE]; 1287 if (!rdev->mddev) { 1288 MD_BUG(); 1289 return; 1290 } 1291 list_del_init(&rdev->same_set); 1292 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1293 rdev->mddev = NULL; 1294 sysfs_remove_link(&rdev->kobj, "block"); 1295 kobject_del(&rdev->kobj); 1296 } 1297 1298 /* 1299 * prevent the device from being mounted, repartitioned or 1300 * otherwise reused by a RAID array (or any other kernel 1301 * subsystem), by bd_claiming the device. 
1302 */ 1303 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1304 { 1305 int err = 0; 1306 struct block_device *bdev; 1307 char b[BDEVNAME_SIZE]; 1308 1309 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1310 if (IS_ERR(bdev)) { 1311 printk(KERN_ERR "md: could not open %s.\n", 1312 __bdevname(dev, b)); 1313 return PTR_ERR(bdev); 1314 } 1315 err = bd_claim(bdev, rdev); 1316 if (err) { 1317 printk(KERN_ERR "md: could not bd_claim %s.\n", 1318 bdevname(bdev, b)); 1319 blkdev_put(bdev); 1320 return err; 1321 } 1322 rdev->bdev = bdev; 1323 return err; 1324 } 1325 1326 static void unlock_rdev(mdk_rdev_t *rdev) 1327 { 1328 struct block_device *bdev = rdev->bdev; 1329 rdev->bdev = NULL; 1330 if (!bdev) 1331 MD_BUG(); 1332 bd_release(bdev); 1333 blkdev_put(bdev); 1334 } 1335 1336 void md_autodetect_dev(dev_t dev); 1337 1338 static void export_rdev(mdk_rdev_t * rdev) 1339 { 1340 char b[BDEVNAME_SIZE]; 1341 printk(KERN_INFO "md: export_rdev(%s)\n", 1342 bdevname(rdev->bdev,b)); 1343 if (rdev->mddev) 1344 MD_BUG(); 1345 free_disk_sb(rdev); 1346 list_del_init(&rdev->same_set); 1347 #ifndef MODULE 1348 md_autodetect_dev(rdev->bdev->bd_dev); 1349 #endif 1350 unlock_rdev(rdev); 1351 kobject_put(&rdev->kobj); 1352 } 1353 1354 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1355 { 1356 unbind_rdev_from_array(rdev); 1357 export_rdev(rdev); 1358 } 1359 1360 static void export_array(mddev_t *mddev) 1361 { 1362 struct list_head *tmp; 1363 mdk_rdev_t *rdev; 1364 1365 ITERATE_RDEV(mddev,rdev,tmp) { 1366 if (!rdev->mddev) { 1367 MD_BUG(); 1368 continue; 1369 } 1370 kick_rdev_from_array(rdev); 1371 } 1372 if (!list_empty(&mddev->disks)) 1373 MD_BUG(); 1374 mddev->raid_disks = 0; 1375 mddev->major_version = 0; 1376 } 1377 1378 static void print_desc(mdp_disk_t *desc) 1379 { 1380 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1381 desc->major,desc->minor,desc->raid_disk,desc->state); 1382 } 1383 1384 static void print_sb(mdp_super_t *sb) 1385 { 1386 int i; 1387 1388 
printk(KERN_INFO 1389 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1390 sb->major_version, sb->minor_version, sb->patch_version, 1391 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1392 sb->ctime); 1393 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1394 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1395 sb->md_minor, sb->layout, sb->chunk_size); 1396 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1397 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1398 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1399 sb->failed_disks, sb->spare_disks, 1400 sb->sb_csum, (unsigned long)sb->events_lo); 1401 1402 printk(KERN_INFO); 1403 for (i = 0; i < MD_SB_DISKS; i++) { 1404 mdp_disk_t *desc; 1405 1406 desc = sb->disks + i; 1407 if (desc->number || desc->major || desc->minor || 1408 desc->raid_disk || (desc->state && (desc->state != 4))) { 1409 printk(" D %2d: ", i); 1410 print_desc(desc); 1411 } 1412 } 1413 printk(KERN_INFO "md: THIS: "); 1414 print_desc(&sb->this_disk); 1415 1416 } 1417 1418 static void print_rdev(mdk_rdev_t *rdev) 1419 { 1420 char b[BDEVNAME_SIZE]; 1421 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1422 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1423 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1424 rdev->desc_nr); 1425 if (rdev->sb_loaded) { 1426 printk(KERN_INFO "md: rdev superblock:\n"); 1427 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1428 } else 1429 printk(KERN_INFO "md: no rdev superblock!\n"); 1430 } 1431 1432 void md_print_devices(void) 1433 { 1434 struct list_head *tmp, *tmp2; 1435 mdk_rdev_t *rdev; 1436 mddev_t *mddev; 1437 char b[BDEVNAME_SIZE]; 1438 1439 printk("\n"); 1440 printk("md: **********************************\n"); 1441 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1442 printk("md: **********************************\n"); 1443 ITERATE_MDDEV(mddev,tmp) { 1444 1445 if (mddev->bitmap) 1446 bitmap_print_sb(mddev->bitmap); 1447 
else 1448 printk("%s: ", mdname(mddev)); 1449 ITERATE_RDEV(mddev,rdev,tmp2) 1450 printk("<%s>", bdevname(rdev->bdev,b)); 1451 printk("\n"); 1452 1453 ITERATE_RDEV(mddev,rdev,tmp2) 1454 print_rdev(rdev); 1455 } 1456 printk("md: **********************************\n"); 1457 printk("\n"); 1458 } 1459 1460 1461 static void sync_sbs(mddev_t * mddev) 1462 { 1463 mdk_rdev_t *rdev; 1464 struct list_head *tmp; 1465 1466 ITERATE_RDEV(mddev,rdev,tmp) { 1467 super_types[mddev->major_version]. 1468 sync_super(mddev, rdev); 1469 rdev->sb_loaded = 1; 1470 } 1471 } 1472 1473 static void md_update_sb(mddev_t * mddev) 1474 { 1475 int err; 1476 struct list_head *tmp; 1477 mdk_rdev_t *rdev; 1478 int sync_req; 1479 1480 repeat: 1481 spin_lock_irq(&mddev->write_lock); 1482 sync_req = mddev->in_sync; 1483 mddev->utime = get_seconds(); 1484 mddev->events ++; 1485 1486 if (!mddev->events) { 1487 /* 1488 * oops, this 64-bit counter should never wrap. 1489 * Either we are in around ~1 trillion A.C., assuming 1490 * 1 reboot per second, or we have a bug: 1491 */ 1492 MD_BUG(); 1493 mddev->events --; 1494 } 1495 mddev->sb_dirty = 2; 1496 sync_sbs(mddev); 1497 1498 /* 1499 * do not write anything to disk if using 1500 * nonpersistent superblocks 1501 */ 1502 if (!mddev->persistent) { 1503 mddev->sb_dirty = 0; 1504 spin_unlock_irq(&mddev->write_lock); 1505 wake_up(&mddev->sb_wait); 1506 return; 1507 } 1508 spin_unlock_irq(&mddev->write_lock); 1509 1510 dprintk(KERN_INFO 1511 "md: updating %s RAID superblock on device (in sync %d)\n", 1512 mdname(mddev),mddev->in_sync); 1513 1514 err = bitmap_update_sb(mddev->bitmap); 1515 ITERATE_RDEV(mddev,rdev,tmp) { 1516 char b[BDEVNAME_SIZE]; 1517 dprintk(KERN_INFO "md: "); 1518 if (test_bit(Faulty, &rdev->flags)) 1519 dprintk("(skipping faulty "); 1520 1521 dprintk("%s ", bdevname(rdev->bdev,b)); 1522 if (!test_bit(Faulty, &rdev->flags)) { 1523 md_super_write(mddev,rdev, 1524 rdev->sb_offset<<1, rdev->sb_size, 1525 rdev->sb_page); 1526 dprintk(KERN_INFO 
/*
 * Words written to sysfs files may or may not be '\n' terminated, and
 * we want to accept either form.  cmd_match() does that comparison.
 *
 * Returns 1 when cmd (the text written to the sysfs file) is exactly
 * str, or is str followed by a single trailing newline; 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* walk the common prefix of the two strings */
	for (; *cmd && *str; cmd++, str++)
		if (*cmd != *str)
			break;

	/* one newline right after the common prefix is tolerated */
	if (*cmd == '\n')
		cmd++;

	/* a match requires both strings to be used up completely */
	return !*str && !*cmd;
}
__ATTR_RO(state); 1601 1602 static ssize_t 1603 super_show(mdk_rdev_t *rdev, char *page) 1604 { 1605 if (rdev->sb_loaded && rdev->sb_size) { 1606 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1607 return rdev->sb_size; 1608 } else 1609 return 0; 1610 } 1611 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1612 1613 static ssize_t 1614 errors_show(mdk_rdev_t *rdev, char *page) 1615 { 1616 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1617 } 1618 1619 static ssize_t 1620 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1621 { 1622 char *e; 1623 unsigned long n = simple_strtoul(buf, &e, 10); 1624 if (*buf && (*e == 0 || *e == '\n')) { 1625 atomic_set(&rdev->corrected_errors, n); 1626 return len; 1627 } 1628 return -EINVAL; 1629 } 1630 static struct rdev_sysfs_entry rdev_errors = 1631 __ATTR(errors, 0644, errors_show, errors_store); 1632 1633 static ssize_t 1634 slot_show(mdk_rdev_t *rdev, char *page) 1635 { 1636 if (rdev->raid_disk < 0) 1637 return sprintf(page, "none\n"); 1638 else 1639 return sprintf(page, "%d\n", rdev->raid_disk); 1640 } 1641 1642 static ssize_t 1643 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1644 { 1645 char *e; 1646 int slot = simple_strtoul(buf, &e, 10); 1647 if (strncmp(buf, "none", 4)==0) 1648 slot = -1; 1649 else if (e==buf || (*e && *e!= '\n')) 1650 return -EINVAL; 1651 if (rdev->mddev->pers) 1652 /* Cannot set slot in active array (yet) */ 1653 return -EBUSY; 1654 if (slot >= rdev->mddev->raid_disks) 1655 return -ENOSPC; 1656 rdev->raid_disk = slot; 1657 /* assume it is working */ 1658 rdev->flags = 0; 1659 set_bit(In_sync, &rdev->flags); 1660 return len; 1661 } 1662 1663 1664 static struct rdev_sysfs_entry rdev_slot = 1665 __ATTR(slot, 0644, slot_show, slot_store); 1666 1667 static struct attribute *rdev_default_attrs[] = { 1668 &rdev_state.attr, 1669 &rdev_super.attr, 1670 &rdev_errors.attr, 1671 &rdev_slot.attr, 1672 NULL, 1673 }; 1674 static ssize_t 1675 
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1676 { 1677 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1678 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1679 1680 if (!entry->show) 1681 return -EIO; 1682 return entry->show(rdev, page); 1683 } 1684 1685 static ssize_t 1686 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1687 const char *page, size_t length) 1688 { 1689 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1690 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1691 1692 if (!entry->store) 1693 return -EIO; 1694 return entry->store(rdev, page, length); 1695 } 1696 1697 static void rdev_free(struct kobject *ko) 1698 { 1699 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1700 kfree(rdev); 1701 } 1702 static struct sysfs_ops rdev_sysfs_ops = { 1703 .show = rdev_attr_show, 1704 .store = rdev_attr_store, 1705 }; 1706 static struct kobj_type rdev_ktype = { 1707 .release = rdev_free, 1708 .sysfs_ops = &rdev_sysfs_ops, 1709 .default_attrs = rdev_default_attrs, 1710 }; 1711 1712 /* 1713 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1714 * 1715 * mark the device faulty if: 1716 * 1717 * - the device is nonexistent (zero size) 1718 * - the device has no valid superblock 1719 * 1720 * a faulty rdev _never_ has rdev->sb set. 
1721 */ 1722 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1723 { 1724 char b[BDEVNAME_SIZE]; 1725 int err; 1726 mdk_rdev_t *rdev; 1727 sector_t size; 1728 1729 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1730 if (!rdev) { 1731 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1732 return ERR_PTR(-ENOMEM); 1733 } 1734 1735 if ((err = alloc_disk_sb(rdev))) 1736 goto abort_free; 1737 1738 err = lock_rdev(rdev, newdev); 1739 if (err) 1740 goto abort_free; 1741 1742 rdev->kobj.parent = NULL; 1743 rdev->kobj.ktype = &rdev_ktype; 1744 kobject_init(&rdev->kobj); 1745 1746 rdev->desc_nr = -1; 1747 rdev->flags = 0; 1748 rdev->data_offset = 0; 1749 atomic_set(&rdev->nr_pending, 0); 1750 atomic_set(&rdev->read_errors, 0); 1751 atomic_set(&rdev->corrected_errors, 0); 1752 1753 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1754 if (!size) { 1755 printk(KERN_WARNING 1756 "md: %s has zero or unknown size, marking faulty!\n", 1757 bdevname(rdev->bdev,b)); 1758 err = -EINVAL; 1759 goto abort_free; 1760 } 1761 1762 if (super_format >= 0) { 1763 err = super_types[super_format]. 
1764 load_super(rdev, NULL, super_minor); 1765 if (err == -EINVAL) { 1766 printk(KERN_WARNING 1767 "md: %s has invalid sb, not importing!\n", 1768 bdevname(rdev->bdev,b)); 1769 goto abort_free; 1770 } 1771 if (err < 0) { 1772 printk(KERN_WARNING 1773 "md: could not read %s's sb, not importing!\n", 1774 bdevname(rdev->bdev,b)); 1775 goto abort_free; 1776 } 1777 } 1778 INIT_LIST_HEAD(&rdev->same_set); 1779 1780 return rdev; 1781 1782 abort_free: 1783 if (rdev->sb_page) { 1784 if (rdev->bdev) 1785 unlock_rdev(rdev); 1786 free_disk_sb(rdev); 1787 } 1788 kfree(rdev); 1789 return ERR_PTR(err); 1790 } 1791 1792 /* 1793 * Check a full RAID array for plausibility 1794 */ 1795 1796 1797 static void analyze_sbs(mddev_t * mddev) 1798 { 1799 int i; 1800 struct list_head *tmp; 1801 mdk_rdev_t *rdev, *freshest; 1802 char b[BDEVNAME_SIZE]; 1803 1804 freshest = NULL; 1805 ITERATE_RDEV(mddev,rdev,tmp) 1806 switch (super_types[mddev->major_version]. 1807 load_super(rdev, freshest, mddev->minor_version)) { 1808 case 1: 1809 freshest = rdev; 1810 break; 1811 case 0: 1812 break; 1813 default: 1814 printk( KERN_ERR \ 1815 "md: fatal superblock inconsistency in %s" 1816 " -- removing from array\n", 1817 bdevname(rdev->bdev,b)); 1818 kick_rdev_from_array(rdev); 1819 } 1820 1821 1822 super_types[mddev->major_version]. 1823 validate_super(mddev, freshest); 1824 1825 i = 0; 1826 ITERATE_RDEV(mddev,rdev,tmp) { 1827 if (rdev != freshest) 1828 if (super_types[mddev->major_version]. 
1829 validate_super(mddev, rdev)) { 1830 printk(KERN_WARNING "md: kicking non-fresh %s" 1831 " from array!\n", 1832 bdevname(rdev->bdev,b)); 1833 kick_rdev_from_array(rdev); 1834 continue; 1835 } 1836 if (mddev->level == LEVEL_MULTIPATH) { 1837 rdev->desc_nr = i++; 1838 rdev->raid_disk = rdev->desc_nr; 1839 set_bit(In_sync, &rdev->flags); 1840 } 1841 } 1842 1843 1844 1845 if (mddev->recovery_cp != MaxSector && 1846 mddev->level >= 1) 1847 printk(KERN_ERR "md: %s: raid array is not clean" 1848 " -- starting background reconstruction\n", 1849 mdname(mddev)); 1850 1851 } 1852 1853 static ssize_t 1854 level_show(mddev_t *mddev, char *page) 1855 { 1856 struct mdk_personality *p = mddev->pers; 1857 if (p) 1858 return sprintf(page, "%s\n", p->name); 1859 else if (mddev->clevel[0]) 1860 return sprintf(page, "%s\n", mddev->clevel); 1861 else if (mddev->level != LEVEL_NONE) 1862 return sprintf(page, "%d\n", mddev->level); 1863 else 1864 return 0; 1865 } 1866 1867 static ssize_t 1868 level_store(mddev_t *mddev, const char *buf, size_t len) 1869 { 1870 int rv = len; 1871 if (mddev->pers) 1872 return -EBUSY; 1873 if (len == 0) 1874 return 0; 1875 if (len >= sizeof(mddev->clevel)) 1876 return -ENOSPC; 1877 strncpy(mddev->clevel, buf, len); 1878 if (mddev->clevel[len-1] == '\n') 1879 len--; 1880 mddev->clevel[len] = 0; 1881 mddev->level = LEVEL_NONE; 1882 return rv; 1883 } 1884 1885 static struct md_sysfs_entry md_level = 1886 __ATTR(level, 0644, level_show, level_store); 1887 1888 static ssize_t 1889 raid_disks_show(mddev_t *mddev, char *page) 1890 { 1891 if (mddev->raid_disks == 0) 1892 return 0; 1893 return sprintf(page, "%d\n", mddev->raid_disks); 1894 } 1895 1896 static int update_raid_disks(mddev_t *mddev, int raid_disks); 1897 1898 static ssize_t 1899 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 1900 { 1901 /* can only set raid_disks if array is not yet active */ 1902 char *e; 1903 int rv = 0; 1904 unsigned long n = simple_strtoul(buf, &e, 10); 1905 1906 
if (!*buf || (*e && *e != '\n')) 1907 return -EINVAL; 1908 1909 if (mddev->pers) 1910 rv = update_raid_disks(mddev, n); 1911 else 1912 mddev->raid_disks = n; 1913 return rv ? rv : len; 1914 } 1915 static struct md_sysfs_entry md_raid_disks = 1916 __ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); 1917 1918 static ssize_t 1919 chunk_size_show(mddev_t *mddev, char *page) 1920 { 1921 return sprintf(page, "%d\n", mddev->chunk_size); 1922 } 1923 1924 static ssize_t 1925 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 1926 { 1927 /* can only set chunk_size if array is not yet active */ 1928 char *e; 1929 unsigned long n = simple_strtoul(buf, &e, 10); 1930 1931 if (mddev->pers) 1932 return -EBUSY; 1933 if (!*buf || (*e && *e != '\n')) 1934 return -EINVAL; 1935 1936 mddev->chunk_size = n; 1937 return len; 1938 } 1939 static struct md_sysfs_entry md_chunk_size = 1940 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); 1941 1942 1943 static ssize_t 1944 size_show(mddev_t *mddev, char *page) 1945 { 1946 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 1947 } 1948 1949 static int update_size(mddev_t *mddev, unsigned long size); 1950 1951 static ssize_t 1952 size_store(mddev_t *mddev, const char *buf, size_t len) 1953 { 1954 /* If array is inactive, we can reduce the component size, but 1955 * not increase it (except from 0). 1956 * If array is active, we can try an on-line resize 1957 */ 1958 char *e; 1959 int err = 0; 1960 unsigned long long size = simple_strtoull(buf, &e, 10); 1961 if (!*buf || *buf == '\n' || 1962 (*e && *e != '\n')) 1963 return -EINVAL; 1964 1965 if (mddev->pers) { 1966 err = update_size(mddev, size); 1967 md_update_sb(mddev); 1968 } else { 1969 if (mddev->size == 0 || 1970 mddev->size > size) 1971 mddev->size = size; 1972 else 1973 err = -ENOSPC; 1974 } 1975 return err ? 
err : len; 1976 } 1977 1978 static struct md_sysfs_entry md_size = 1979 __ATTR(component_size, 0644, size_show, size_store); 1980 1981 1982 /* Metdata version. 1983 * This is either 'none' for arrays with externally managed metadata, 1984 * or N.M for internally known formats 1985 */ 1986 static ssize_t 1987 metadata_show(mddev_t *mddev, char *page) 1988 { 1989 if (mddev->persistent) 1990 return sprintf(page, "%d.%d\n", 1991 mddev->major_version, mddev->minor_version); 1992 else 1993 return sprintf(page, "none\n"); 1994 } 1995 1996 static ssize_t 1997 metadata_store(mddev_t *mddev, const char *buf, size_t len) 1998 { 1999 int major, minor; 2000 char *e; 2001 if (!list_empty(&mddev->disks)) 2002 return -EBUSY; 2003 2004 if (cmd_match(buf, "none")) { 2005 mddev->persistent = 0; 2006 mddev->major_version = 0; 2007 mddev->minor_version = 90; 2008 return len; 2009 } 2010 major = simple_strtoul(buf, &e, 10); 2011 if (e==buf || *e != '.') 2012 return -EINVAL; 2013 buf = e+1; 2014 minor = simple_strtoul(buf, &e, 10); 2015 if (e==buf || *e != '\n') 2016 return -EINVAL; 2017 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2018 super_types[major].name == NULL) 2019 return -ENOENT; 2020 mddev->major_version = major; 2021 mddev->minor_version = minor; 2022 mddev->persistent = 1; 2023 return len; 2024 } 2025 2026 static struct md_sysfs_entry md_metadata = 2027 __ATTR(metadata_version, 0644, metadata_show, metadata_store); 2028 2029 static ssize_t 2030 action_show(mddev_t *mddev, char *page) 2031 { 2032 char *type = "idle"; 2033 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2034 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2035 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2036 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2037 type = "resync"; 2038 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2039 type = "check"; 2040 else 2041 type = "repair"; 2042 } else 2043 type = "recover"; 2044 } 2045 return sprintf(page, "%s\n", type); 
2046 } 2047 2048 static ssize_t 2049 action_store(mddev_t *mddev, const char *page, size_t len) 2050 { 2051 if (!mddev->pers || !mddev->pers->sync_request) 2052 return -EINVAL; 2053 2054 if (cmd_match(page, "idle")) { 2055 if (mddev->sync_thread) { 2056 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2057 md_unregister_thread(mddev->sync_thread); 2058 mddev->sync_thread = NULL; 2059 mddev->recovery = 0; 2060 } 2061 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2062 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2063 return -EBUSY; 2064 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2065 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2066 else { 2067 if (cmd_match(page, "check")) 2068 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2069 else if (cmd_match(page, "repair")) 2070 return -EINVAL; 2071 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2072 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2073 } 2074 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2075 md_wakeup_thread(mddev->thread); 2076 return len; 2077 } 2078 2079 static ssize_t 2080 mismatch_cnt_show(mddev_t *mddev, char *page) 2081 { 2082 return sprintf(page, "%llu\n", 2083 (unsigned long long) mddev->resync_mismatches); 2084 } 2085 2086 static struct md_sysfs_entry 2087 md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2088 2089 2090 static struct md_sysfs_entry 2091 md_mismatches = __ATTR_RO(mismatch_cnt); 2092 2093 static struct attribute *md_default_attrs[] = { 2094 &md_level.attr, 2095 &md_raid_disks.attr, 2096 &md_chunk_size.attr, 2097 &md_size.attr, 2098 &md_metadata.attr, 2099 NULL, 2100 }; 2101 2102 static struct attribute *md_redundancy_attrs[] = { 2103 &md_scan_mode.attr, 2104 &md_mismatches.attr, 2105 NULL, 2106 }; 2107 static struct attribute_group md_redundancy_group = { 2108 .name = NULL, 2109 .attrs = md_redundancy_attrs, 2110 }; 2111 2112 2113 static ssize_t 2114 md_attr_show(struct kobject *kobj, struct attribute *attr, char 
*page) 2115 { 2116 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2117 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2118 ssize_t rv; 2119 2120 if (!entry->show) 2121 return -EIO; 2122 mddev_lock(mddev); 2123 rv = entry->show(mddev, page); 2124 mddev_unlock(mddev); 2125 return rv; 2126 } 2127 2128 static ssize_t 2129 md_attr_store(struct kobject *kobj, struct attribute *attr, 2130 const char *page, size_t length) 2131 { 2132 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2133 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2134 ssize_t rv; 2135 2136 if (!entry->store) 2137 return -EIO; 2138 mddev_lock(mddev); 2139 rv = entry->store(mddev, page, length); 2140 mddev_unlock(mddev); 2141 return rv; 2142 } 2143 2144 static void md_free(struct kobject *ko) 2145 { 2146 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2147 kfree(mddev); 2148 } 2149 2150 static struct sysfs_ops md_sysfs_ops = { 2151 .show = md_attr_show, 2152 .store = md_attr_store, 2153 }; 2154 static struct kobj_type md_ktype = { 2155 .release = md_free, 2156 .sysfs_ops = &md_sysfs_ops, 2157 .default_attrs = md_default_attrs, 2158 }; 2159 2160 int mdp_major = 0; 2161 2162 static struct kobject *md_probe(dev_t dev, int *part, void *data) 2163 { 2164 static DECLARE_MUTEX(disks_sem); 2165 mddev_t *mddev = mddev_find(dev); 2166 struct gendisk *disk; 2167 int partitioned = (MAJOR(dev) != MD_MAJOR); 2168 int shift = partitioned ? 
MdpMinorShift : 0; 2169 int unit = MINOR(dev) >> shift; 2170 2171 if (!mddev) 2172 return NULL; 2173 2174 down(&disks_sem); 2175 if (mddev->gendisk) { 2176 up(&disks_sem); 2177 mddev_put(mddev); 2178 return NULL; 2179 } 2180 disk = alloc_disk(1 << shift); 2181 if (!disk) { 2182 up(&disks_sem); 2183 mddev_put(mddev); 2184 return NULL; 2185 } 2186 disk->major = MAJOR(dev); 2187 disk->first_minor = unit << shift; 2188 if (partitioned) { 2189 sprintf(disk->disk_name, "md_d%d", unit); 2190 sprintf(disk->devfs_name, "md/d%d", unit); 2191 } else { 2192 sprintf(disk->disk_name, "md%d", unit); 2193 sprintf(disk->devfs_name, "md/%d", unit); 2194 } 2195 disk->fops = &md_fops; 2196 disk->private_data = mddev; 2197 disk->queue = mddev->queue; 2198 add_disk(disk); 2199 mddev->gendisk = disk; 2200 up(&disks_sem); 2201 mddev->kobj.parent = &disk->kobj; 2202 mddev->kobj.k_name = NULL; 2203 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2204 mddev->kobj.ktype = &md_ktype; 2205 kobject_register(&mddev->kobj); 2206 return NULL; 2207 } 2208 2209 void md_wakeup_thread(mdk_thread_t *thread); 2210 2211 static void md_safemode_timeout(unsigned long data) 2212 { 2213 mddev_t *mddev = (mddev_t *) data; 2214 2215 mddev->safemode = 1; 2216 md_wakeup_thread(mddev->thread); 2217 } 2218 2219 static int start_dirty_degraded; 2220 2221 static int do_md_run(mddev_t * mddev) 2222 { 2223 int err; 2224 int chunk_size; 2225 struct list_head *tmp; 2226 mdk_rdev_t *rdev; 2227 struct gendisk *disk; 2228 struct mdk_personality *pers; 2229 char b[BDEVNAME_SIZE]; 2230 2231 if (list_empty(&mddev->disks)) 2232 /* cannot run an array with no devices.. 
*/ 2233 return -EINVAL; 2234 2235 if (mddev->pers) 2236 return -EBUSY; 2237 2238 /* 2239 * Analyze all RAID superblock(s) 2240 */ 2241 if (!mddev->raid_disks) 2242 analyze_sbs(mddev); 2243 2244 chunk_size = mddev->chunk_size; 2245 2246 if (chunk_size) { 2247 if (chunk_size > MAX_CHUNK_SIZE) { 2248 printk(KERN_ERR "too big chunk_size: %d > %d\n", 2249 chunk_size, MAX_CHUNK_SIZE); 2250 return -EINVAL; 2251 } 2252 /* 2253 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 2254 */ 2255 if ( (1 << ffz(~chunk_size)) != chunk_size) { 2256 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 2257 return -EINVAL; 2258 } 2259 if (chunk_size < PAGE_SIZE) { 2260 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 2261 chunk_size, PAGE_SIZE); 2262 return -EINVAL; 2263 } 2264 2265 /* devices must have minimum size of one chunk */ 2266 ITERATE_RDEV(mddev,rdev,tmp) { 2267 if (test_bit(Faulty, &rdev->flags)) 2268 continue; 2269 if (rdev->size < chunk_size / 1024) { 2270 printk(KERN_WARNING 2271 "md: Dev %s smaller than chunk_size:" 2272 " %lluk < %dk\n", 2273 bdevname(rdev->bdev,b), 2274 (unsigned long long)rdev->size, 2275 chunk_size / 1024); 2276 return -EINVAL; 2277 } 2278 } 2279 } 2280 2281 #ifdef CONFIG_KMOD 2282 if (mddev->level != LEVEL_NONE) 2283 request_module("md-level-%d", mddev->level); 2284 else if (mddev->clevel[0]) 2285 request_module("md-%s", mddev->clevel); 2286 #endif 2287 2288 /* 2289 * Drop all container device buffers, from now on 2290 * the only valid external interface is through the md 2291 * device. 
2292 * Also find largest hardsector size 2293 */ 2294 ITERATE_RDEV(mddev,rdev,tmp) { 2295 if (test_bit(Faulty, &rdev->flags)) 2296 continue; 2297 sync_blockdev(rdev->bdev); 2298 invalidate_bdev(rdev->bdev, 0); 2299 } 2300 2301 md_probe(mddev->unit, NULL, NULL); 2302 disk = mddev->gendisk; 2303 if (!disk) 2304 return -ENOMEM; 2305 2306 spin_lock(&pers_lock); 2307 pers = find_pers(mddev->level, mddev->clevel); 2308 if (!pers || !try_module_get(pers->owner)) { 2309 spin_unlock(&pers_lock); 2310 if (mddev->level != LEVEL_NONE) 2311 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 2312 mddev->level); 2313 else 2314 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 2315 mddev->clevel); 2316 return -EINVAL; 2317 } 2318 mddev->pers = pers; 2319 spin_unlock(&pers_lock); 2320 mddev->level = pers->level; 2321 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2322 2323 mddev->recovery = 0; 2324 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2325 mddev->barriers_work = 1; 2326 mddev->ok_start_degraded = start_dirty_degraded; 2327 2328 if (start_readonly) 2329 mddev->ro = 2; /* read-only, but switch on first write */ 2330 2331 err = mddev->pers->run(mddev); 2332 if (!err && mddev->pers->sync_request) { 2333 err = bitmap_create(mddev); 2334 if (err) { 2335 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 2336 mdname(mddev), err); 2337 mddev->pers->stop(mddev); 2338 } 2339 } 2340 if (err) { 2341 printk(KERN_ERR "md: pers->run() failed ...\n"); 2342 module_put(mddev->pers->owner); 2343 mddev->pers = NULL; 2344 bitmap_destroy(mddev); 2345 return err; 2346 } 2347 if (mddev->pers->sync_request) 2348 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 2349 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 2350 mddev->ro = 0; 2351 2352 atomic_set(&mddev->writes_pending,0); 2353 mddev->safemode = 0; 2354 mddev->safemode_timer.function = md_safemode_timeout; 2355 
mddev->safemode_timer.data = (unsigned long) mddev; 2356 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 2357 mddev->in_sync = 1; 2358 2359 ITERATE_RDEV(mddev,rdev,tmp) 2360 if (rdev->raid_disk >= 0) { 2361 char nm[20]; 2362 sprintf(nm, "rd%d", rdev->raid_disk); 2363 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2364 } 2365 2366 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2367 md_wakeup_thread(mddev->thread); 2368 2369 if (mddev->sb_dirty) 2370 md_update_sb(mddev); 2371 2372 set_capacity(disk, mddev->array_size<<1); 2373 2374 /* If we call blk_queue_make_request here, it will 2375 * re-initialise max_sectors etc which may have been 2376 * refined inside -> run. So just set the bits we need to set. 2377 * Most initialisation happended when we called 2378 * blk_queue_make_request(..., md_fail_request) 2379 * earlier. 2380 */ 2381 mddev->queue->queuedata = mddev; 2382 mddev->queue->make_request_fn = mddev->pers->make_request; 2383 2384 mddev->changed = 1; 2385 md_new_event(mddev); 2386 return 0; 2387 } 2388 2389 static int restart_array(mddev_t *mddev) 2390 { 2391 struct gendisk *disk = mddev->gendisk; 2392 int err; 2393 2394 /* 2395 * Complain if it has no devices 2396 */ 2397 err = -ENXIO; 2398 if (list_empty(&mddev->disks)) 2399 goto out; 2400 2401 if (mddev->pers) { 2402 err = -EBUSY; 2403 if (!mddev->ro) 2404 goto out; 2405 2406 mddev->safemode = 0; 2407 mddev->ro = 0; 2408 set_disk_ro(disk, 0); 2409 2410 printk(KERN_INFO "md: %s switched to read-write mode.\n", 2411 mdname(mddev)); 2412 /* 2413 * Kick recovery or resync if necessary 2414 */ 2415 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2416 md_wakeup_thread(mddev->thread); 2417 err = 0; 2418 } else { 2419 printk(KERN_ERR "md: %s has no personality assigned.\n", 2420 mdname(mddev)); 2421 err = -EINVAL; 2422 } 2423 2424 out: 2425 return err; 2426 } 2427 2428 static int do_md_stop(mddev_t * mddev, int ro) 2429 { 2430 int err = 0; 2431 struct gendisk *disk = mddev->gendisk; 2432 2433 if 
(mddev->pers) { 2434 if (atomic_read(&mddev->active)>2) { 2435 printk("md: %s still in use.\n",mdname(mddev)); 2436 return -EBUSY; 2437 } 2438 2439 if (mddev->sync_thread) { 2440 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2441 md_unregister_thread(mddev->sync_thread); 2442 mddev->sync_thread = NULL; 2443 } 2444 2445 del_timer_sync(&mddev->safemode_timer); 2446 2447 invalidate_partition(disk, 0); 2448 2449 if (ro) { 2450 err = -ENXIO; 2451 if (mddev->ro==1) 2452 goto out; 2453 mddev->ro = 1; 2454 } else { 2455 bitmap_flush(mddev); 2456 md_super_wait(mddev); 2457 if (mddev->ro) 2458 set_disk_ro(disk, 0); 2459 blk_queue_make_request(mddev->queue, md_fail_request); 2460 mddev->pers->stop(mddev); 2461 if (mddev->pers->sync_request) 2462 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 2463 2464 module_put(mddev->pers->owner); 2465 mddev->pers = NULL; 2466 if (mddev->ro) 2467 mddev->ro = 0; 2468 } 2469 if (!mddev->in_sync) { 2470 /* mark array as shutdown cleanly */ 2471 mddev->in_sync = 1; 2472 md_update_sb(mddev); 2473 } 2474 if (ro) 2475 set_disk_ro(disk, 1); 2476 } 2477 2478 bitmap_destroy(mddev); 2479 if (mddev->bitmap_file) { 2480 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2481 fput(mddev->bitmap_file); 2482 mddev->bitmap_file = NULL; 2483 } 2484 mddev->bitmap_offset = 0; 2485 2486 /* 2487 * Free resources if final stop 2488 */ 2489 if (!ro) { 2490 mdk_rdev_t *rdev; 2491 struct list_head *tmp; 2492 struct gendisk *disk; 2493 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2494 2495 ITERATE_RDEV(mddev,rdev,tmp) 2496 if (rdev->raid_disk >= 0) { 2497 char nm[20]; 2498 sprintf(nm, "rd%d", rdev->raid_disk); 2499 sysfs_remove_link(&mddev->kobj, nm); 2500 } 2501 2502 export_array(mddev); 2503 2504 mddev->array_size = 0; 2505 disk = mddev->gendisk; 2506 if (disk) 2507 set_capacity(disk, 0); 2508 mddev->changed = 1; 2509 } else 2510 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2511 mdname(mddev)); 2512 err = 0; 2513 
md_new_event(mddev); 2514 out: 2515 return err; 2516 } 2517 2518 static void autorun_array(mddev_t *mddev) 2519 { 2520 mdk_rdev_t *rdev; 2521 struct list_head *tmp; 2522 int err; 2523 2524 if (list_empty(&mddev->disks)) 2525 return; 2526 2527 printk(KERN_INFO "md: running: "); 2528 2529 ITERATE_RDEV(mddev,rdev,tmp) { 2530 char b[BDEVNAME_SIZE]; 2531 printk("<%s>", bdevname(rdev->bdev,b)); 2532 } 2533 printk("\n"); 2534 2535 err = do_md_run (mddev); 2536 if (err) { 2537 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2538 do_md_stop (mddev, 0); 2539 } 2540 } 2541 2542 /* 2543 * lets try to run arrays based on all disks that have arrived 2544 * until now. (those are in pending_raid_disks) 2545 * 2546 * the method: pick the first pending disk, collect all disks with 2547 * the same UUID, remove all from the pending list and put them into 2548 * the 'same_array' list. Then order this list based on superblock 2549 * update time (freshest comes first), kick out 'old' disks and 2550 * compare superblocks. If everything's fine then run it. 2551 * 2552 * If "unit" is allocated, then bump its reference count 2553 */ 2554 static void autorun_devices(int part) 2555 { 2556 struct list_head candidates; 2557 struct list_head *tmp; 2558 mdk_rdev_t *rdev0, *rdev; 2559 mddev_t *mddev; 2560 char b[BDEVNAME_SIZE]; 2561 2562 printk(KERN_INFO "md: autorun ...\n"); 2563 while (!list_empty(&pending_raid_disks)) { 2564 dev_t dev; 2565 rdev0 = list_entry(pending_raid_disks.next, 2566 mdk_rdev_t, same_set); 2567 2568 printk(KERN_INFO "md: considering %s ...\n", 2569 bdevname(rdev0->bdev,b)); 2570 INIT_LIST_HEAD(&candidates); 2571 ITERATE_RDEV_PENDING(rdev,tmp) 2572 if (super_90_load(rdev, rdev0, 0) >= 0) { 2573 printk(KERN_INFO "md: adding %s ...\n", 2574 bdevname(rdev->bdev,b)); 2575 list_move(&rdev->same_set, &candidates); 2576 } 2577 /* 2578 * now we have a set of devices, with all of them having 2579 * mostly sane superblocks. It's time to allocate the 2580 * mddev. 
2581 */ 2582 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2583 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2584 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2585 break; 2586 } 2587 if (part) 2588 dev = MKDEV(mdp_major, 2589 rdev0->preferred_minor << MdpMinorShift); 2590 else 2591 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2592 2593 md_probe(dev, NULL, NULL); 2594 mddev = mddev_find(dev); 2595 if (!mddev) { 2596 printk(KERN_ERR 2597 "md: cannot allocate memory for md drive.\n"); 2598 break; 2599 } 2600 if (mddev_lock(mddev)) 2601 printk(KERN_WARNING "md: %s locked, cannot run\n", 2602 mdname(mddev)); 2603 else if (mddev->raid_disks || mddev->major_version 2604 || !list_empty(&mddev->disks)) { 2605 printk(KERN_WARNING 2606 "md: %s already running, cannot run %s\n", 2607 mdname(mddev), bdevname(rdev0->bdev,b)); 2608 mddev_unlock(mddev); 2609 } else { 2610 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2611 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2612 list_del_init(&rdev->same_set); 2613 if (bind_rdev_to_array(rdev, mddev)) 2614 export_rdev(rdev); 2615 } 2616 autorun_array(mddev); 2617 mddev_unlock(mddev); 2618 } 2619 /* on success, candidates will be empty, on error 2620 * it won't... 2621 */ 2622 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2623 export_rdev(rdev); 2624 mddev_put(mddev); 2625 } 2626 printk(KERN_INFO "md: ... autorun DONE.\n"); 2627 } 2628 2629 /* 2630 * import RAID devices based on one partition 2631 * if possible, the array gets run as well. 
2632 */ 2633 2634 static int autostart_array(dev_t startdev) 2635 { 2636 char b[BDEVNAME_SIZE]; 2637 int err = -EINVAL, i; 2638 mdp_super_t *sb = NULL; 2639 mdk_rdev_t *start_rdev = NULL, *rdev; 2640 2641 start_rdev = md_import_device(startdev, 0, 0); 2642 if (IS_ERR(start_rdev)) 2643 return err; 2644 2645 2646 /* NOTE: this can only work for 0.90.0 superblocks */ 2647 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2648 if (sb->major_version != 0 || 2649 sb->minor_version != 90 ) { 2650 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2651 export_rdev(start_rdev); 2652 return err; 2653 } 2654 2655 if (test_bit(Faulty, &start_rdev->flags)) { 2656 printk(KERN_WARNING 2657 "md: can not autostart based on faulty %s!\n", 2658 bdevname(start_rdev->bdev,b)); 2659 export_rdev(start_rdev); 2660 return err; 2661 } 2662 list_add(&start_rdev->same_set, &pending_raid_disks); 2663 2664 for (i = 0; i < MD_SB_DISKS; i++) { 2665 mdp_disk_t *desc = sb->disks + i; 2666 dev_t dev = MKDEV(desc->major, desc->minor); 2667 2668 if (!dev) 2669 continue; 2670 if (dev == startdev) 2671 continue; 2672 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2673 continue; 2674 rdev = md_import_device(dev, 0, 0); 2675 if (IS_ERR(rdev)) 2676 continue; 2677 2678 list_add(&rdev->same_set, &pending_raid_disks); 2679 } 2680 2681 /* 2682 * possibly return codes 2683 */ 2684 autorun_devices(0); 2685 return 0; 2686 2687 } 2688 2689 2690 static int get_version(void __user * arg) 2691 { 2692 mdu_version_t ver; 2693 2694 ver.major = MD_MAJOR_VERSION; 2695 ver.minor = MD_MINOR_VERSION; 2696 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2697 2698 if (copy_to_user(arg, &ver, sizeof(ver))) 2699 return -EFAULT; 2700 2701 return 0; 2702 } 2703 2704 static int get_array_info(mddev_t * mddev, void __user * arg) 2705 { 2706 mdu_array_info_t info; 2707 int nr,working,active,failed,spare; 2708 mdk_rdev_t *rdev; 2709 struct list_head *tmp; 2710 2711 nr=working=active=failed=spare=0; 2712 
ITERATE_RDEV(mddev,rdev,tmp) { 2713 nr++; 2714 if (test_bit(Faulty, &rdev->flags)) 2715 failed++; 2716 else { 2717 working++; 2718 if (test_bit(In_sync, &rdev->flags)) 2719 active++; 2720 else 2721 spare++; 2722 } 2723 } 2724 2725 info.major_version = mddev->major_version; 2726 info.minor_version = mddev->minor_version; 2727 info.patch_version = MD_PATCHLEVEL_VERSION; 2728 info.ctime = mddev->ctime; 2729 info.level = mddev->level; 2730 info.size = mddev->size; 2731 info.nr_disks = nr; 2732 info.raid_disks = mddev->raid_disks; 2733 info.md_minor = mddev->md_minor; 2734 info.not_persistent= !mddev->persistent; 2735 2736 info.utime = mddev->utime; 2737 info.state = 0; 2738 if (mddev->in_sync) 2739 info.state = (1<<MD_SB_CLEAN); 2740 if (mddev->bitmap && mddev->bitmap_offset) 2741 info.state = (1<<MD_SB_BITMAP_PRESENT); 2742 info.active_disks = active; 2743 info.working_disks = working; 2744 info.failed_disks = failed; 2745 info.spare_disks = spare; 2746 2747 info.layout = mddev->layout; 2748 info.chunk_size = mddev->chunk_size; 2749 2750 if (copy_to_user(arg, &info, sizeof(info))) 2751 return -EFAULT; 2752 2753 return 0; 2754 } 2755 2756 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2757 { 2758 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2759 char *ptr, *buf = NULL; 2760 int err = -ENOMEM; 2761 2762 file = kmalloc(sizeof(*file), GFP_KERNEL); 2763 if (!file) 2764 goto out; 2765 2766 /* bitmap disabled, zero the first byte and copy out */ 2767 if (!mddev->bitmap || !mddev->bitmap->file) { 2768 file->pathname[0] = '\0'; 2769 goto copy_out; 2770 } 2771 2772 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2773 if (!buf) 2774 goto out; 2775 2776 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2777 if (!ptr) 2778 goto out; 2779 2780 strcpy(file->pathname, ptr); 2781 2782 copy_out: 2783 err = 0; 2784 if (copy_to_user(arg, file, sizeof(*file))) 2785 err = -EFAULT; 2786 out: 2787 kfree(buf); 2788 kfree(file); 2789 
return err; 2790 } 2791 2792 static int get_disk_info(mddev_t * mddev, void __user * arg) 2793 { 2794 mdu_disk_info_t info; 2795 unsigned int nr; 2796 mdk_rdev_t *rdev; 2797 2798 if (copy_from_user(&info, arg, sizeof(info))) 2799 return -EFAULT; 2800 2801 nr = info.number; 2802 2803 rdev = find_rdev_nr(mddev, nr); 2804 if (rdev) { 2805 info.major = MAJOR(rdev->bdev->bd_dev); 2806 info.minor = MINOR(rdev->bdev->bd_dev); 2807 info.raid_disk = rdev->raid_disk; 2808 info.state = 0; 2809 if (test_bit(Faulty, &rdev->flags)) 2810 info.state |= (1<<MD_DISK_FAULTY); 2811 else if (test_bit(In_sync, &rdev->flags)) { 2812 info.state |= (1<<MD_DISK_ACTIVE); 2813 info.state |= (1<<MD_DISK_SYNC); 2814 } 2815 if (test_bit(WriteMostly, &rdev->flags)) 2816 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2817 } else { 2818 info.major = info.minor = 0; 2819 info.raid_disk = -1; 2820 info.state = (1<<MD_DISK_REMOVED); 2821 } 2822 2823 if (copy_to_user(arg, &info, sizeof(info))) 2824 return -EFAULT; 2825 2826 return 0; 2827 } 2828 2829 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2830 { 2831 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2832 mdk_rdev_t *rdev; 2833 dev_t dev = MKDEV(info->major,info->minor); 2834 2835 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2836 return -EOVERFLOW; 2837 2838 if (!mddev->raid_disks) { 2839 int err; 2840 /* expecting a device which has a superblock */ 2841 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2842 if (IS_ERR(rdev)) { 2843 printk(KERN_WARNING 2844 "md: md_import_device returned %ld\n", 2845 PTR_ERR(rdev)); 2846 return PTR_ERR(rdev); 2847 } 2848 if (!list_empty(&mddev->disks)) { 2849 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2850 mdk_rdev_t, same_set); 2851 int err = super_types[mddev->major_version] 2852 .load_super(rdev, rdev0, mddev->minor_version); 2853 if (err < 0) { 2854 printk(KERN_WARNING 2855 "md: %s has different UUID to %s\n", 2856 bdevname(rdev->bdev,b), 2857 
bdevname(rdev0->bdev,b2)); 2858 export_rdev(rdev); 2859 return -EINVAL; 2860 } 2861 } 2862 err = bind_rdev_to_array(rdev, mddev); 2863 if (err) 2864 export_rdev(rdev); 2865 return err; 2866 } 2867 2868 /* 2869 * add_new_disk can be used once the array is assembled 2870 * to add "hot spares". They must already have a superblock 2871 * written 2872 */ 2873 if (mddev->pers) { 2874 int err; 2875 if (!mddev->pers->hot_add_disk) { 2876 printk(KERN_WARNING 2877 "%s: personality does not support diskops!\n", 2878 mdname(mddev)); 2879 return -EINVAL; 2880 } 2881 if (mddev->persistent) 2882 rdev = md_import_device(dev, mddev->major_version, 2883 mddev->minor_version); 2884 else 2885 rdev = md_import_device(dev, -1, -1); 2886 if (IS_ERR(rdev)) { 2887 printk(KERN_WARNING 2888 "md: md_import_device returned %ld\n", 2889 PTR_ERR(rdev)); 2890 return PTR_ERR(rdev); 2891 } 2892 /* set save_raid_disk if appropriate */ 2893 if (!mddev->persistent) { 2894 if (info->state & (1<<MD_DISK_SYNC) && 2895 info->raid_disk < mddev->raid_disks) 2896 rdev->raid_disk = info->raid_disk; 2897 else 2898 rdev->raid_disk = -1; 2899 } else 2900 super_types[mddev->major_version]. 
2901 validate_super(mddev, rdev); 2902 rdev->saved_raid_disk = rdev->raid_disk; 2903 2904 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 2905 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2906 set_bit(WriteMostly, &rdev->flags); 2907 2908 rdev->raid_disk = -1; 2909 err = bind_rdev_to_array(rdev, mddev); 2910 if (err) 2911 export_rdev(rdev); 2912 2913 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2914 md_wakeup_thread(mddev->thread); 2915 return err; 2916 } 2917 2918 /* otherwise, add_new_disk is only allowed 2919 * for major_version==0 superblocks 2920 */ 2921 if (mddev->major_version != 0) { 2922 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2923 mdname(mddev)); 2924 return -EINVAL; 2925 } 2926 2927 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2928 int err; 2929 rdev = md_import_device (dev, -1, 0); 2930 if (IS_ERR(rdev)) { 2931 printk(KERN_WARNING 2932 "md: error, md_import_device() returned %ld\n", 2933 PTR_ERR(rdev)); 2934 return PTR_ERR(rdev); 2935 } 2936 rdev->desc_nr = info->number; 2937 if (info->raid_disk < mddev->raid_disks) 2938 rdev->raid_disk = info->raid_disk; 2939 else 2940 rdev->raid_disk = -1; 2941 2942 rdev->flags = 0; 2943 2944 if (rdev->raid_disk < mddev->raid_disks) 2945 if (info->state & (1<<MD_DISK_SYNC)) 2946 set_bit(In_sync, &rdev->flags); 2947 2948 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2949 set_bit(WriteMostly, &rdev->flags); 2950 2951 if (!mddev->persistent) { 2952 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2953 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2954 } else 2955 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2956 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2957 2958 err = bind_rdev_to_array(rdev, mddev); 2959 if (err) { 2960 export_rdev(rdev); 2961 return err; 2962 } 2963 } 2964 2965 return 0; 2966 } 2967 2968 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2969 { 2970 char b[BDEVNAME_SIZE]; 2971 mdk_rdev_t *rdev; 2972 2973 if (!mddev->pers) 2974 return 
-ENODEV; 2975 2976 rdev = find_rdev(mddev, dev); 2977 if (!rdev) 2978 return -ENXIO; 2979 2980 if (rdev->raid_disk >= 0) 2981 goto busy; 2982 2983 kick_rdev_from_array(rdev); 2984 md_update_sb(mddev); 2985 md_new_event(mddev); 2986 2987 return 0; 2988 busy: 2989 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 2990 bdevname(rdev->bdev,b), mdname(mddev)); 2991 return -EBUSY; 2992 } 2993 2994 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2995 { 2996 char b[BDEVNAME_SIZE]; 2997 int err; 2998 unsigned int size; 2999 mdk_rdev_t *rdev; 3000 3001 if (!mddev->pers) 3002 return -ENODEV; 3003 3004 if (mddev->major_version != 0) { 3005 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3006 " version-0 superblocks.\n", 3007 mdname(mddev)); 3008 return -EINVAL; 3009 } 3010 if (!mddev->pers->hot_add_disk) { 3011 printk(KERN_WARNING 3012 "%s: personality does not support diskops!\n", 3013 mdname(mddev)); 3014 return -EINVAL; 3015 } 3016 3017 rdev = md_import_device (dev, -1, 0); 3018 if (IS_ERR(rdev)) { 3019 printk(KERN_WARNING 3020 "md: error, md_import_device() returned %ld\n", 3021 PTR_ERR(rdev)); 3022 return -EINVAL; 3023 } 3024 3025 if (mddev->persistent) 3026 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3027 else 3028 rdev->sb_offset = 3029 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3030 3031 size = calc_dev_size(rdev, mddev->chunk_size); 3032 rdev->size = size; 3033 3034 if (test_bit(Faulty, &rdev->flags)) { 3035 printk(KERN_WARNING 3036 "md: can not hot-add faulty %s disk to %s!\n", 3037 bdevname(rdev->bdev,b), mdname(mddev)); 3038 err = -EINVAL; 3039 goto abort_export; 3040 } 3041 clear_bit(In_sync, &rdev->flags); 3042 rdev->desc_nr = -1; 3043 err = bind_rdev_to_array(rdev, mddev); 3044 if (err) 3045 goto abort_export; 3046 3047 /* 3048 * The rest should better be atomic, we can have disk failures 3049 * noticed in interrupt contexts ... 
3050 */ 3051 3052 if (rdev->desc_nr == mddev->max_disks) { 3053 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3054 mdname(mddev)); 3055 err = -EBUSY; 3056 goto abort_unbind_export; 3057 } 3058 3059 rdev->raid_disk = -1; 3060 3061 md_update_sb(mddev); 3062 3063 /* 3064 * Kick recovery, maybe this spare has to be added to the 3065 * array immediately. 3066 */ 3067 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3068 md_wakeup_thread(mddev->thread); 3069 md_new_event(mddev); 3070 return 0; 3071 3072 abort_unbind_export: 3073 unbind_rdev_from_array(rdev); 3074 3075 abort_export: 3076 export_rdev(rdev); 3077 return err; 3078 } 3079 3080 /* similar to deny_write_access, but accounts for our holding a reference 3081 * to the file ourselves */ 3082 static int deny_bitmap_write_access(struct file * file) 3083 { 3084 struct inode *inode = file->f_mapping->host; 3085 3086 spin_lock(&inode->i_lock); 3087 if (atomic_read(&inode->i_writecount) > 1) { 3088 spin_unlock(&inode->i_lock); 3089 return -ETXTBSY; 3090 } 3091 atomic_set(&inode->i_writecount, -1); 3092 spin_unlock(&inode->i_lock); 3093 3094 return 0; 3095 } 3096 3097 static int set_bitmap_file(mddev_t *mddev, int fd) 3098 { 3099 int err; 3100 3101 if (mddev->pers) { 3102 if (!mddev->pers->quiesce) 3103 return -EBUSY; 3104 if (mddev->recovery || mddev->sync_thread) 3105 return -EBUSY; 3106 /* we should be able to change the bitmap.. 
*/ 3107 } 3108 3109 3110 if (fd >= 0) { 3111 if (mddev->bitmap) 3112 return -EEXIST; /* cannot add when bitmap is present */ 3113 mddev->bitmap_file = fget(fd); 3114 3115 if (mddev->bitmap_file == NULL) { 3116 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3117 mdname(mddev)); 3118 return -EBADF; 3119 } 3120 3121 err = deny_bitmap_write_access(mddev->bitmap_file); 3122 if (err) { 3123 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3124 mdname(mddev)); 3125 fput(mddev->bitmap_file); 3126 mddev->bitmap_file = NULL; 3127 return err; 3128 } 3129 mddev->bitmap_offset = 0; /* file overrides offset */ 3130 } else if (mddev->bitmap == NULL) 3131 return -ENOENT; /* cannot remove what isn't there */ 3132 err = 0; 3133 if (mddev->pers) { 3134 mddev->pers->quiesce(mddev, 1); 3135 if (fd >= 0) 3136 err = bitmap_create(mddev); 3137 if (fd < 0 || err) 3138 bitmap_destroy(mddev); 3139 mddev->pers->quiesce(mddev, 0); 3140 } else if (fd < 0) { 3141 if (mddev->bitmap_file) 3142 fput(mddev->bitmap_file); 3143 mddev->bitmap_file = NULL; 3144 } 3145 3146 return err; 3147 } 3148 3149 /* 3150 * set_array_info is used two different ways 3151 * The original usage is when creating a new array. 3152 * In this usage, raid_disks is > 0 and it together with 3153 * level, size, not_persistent,layout,chunksize determine the 3154 * shape of the array. 3155 * This will always create an array with a type-0.90.0 superblock. 3156 * The newer usage is when assembling an array. 3157 * In this case raid_disks will be 0, and the major_version field is 3158 * use to determine which style super-blocks are to be found on the devices. 3159 * The minor and patch _version numbers are also kept incase the 3160 * super_block handler wishes to interpret them. 
3161 */ 3162 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3163 { 3164 3165 if (info->raid_disks == 0) { 3166 /* just setting version number for superblock loading */ 3167 if (info->major_version < 0 || 3168 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3169 super_types[info->major_version].name == NULL) { 3170 /* maybe try to auto-load a module? */ 3171 printk(KERN_INFO 3172 "md: superblock version %d not known\n", 3173 info->major_version); 3174 return -EINVAL; 3175 } 3176 mddev->major_version = info->major_version; 3177 mddev->minor_version = info->minor_version; 3178 mddev->patch_version = info->patch_version; 3179 return 0; 3180 } 3181 mddev->major_version = MD_MAJOR_VERSION; 3182 mddev->minor_version = MD_MINOR_VERSION; 3183 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3184 mddev->ctime = get_seconds(); 3185 3186 mddev->level = info->level; 3187 mddev->size = info->size; 3188 mddev->raid_disks = info->raid_disks; 3189 /* don't set md_minor, it is determined by which /dev/md* was 3190 * openned 3191 */ 3192 if (info->state & (1<<MD_SB_CLEAN)) 3193 mddev->recovery_cp = MaxSector; 3194 else 3195 mddev->recovery_cp = 0; 3196 mddev->persistent = ! info->not_persistent; 3197 3198 mddev->layout = info->layout; 3199 mddev->chunk_size = info->chunk_size; 3200 3201 mddev->max_disks = MD_SB_DISKS; 3202 3203 mddev->sb_dirty = 1; 3204 3205 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 3206 mddev->bitmap_offset = 0; 3207 3208 /* 3209 * Generate a 128 bit UUID 3210 */ 3211 get_random_bytes(mddev->uuid, 16); 3212 3213 return 0; 3214 } 3215 3216 static int update_size(mddev_t *mddev, unsigned long size) 3217 { 3218 mdk_rdev_t * rdev; 3219 int rv; 3220 struct list_head *tmp; 3221 3222 if (mddev->pers->resize == NULL) 3223 return -EINVAL; 3224 /* The "size" is the amount of each device that is used. 3225 * This can only make sense for arrays with redundancy. 
3226 * linear and raid0 always use whatever space is available 3227 * We can only consider changing the size if no resync 3228 * or reconstruction is happening, and if the new size 3229 * is acceptable. It must fit before the sb_offset or, 3230 * if that is <data_offset, it must fit before the 3231 * size of each device. 3232 * If size is zero, we find the largest size that fits. 3233 */ 3234 if (mddev->sync_thread) 3235 return -EBUSY; 3236 ITERATE_RDEV(mddev,rdev,tmp) { 3237 sector_t avail; 3238 int fit = (size == 0); 3239 if (rdev->sb_offset > rdev->data_offset) 3240 avail = (rdev->sb_offset*2) - rdev->data_offset; 3241 else 3242 avail = get_capacity(rdev->bdev->bd_disk) 3243 - rdev->data_offset; 3244 if (fit && (size == 0 || size > avail/2)) 3245 size = avail/2; 3246 if (avail < ((sector_t)size << 1)) 3247 return -ENOSPC; 3248 } 3249 rv = mddev->pers->resize(mddev, (sector_t)size *2); 3250 if (!rv) { 3251 struct block_device *bdev; 3252 3253 bdev = bdget_disk(mddev->gendisk, 0); 3254 if (bdev) { 3255 down(&bdev->bd_inode->i_sem); 3256 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3257 up(&bdev->bd_inode->i_sem); 3258 bdput(bdev); 3259 } 3260 } 3261 return rv; 3262 } 3263 3264 static int update_raid_disks(mddev_t *mddev, int raid_disks) 3265 { 3266 int rv; 3267 /* change the number of raid disks */ 3268 if (mddev->pers->reshape == NULL) 3269 return -EINVAL; 3270 if (raid_disks <= 0 || 3271 raid_disks >= mddev->max_disks) 3272 return -EINVAL; 3273 if (mddev->sync_thread) 3274 return -EBUSY; 3275 rv = mddev->pers->reshape(mddev, raid_disks); 3276 if (!rv) { 3277 struct block_device *bdev; 3278 3279 bdev = bdget_disk(mddev->gendisk, 0); 3280 if (bdev) { 3281 down(&bdev->bd_inode->i_sem); 3282 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3283 up(&bdev->bd_inode->i_sem); 3284 bdput(bdev); 3285 } 3286 } 3287 return rv; 3288 } 3289 3290 3291 /* 3292 * update_array_info is used to change the configuration of an 3293 * on-line array. 
3294 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 3295 * fields in the info are checked against the array. 3296 * Any differences that cannot be handled will cause an error. 3297 * Normally, only one change can be managed at a time. 3298 */ 3299 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 3300 { 3301 int rv = 0; 3302 int cnt = 0; 3303 int state = 0; 3304 3305 /* calculate expected state,ignoring low bits */ 3306 if (mddev->bitmap && mddev->bitmap_offset) 3307 state |= (1 << MD_SB_BITMAP_PRESENT); 3308 3309 if (mddev->major_version != info->major_version || 3310 mddev->minor_version != info->minor_version || 3311 /* mddev->patch_version != info->patch_version || */ 3312 mddev->ctime != info->ctime || 3313 mddev->level != info->level || 3314 /* mddev->layout != info->layout || */ 3315 !mddev->persistent != info->not_persistent|| 3316 mddev->chunk_size != info->chunk_size || 3317 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 3318 ((state^info->state) & 0xfffffe00) 3319 ) 3320 return -EINVAL; 3321 /* Check there is only one change */ 3322 if (mddev->size != info->size) cnt++; 3323 if (mddev->raid_disks != info->raid_disks) cnt++; 3324 if (mddev->layout != info->layout) cnt++; 3325 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 3326 if (cnt == 0) return 0; 3327 if (cnt > 1) return -EINVAL; 3328 3329 if (mddev->layout != info->layout) { 3330 /* Change layout 3331 * we don't need to do anything at the md level, the 3332 * personality will take care of it all. 
3333 */ 3334 if (mddev->pers->reconfig == NULL) 3335 return -EINVAL; 3336 else 3337 return mddev->pers->reconfig(mddev, info->layout, -1); 3338 } 3339 if (mddev->size != info->size) 3340 rv = update_size(mddev, info->size); 3341 3342 if (mddev->raid_disks != info->raid_disks) 3343 rv = update_raid_disks(mddev, info->raid_disks); 3344 3345 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3346 if (mddev->pers->quiesce == NULL) 3347 return -EINVAL; 3348 if (mddev->recovery || mddev->sync_thread) 3349 return -EBUSY; 3350 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 3351 /* add the bitmap */ 3352 if (mddev->bitmap) 3353 return -EEXIST; 3354 if (mddev->default_bitmap_offset == 0) 3355 return -EINVAL; 3356 mddev->bitmap_offset = mddev->default_bitmap_offset; 3357 mddev->pers->quiesce(mddev, 1); 3358 rv = bitmap_create(mddev); 3359 if (rv) 3360 bitmap_destroy(mddev); 3361 mddev->pers->quiesce(mddev, 0); 3362 } else { 3363 /* remove the bitmap */ 3364 if (!mddev->bitmap) 3365 return -ENOENT; 3366 if (mddev->bitmap->file) 3367 return -EINVAL; 3368 mddev->pers->quiesce(mddev, 1); 3369 bitmap_destroy(mddev); 3370 mddev->pers->quiesce(mddev, 0); 3371 mddev->bitmap_offset = 0; 3372 } 3373 } 3374 md_update_sb(mddev); 3375 return rv; 3376 } 3377 3378 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3379 { 3380 mdk_rdev_t *rdev; 3381 3382 if (mddev->pers == NULL) 3383 return -ENODEV; 3384 3385 rdev = find_rdev(mddev, dev); 3386 if (!rdev) 3387 return -ENODEV; 3388 3389 md_error(mddev, rdev); 3390 return 0; 3391 } 3392 3393 static int md_ioctl(struct inode *inode, struct file *file, 3394 unsigned int cmd, unsigned long arg) 3395 { 3396 int err = 0; 3397 void __user *argp = (void __user *)arg; 3398 struct hd_geometry __user *loc = argp; 3399 mddev_t *mddev = NULL; 3400 3401 if (!capable(CAP_SYS_ADMIN)) 3402 return -EACCES; 3403 3404 /* 3405 * Commands dealing with the RAID driver but not any 3406 * particular array: 3407 */ 3408 switch (cmd) 3409 { 3410 case 
RAID_VERSION: 3411 err = get_version(argp); 3412 goto done; 3413 3414 case PRINT_RAID_DEBUG: 3415 err = 0; 3416 md_print_devices(); 3417 goto done; 3418 3419 #ifndef MODULE 3420 case RAID_AUTORUN: 3421 err = 0; 3422 autostart_arrays(arg); 3423 goto done; 3424 #endif 3425 default:; 3426 } 3427 3428 /* 3429 * Commands creating/starting a new array: 3430 */ 3431 3432 mddev = inode->i_bdev->bd_disk->private_data; 3433 3434 if (!mddev) { 3435 BUG(); 3436 goto abort; 3437 } 3438 3439 3440 if (cmd == START_ARRAY) { 3441 /* START_ARRAY doesn't need to lock the array as autostart_array 3442 * does the locking, and it could even be a different array 3443 */ 3444 static int cnt = 3; 3445 if (cnt > 0 ) { 3446 printk(KERN_WARNING 3447 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3448 "This will not be supported beyond July 2006\n", 3449 current->comm, current->pid); 3450 cnt--; 3451 } 3452 err = autostart_array(new_decode_dev(arg)); 3453 if (err) { 3454 printk(KERN_WARNING "md: autostart failed!\n"); 3455 goto abort; 3456 } 3457 goto done; 3458 } 3459 3460 err = mddev_lock(mddev); 3461 if (err) { 3462 printk(KERN_INFO 3463 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3464 err, cmd); 3465 goto abort; 3466 } 3467 3468 switch (cmd) 3469 { 3470 case SET_ARRAY_INFO: 3471 { 3472 mdu_array_info_t info; 3473 if (!arg) 3474 memset(&info, 0, sizeof(info)); 3475 else if (copy_from_user(&info, argp, sizeof(info))) { 3476 err = -EFAULT; 3477 goto abort_unlock; 3478 } 3479 if (mddev->pers) { 3480 err = update_array_info(mddev, &info); 3481 if (err) { 3482 printk(KERN_WARNING "md: couldn't update" 3483 " array info. 
%d\n", err); 3484 goto abort_unlock; 3485 } 3486 goto done_unlock; 3487 } 3488 if (!list_empty(&mddev->disks)) { 3489 printk(KERN_WARNING 3490 "md: array %s already has disks!\n", 3491 mdname(mddev)); 3492 err = -EBUSY; 3493 goto abort_unlock; 3494 } 3495 if (mddev->raid_disks) { 3496 printk(KERN_WARNING 3497 "md: array %s already initialised!\n", 3498 mdname(mddev)); 3499 err = -EBUSY; 3500 goto abort_unlock; 3501 } 3502 err = set_array_info(mddev, &info); 3503 if (err) { 3504 printk(KERN_WARNING "md: couldn't set" 3505 " array info. %d\n", err); 3506 goto abort_unlock; 3507 } 3508 } 3509 goto done_unlock; 3510 3511 default:; 3512 } 3513 3514 /* 3515 * Commands querying/configuring an existing array: 3516 */ 3517 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3518 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3519 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3520 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3521 err = -ENODEV; 3522 goto abort_unlock; 3523 } 3524 3525 /* 3526 * Commands even a read-only array can execute: 3527 */ 3528 switch (cmd) 3529 { 3530 case GET_ARRAY_INFO: 3531 err = get_array_info(mddev, argp); 3532 goto done_unlock; 3533 3534 case GET_BITMAP_FILE: 3535 err = get_bitmap_file(mddev, argp); 3536 goto done_unlock; 3537 3538 case GET_DISK_INFO: 3539 err = get_disk_info(mddev, argp); 3540 goto done_unlock; 3541 3542 case RESTART_ARRAY_RW: 3543 err = restart_array(mddev); 3544 goto done_unlock; 3545 3546 case STOP_ARRAY: 3547 err = do_md_stop (mddev, 0); 3548 goto done_unlock; 3549 3550 case STOP_ARRAY_RO: 3551 err = do_md_stop (mddev, 1); 3552 goto done_unlock; 3553 3554 /* 3555 * We have a problem here : there is no easy way to give a CHS 3556 * virtual geometry. We currently pretend that we have a 2 heads 3557 * 4 sectors (with a BIG number of cylinders...). This drives 3558 * dosfs just mad... 
;-) 3559 */ 3560 case HDIO_GETGEO: 3561 if (!loc) { 3562 err = -EINVAL; 3563 goto abort_unlock; 3564 } 3565 err = put_user (2, (char __user *) &loc->heads); 3566 if (err) 3567 goto abort_unlock; 3568 err = put_user (4, (char __user *) &loc->sectors); 3569 if (err) 3570 goto abort_unlock; 3571 err = put_user(get_capacity(mddev->gendisk)/8, 3572 (short __user *) &loc->cylinders); 3573 if (err) 3574 goto abort_unlock; 3575 err = put_user (get_start_sect(inode->i_bdev), 3576 (long __user *) &loc->start); 3577 goto done_unlock; 3578 } 3579 3580 /* 3581 * The remaining ioctls are changing the state of the 3582 * superblock, so we do not allow them on read-only arrays. 3583 * However non-MD ioctls (e.g. get-size) will still come through 3584 * here and hit the 'default' below, so only disallow 3585 * 'md' ioctls, and switch to rw mode if started auto-readonly. 3586 */ 3587 if (_IOC_TYPE(cmd) == MD_MAJOR && 3588 mddev->ro && mddev->pers) { 3589 if (mddev->ro == 2) { 3590 mddev->ro = 0; 3591 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3592 md_wakeup_thread(mddev->thread); 3593 3594 } else { 3595 err = -EROFS; 3596 goto abort_unlock; 3597 } 3598 } 3599 3600 switch (cmd) 3601 { 3602 case ADD_NEW_DISK: 3603 { 3604 mdu_disk_info_t info; 3605 if (copy_from_user(&info, argp, sizeof(info))) 3606 err = -EFAULT; 3607 else 3608 err = add_new_disk(mddev, &info); 3609 goto done_unlock; 3610 } 3611 3612 case HOT_REMOVE_DISK: 3613 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3614 goto done_unlock; 3615 3616 case HOT_ADD_DISK: 3617 err = hot_add_disk(mddev, new_decode_dev(arg)); 3618 goto done_unlock; 3619 3620 case SET_DISK_FAULTY: 3621 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3622 goto done_unlock; 3623 3624 case RUN_ARRAY: 3625 err = do_md_run (mddev); 3626 goto done_unlock; 3627 3628 case SET_BITMAP_FILE: 3629 err = set_bitmap_file(mddev, (int)arg); 3630 goto done_unlock; 3631 3632 default: 3633 if (_IOC_TYPE(cmd) == MD_MAJOR) 3634 printk(KERN_WARNING "md: %s(pid 
%d) used" 3635 " obsolete MD ioctl, upgrade your" 3636 " software to use new ictls.\n", 3637 current->comm, current->pid); 3638 err = -EINVAL; 3639 goto abort_unlock; 3640 } 3641 3642 done_unlock: 3643 abort_unlock: 3644 mddev_unlock(mddev); 3645 3646 return err; 3647 done: 3648 if (err) 3649 MD_BUG(); 3650 abort: 3651 return err; 3652 } 3653 3654 static int md_open(struct inode *inode, struct file *file) 3655 { 3656 /* 3657 * Succeed if we can lock the mddev, which confirms that 3658 * it isn't being stopped right now. 3659 */ 3660 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3661 int err; 3662 3663 if ((err = mddev_lock(mddev))) 3664 goto out; 3665 3666 err = 0; 3667 mddev_get(mddev); 3668 mddev_unlock(mddev); 3669 3670 check_disk_change(inode->i_bdev); 3671 out: 3672 return err; 3673 } 3674 3675 static int md_release(struct inode *inode, struct file * file) 3676 { 3677 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3678 3679 if (!mddev) 3680 BUG(); 3681 mddev_put(mddev); 3682 3683 return 0; 3684 } 3685 3686 static int md_media_changed(struct gendisk *disk) 3687 { 3688 mddev_t *mddev = disk->private_data; 3689 3690 return mddev->changed; 3691 } 3692 3693 static int md_revalidate(struct gendisk *disk) 3694 { 3695 mddev_t *mddev = disk->private_data; 3696 3697 mddev->changed = 0; 3698 return 0; 3699 } 3700 static struct block_device_operations md_fops = 3701 { 3702 .owner = THIS_MODULE, 3703 .open = md_open, 3704 .release = md_release, 3705 .ioctl = md_ioctl, 3706 .media_changed = md_media_changed, 3707 .revalidate_disk= md_revalidate, 3708 }; 3709 3710 static int md_thread(void * arg) 3711 { 3712 mdk_thread_t *thread = arg; 3713 3714 /* 3715 * md_thread is a 'system-thread', it's priority should be very 3716 * high. We avoid resource deadlocks individually in each 3717 * raid personality. (RAID5 does preallocation) We also use RR and 3718 * the very same RT priority as kswapd, thus we will never get 3719 * into a priority inversion deadlock. 
3720 * 3721 * we definitely have to have equal or higher priority than 3722 * bdflush, otherwise bdflush will deadlock if there are too 3723 * many dirty RAID5 blocks. 3724 */ 3725 3726 allow_signal(SIGKILL); 3727 while (!kthread_should_stop()) { 3728 3729 /* We need to wait INTERRUPTIBLE so that 3730 * we don't add to the load-average. 3731 * That means we need to be sure no signals are 3732 * pending 3733 */ 3734 if (signal_pending(current)) 3735 flush_signals(current); 3736 3737 wait_event_interruptible_timeout 3738 (thread->wqueue, 3739 test_bit(THREAD_WAKEUP, &thread->flags) 3740 || kthread_should_stop(), 3741 thread->timeout); 3742 try_to_freeze(); 3743 3744 clear_bit(THREAD_WAKEUP, &thread->flags); 3745 3746 thread->run(thread->mddev); 3747 } 3748 3749 return 0; 3750 } 3751 3752 void md_wakeup_thread(mdk_thread_t *thread) 3753 { 3754 if (thread) { 3755 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3756 set_bit(THREAD_WAKEUP, &thread->flags); 3757 wake_up(&thread->wqueue); 3758 } 3759 } 3760 3761 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3762 const char *name) 3763 { 3764 mdk_thread_t *thread; 3765 3766 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3767 if (!thread) 3768 return NULL; 3769 3770 init_waitqueue_head(&thread->wqueue); 3771 3772 thread->run = run; 3773 thread->mddev = mddev; 3774 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3775 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3776 if (IS_ERR(thread->tsk)) { 3777 kfree(thread); 3778 return NULL; 3779 } 3780 return thread; 3781 } 3782 3783 void md_unregister_thread(mdk_thread_t *thread) 3784 { 3785 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3786 3787 kthread_stop(thread->tsk); 3788 kfree(thread); 3789 } 3790 3791 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3792 { 3793 if (!mddev) { 3794 MD_BUG(); 3795 return; 3796 } 3797 3798 if (!rdev || test_bit(Faulty, &rdev->flags)) 3799 return; 3800 /* 3801 
dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3802 mdname(mddev), 3803 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3804 __builtin_return_address(0),__builtin_return_address(1), 3805 __builtin_return_address(2),__builtin_return_address(3)); 3806 */ 3807 if (!mddev->pers->error_handler) 3808 return; 3809 mddev->pers->error_handler(mddev,rdev); 3810 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3811 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3812 md_wakeup_thread(mddev->thread); 3813 md_new_event(mddev); 3814 } 3815 3816 /* seq_file implementation /proc/mdstat */ 3817 3818 static void status_unused(struct seq_file *seq) 3819 { 3820 int i = 0; 3821 mdk_rdev_t *rdev; 3822 struct list_head *tmp; 3823 3824 seq_printf(seq, "unused devices: "); 3825 3826 ITERATE_RDEV_PENDING(rdev,tmp) { 3827 char b[BDEVNAME_SIZE]; 3828 i++; 3829 seq_printf(seq, "%s ", 3830 bdevname(rdev->bdev,b)); 3831 } 3832 if (!i) 3833 seq_printf(seq, "<none>"); 3834 3835 seq_printf(seq, "\n"); 3836 } 3837 3838 3839 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3840 { 3841 unsigned long max_blocks, resync, res, dt, db, rt; 3842 3843 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3844 3845 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3846 max_blocks = mddev->resync_max_sectors >> 1; 3847 else 3848 max_blocks = mddev->size; 3849 3850 /* 3851 * Should not happen. 3852 */ 3853 if (!max_blocks) { 3854 MD_BUG(); 3855 return; 3856 } 3857 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3858 { 3859 int i, x = res/50, y = 20-x; 3860 seq_printf(seq, "["); 3861 for (i = 0; i < x; i++) 3862 seq_printf(seq, "="); 3863 seq_printf(seq, ">"); 3864 for (i = 0; i < y; i++) 3865 seq_printf(seq, "."); 3866 seq_printf(seq, "] "); 3867 } 3868 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3869 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
3870 "resync" : "recovery"), 3871 res/10, res % 10, resync, max_blocks); 3872 3873 /* 3874 * We do not want to overflow, so the order of operands and 3875 * the * 100 / 100 trick are important. We do a +1 to be 3876 * safe against division by zero. We only estimate anyway. 3877 * 3878 * dt: time from mark until now 3879 * db: blocks written from mark until now 3880 * rt: remaining time 3881 */ 3882 dt = ((jiffies - mddev->resync_mark) / HZ); 3883 if (!dt) dt++; 3884 db = resync - (mddev->resync_mark_cnt/2); 3885 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3886 3887 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3888 3889 seq_printf(seq, " speed=%ldK/sec", db/dt); 3890 } 3891 3892 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3893 { 3894 struct list_head *tmp; 3895 loff_t l = *pos; 3896 mddev_t *mddev; 3897 3898 if (l >= 0x10000) 3899 return NULL; 3900 if (!l--) 3901 /* header */ 3902 return (void*)1; 3903 3904 spin_lock(&all_mddevs_lock); 3905 list_for_each(tmp,&all_mddevs) 3906 if (!l--) { 3907 mddev = list_entry(tmp, mddev_t, all_mddevs); 3908 mddev_get(mddev); 3909 spin_unlock(&all_mddevs_lock); 3910 return mddev; 3911 } 3912 spin_unlock(&all_mddevs_lock); 3913 if (!l--) 3914 return (void*)2;/* tail */ 3915 return NULL; 3916 } 3917 3918 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3919 { 3920 struct list_head *tmp; 3921 mddev_t *next_mddev, *mddev = v; 3922 3923 ++*pos; 3924 if (v == (void*)2) 3925 return NULL; 3926 3927 spin_lock(&all_mddevs_lock); 3928 if (v == (void*)1) 3929 tmp = all_mddevs.next; 3930 else 3931 tmp = mddev->all_mddevs.next; 3932 if (tmp != &all_mddevs) 3933 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3934 else { 3935 next_mddev = (void*)2; 3936 *pos = 0x10000; 3937 } 3938 spin_unlock(&all_mddevs_lock); 3939 3940 if (v != (void*)1) 3941 mddev_put(mddev); 3942 return next_mddev; 3943 3944 } 3945 3946 static void md_seq_stop(struct seq_file *seq, void *v) 3947 { 3948 
mddev_t *mddev = v; 3949 3950 if (mddev && v != (void*)1 && v != (void*)2) 3951 mddev_put(mddev); 3952 } 3953 3954 struct mdstat_info { 3955 int event; 3956 }; 3957 3958 static int md_seq_show(struct seq_file *seq, void *v) 3959 { 3960 mddev_t *mddev = v; 3961 sector_t size; 3962 struct list_head *tmp2; 3963 mdk_rdev_t *rdev; 3964 struct mdstat_info *mi = seq->private; 3965 struct bitmap *bitmap; 3966 3967 if (v == (void*)1) { 3968 struct mdk_personality *pers; 3969 seq_printf(seq, "Personalities : "); 3970 spin_lock(&pers_lock); 3971 list_for_each_entry(pers, &pers_list, list) 3972 seq_printf(seq, "[%s] ", pers->name); 3973 3974 spin_unlock(&pers_lock); 3975 seq_printf(seq, "\n"); 3976 mi->event = atomic_read(&md_event_count); 3977 return 0; 3978 } 3979 if (v == (void*)2) { 3980 status_unused(seq); 3981 return 0; 3982 } 3983 3984 if (mddev_lock(mddev)!=0) 3985 return -EINTR; 3986 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3987 seq_printf(seq, "%s : %sactive", mdname(mddev), 3988 mddev->pers ? 
"" : "in"); 3989 if (mddev->pers) { 3990 if (mddev->ro==1) 3991 seq_printf(seq, " (read-only)"); 3992 if (mddev->ro==2) 3993 seq_printf(seq, "(auto-read-only)"); 3994 seq_printf(seq, " %s", mddev->pers->name); 3995 } 3996 3997 size = 0; 3998 ITERATE_RDEV(mddev,rdev,tmp2) { 3999 char b[BDEVNAME_SIZE]; 4000 seq_printf(seq, " %s[%d]", 4001 bdevname(rdev->bdev,b), rdev->desc_nr); 4002 if (test_bit(WriteMostly, &rdev->flags)) 4003 seq_printf(seq, "(W)"); 4004 if (test_bit(Faulty, &rdev->flags)) { 4005 seq_printf(seq, "(F)"); 4006 continue; 4007 } else if (rdev->raid_disk < 0) 4008 seq_printf(seq, "(S)"); /* spare */ 4009 size += rdev->size; 4010 } 4011 4012 if (!list_empty(&mddev->disks)) { 4013 if (mddev->pers) 4014 seq_printf(seq, "\n %llu blocks", 4015 (unsigned long long)mddev->array_size); 4016 else 4017 seq_printf(seq, "\n %llu blocks", 4018 (unsigned long long)size); 4019 } 4020 if (mddev->persistent) { 4021 if (mddev->major_version != 0 || 4022 mddev->minor_version != 90) { 4023 seq_printf(seq," super %d.%d", 4024 mddev->major_version, 4025 mddev->minor_version); 4026 } 4027 } else 4028 seq_printf(seq, " super non-persistent"); 4029 4030 if (mddev->pers) { 4031 mddev->pers->status (seq, mddev); 4032 seq_printf(seq, "\n "); 4033 if (mddev->pers->sync_request) { 4034 if (mddev->curr_resync > 2) { 4035 status_resync (seq, mddev); 4036 seq_printf(seq, "\n "); 4037 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4038 seq_printf(seq, "\tresync=DELAYED\n "); 4039 else if (mddev->recovery_cp < MaxSector) 4040 seq_printf(seq, "\tresync=PENDING\n "); 4041 } 4042 } else 4043 seq_printf(seq, "\n "); 4044 4045 if ((bitmap = mddev->bitmap)) { 4046 unsigned long chunk_kb; 4047 unsigned long flags; 4048 spin_lock_irqsave(&bitmap->lock, flags); 4049 chunk_kb = bitmap->chunksize >> 10; 4050 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4051 "%lu%s chunk", 4052 bitmap->pages - bitmap->missing_pages, 4053 bitmap->pages, 4054 (bitmap->pages - 
bitmap->missing_pages) 4055 << (PAGE_SHIFT - 10), 4056 chunk_kb ? chunk_kb : bitmap->chunksize, 4057 chunk_kb ? "KB" : "B"); 4058 if (bitmap->file) { 4059 seq_printf(seq, ", file: "); 4060 seq_path(seq, bitmap->file->f_vfsmnt, 4061 bitmap->file->f_dentry," \t\n"); 4062 } 4063 4064 seq_printf(seq, "\n"); 4065 spin_unlock_irqrestore(&bitmap->lock, flags); 4066 } 4067 4068 seq_printf(seq, "\n"); 4069 } 4070 mddev_unlock(mddev); 4071 4072 return 0; 4073 } 4074 4075 static struct seq_operations md_seq_ops = { 4076 .start = md_seq_start, 4077 .next = md_seq_next, 4078 .stop = md_seq_stop, 4079 .show = md_seq_show, 4080 }; 4081 4082 static int md_seq_open(struct inode *inode, struct file *file) 4083 { 4084 int error; 4085 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4086 if (mi == NULL) 4087 return -ENOMEM; 4088 4089 error = seq_open(file, &md_seq_ops); 4090 if (error) 4091 kfree(mi); 4092 else { 4093 struct seq_file *p = file->private_data; 4094 p->private = mi; 4095 mi->event = atomic_read(&md_event_count); 4096 } 4097 return error; 4098 } 4099 4100 static int md_seq_release(struct inode *inode, struct file *file) 4101 { 4102 struct seq_file *m = file->private_data; 4103 struct mdstat_info *mi = m->private; 4104 m->private = NULL; 4105 kfree(mi); 4106 return seq_release(inode, file); 4107 } 4108 4109 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4110 { 4111 struct seq_file *m = filp->private_data; 4112 struct mdstat_info *mi = m->private; 4113 int mask; 4114 4115 poll_wait(filp, &md_event_waiters, wait); 4116 4117 /* always allow read */ 4118 mask = POLLIN | POLLRDNORM; 4119 4120 if (mi->event != atomic_read(&md_event_count)) 4121 mask |= POLLERR | POLLPRI; 4122 return mask; 4123 } 4124 4125 static struct file_operations md_seq_fops = { 4126 .open = md_seq_open, 4127 .read = seq_read, 4128 .llseek = seq_lseek, 4129 .release = md_seq_release, 4130 .poll = mdstat_poll, 4131 }; 4132 4133 int register_md_personality(struct 
mdk_personality *p)
{
	/* Append 'p' to pers_list under pers_lock; always returns 0. */
	spin_lock(&pers_lock);
	list_add_tail(&p->list, &pers_list);
	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
	spin_unlock(&pers_lock);
	return 0;
}

/* Remove 'p' from pers_list under pers_lock; always returns 0. */
int unregister_md_personality(struct mdk_personality *p)
{
	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
	spin_lock(&pers_lock);
	list_del_init(&p->list);
	spin_unlock(&pers_lock);
	return 0;
}

/*
 * Returns 1 if none of the array's member disks has seen significant
 * non-resync IO since the previous call, 0 otherwise.  Each disk that
 * was busy gets its last_events baseline updated.
 */
static int is_mddev_idle(mddev_t *mddev)
{
	mdk_rdev_t * rdev;
	struct list_head *tmp;
	int idle;
	unsigned long curr_events;

	idle = 1;
	ITERATE_RDEV(mddev,rdev,tmp) {
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
		/* sectors counted on the whole disk, minus those accounted as sync IO */
		curr_events = disk_stat_read(disk, sectors[0]) + 
				disk_stat_read(disk, sectors[1]) - 
				atomic_read(&disk->sync_io);
		/* The difference between curr_events and last_events
		 * will be affected by any new non-sync IO (making
		 * curr_events bigger) and any difference in the amount of
		 * in-flight syncio (making current_events bigger or smaller)
		 * The amount in-flight is currently limited to
		 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
		 * which is at most 4096 sectors.
		 * These numbers are fairly fragile and should be made
		 * more robust, probably by enforcing the
		 * 'window size' that md_do_sync sort-of uses.
		 *
		 * Note: the following is an unsigned comparison.
		 */
		if ((curr_events - rdev->last_events + 4096) > 8192) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	return idle;
}

/*
 * Called when 'blocks' sectors of resync IO have completed.  'ok' is
 * zero if the IO failed, in which case the sync is flagged as errored
 * and the array's thread is woken.
 */
void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		/* stop recovery, signal do_sync .... */
	}
}


/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
	/* reads need no metadata update */
	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	atomic_inc(&mddev->writes_pending);
	/* unlocked test first; re-tested under write_lock before changing */
	if (mddev->in_sync) {
		spin_lock_irq(&mddev->write_lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			mddev->sb_dirty = 1;
			md_wakeup_thread(mddev->thread);
		}
		spin_unlock_irq(&mddev->write_lock);
	}
	/* block until sb_dirty has been cleared again */
	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
}

/*
 * Counterpart of md_write_start(): drop one pending write.  When the
 * count reaches zero, either wake the array thread at once
 * (safemode == 2) or re-arm the safemode timer.
 */
void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}

/* Resync threads sleep here while yielding to a conflicting array's resync. */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

/* Number of speed-measurement marks kept, and the interval between them. */
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
static void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors;
unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		mddev->curr_resync = 2;

	try_again:
		if (kthread_should_stop()) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto skip;
		}
		ITERATE_MDDEV(mddev2,tmp) {
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync && 
			    match_mddev_units(mddev,mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
				if (!kthread_should_stop() &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					printk(KERN_INFO "md: delaying resync of %s"
					       " until %s has finished resync (they"
					       " share one or more physical units)\n",
					       mdname(mddev), mdname(mddev2));
					/* NOTE(review): presumably balances a reference
					 * held by ITERATE_MDDEV, since the goto below
					 * leaves the iteration early - confirm against
					 * the macro definition
					 */
					mddev_put(mddev2);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		mddev->resync_mismatches = 0;
	} else
		/* recovery follows the physical size of devices */
		max_sectors = mddev->size << 1;

	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	/* we don't use the checkpoint if there's a bitmap */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		j = mddev->recovery_cp;
	else
		j = 0;
	io_sectors = 0;
	/* start every speed mark at "now" with nothing transferred */
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
		window/2,(unsigned long long) max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;

	if (j>2) {
		printk(KERN_INFO 
			"md: resuming recovery of %s from checkpoint.\n",
			mdname(mddev));
		mddev->curr_resync = j;
	}

	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;
		sectors = mddev->pers->sync_request(mddev, j, &skipped,
					    currspeed < sysctl_speed_limit_min);
		/* a zero return from the personality is treated as failure */
		if (sectors == 0) {
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
			goto out;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		j += sectors;
		/* 1 and 2 are reserved state values of curr_resync */
		if (j>1) mddev->curr_resync = j;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		/* only run the throttling checks once per 'window' of IO */
		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (kthread_should_stop()) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO 
				"md: md_do_sync() got signal ... exiting\n");
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		/* KB/sec since the oldest mark; the +1s avoid a zero divisor
		 * and a zero result
		 */
		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				msleep(500);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	mddev->queue->unplug_fn(mddev->queue);

	/* wait for all outstanding resync IO to complete */
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync >= mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO 
				"md: checkpointing recovery of %s.\n",
				mdname(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

 skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}


/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	if (mddev->bitmap)
		bitmap_daemon_work(mddev->bitmap);

	/* nothing below applies while the array is read-only */
	if (mddev->ro)
		return;

	if (signal_pending(current)) {
		if (mddev->pers->sync_request) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	/* cheap unlocked test: return early unless there is work to do */
	if ( !
(
		mddev->sb_dirty ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->safemode == 1) ||
		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	/* never block here: if the lock is busy, do nothing this time */
	if (mddev_trylock(mddev)==0) {
		int spares =0;

		spin_lock_irq(&mddev->write_lock);
		/* no writes pending and no resync outstanding: mark in_sync */
		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
			mddev->in_sync = 1;
			mddev->sb_dirty = 1;
		}
		if (mddev->safemode == 1)
			mddev->safemode = 0;
		spin_unlock_irq(&mddev->write_lock);

		if (mddev->sb_dirty)
			md_update_sb(mddev);


		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev);

			/* if array is no-longer degraded, then any saved_raid_disk
			 * information must be scrapped
			 */
			if (!mddev->degraded)
				ITERATE_RDEV(mddev,rdev,rtmp)
					rdev->saved_raid_disk = -1;

			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_new_event(mddev);
			goto unlock;
		}
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spare are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
			    atomic_read(&rdev->nr_pending)==0) {
				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
					char nm[20];
					/* drop the sysfs "rd<N>" link for this slot */
					sprintf(nm,"rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}

		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !test_bit(Faulty, &rdev->flags)) {
					/* a non-zero return is treated as "added" here */
					if (mddev->pers->hot_add_disk(mddev,rdev)) {
						char nm[20];
						sprintf(nm, "rd%d", rdev->raid_disk);
						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
						spares++;
						md_new_event(mddev);
					} else
						break;
				}
		}

		if (spares) {
			/* spares were added: this pass is a recovery, not a resync */
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto unlock;

		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"%s_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
					" thread...\n", 
					mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else
				md_wakeup_thread(mddev->sync_thread);
			md_new_event(mddev);
		}
	unlock:
		mddev_unlock(mddev);
	}
}

/*
 * Reboot notifier: on SYS_DOWN, SYS_HALT or SYS_POWER_OFF, call
 * do_md_stop(mddev, 1) on every array whose lock can be taken without
 * blocking, then delay for one second.  Always returns NOTIFY_DONE.
 */
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
4673 */ 4674 mdelay(1000*1); 4675 } 4676 return NOTIFY_DONE; 4677 } 4678 4679 static struct notifier_block md_notifier = { 4680 .notifier_call = md_notify_reboot, 4681 .next = NULL, 4682 .priority = INT_MAX, /* before any real devices */ 4683 }; 4684 4685 static void md_geninit(void) 4686 { 4687 struct proc_dir_entry *p; 4688 4689 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 4690 4691 p = create_proc_entry("mdstat", S_IRUGO, NULL); 4692 if (p) 4693 p->proc_fops = &md_seq_fops; 4694 } 4695 4696 static int __init md_init(void) 4697 { 4698 int minor; 4699 4700 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 4701 " MD_SB_DISKS=%d\n", 4702 MD_MAJOR_VERSION, MD_MINOR_VERSION, 4703 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 4704 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 4705 BITMAP_MINOR); 4706 4707 if (register_blkdev(MAJOR_NR, "md")) 4708 return -1; 4709 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 4710 unregister_blkdev(MAJOR_NR, "md"); 4711 return -1; 4712 } 4713 devfs_mk_dir("md"); 4714 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 4715 md_probe, NULL, NULL); 4716 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 4717 md_probe, NULL, NULL); 4718 4719 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4720 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 4721 S_IFBLK|S_IRUSR|S_IWUSR, 4722 "md/%d", minor); 4723 4724 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4725 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 4726 S_IFBLK|S_IRUSR|S_IWUSR, 4727 "md/mdp%d", minor); 4728 4729 4730 register_reboot_notifier(&md_notifier); 4731 raid_table_header = register_sysctl_table(raid_root_table, 1); 4732 4733 md_geninit(); 4734 return (0); 4735 } 4736 4737 4738 #ifndef MODULE 4739 4740 /* 4741 * Searches all registered partitions for autorun RAID arrays 4742 * at boot time. 
4743 */ 4744 static dev_t detected_devices[128]; 4745 static int dev_cnt; 4746 4747 void md_autodetect_dev(dev_t dev) 4748 { 4749 if (dev_cnt >= 0 && dev_cnt < 127) 4750 detected_devices[dev_cnt++] = dev; 4751 } 4752 4753 4754 static void autostart_arrays(int part) 4755 { 4756 mdk_rdev_t *rdev; 4757 int i; 4758 4759 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4760 4761 for (i = 0; i < dev_cnt; i++) { 4762 dev_t dev = detected_devices[i]; 4763 4764 rdev = md_import_device(dev,0, 0); 4765 if (IS_ERR(rdev)) 4766 continue; 4767 4768 if (test_bit(Faulty, &rdev->flags)) { 4769 MD_BUG(); 4770 continue; 4771 } 4772 list_add(&rdev->same_set, &pending_raid_disks); 4773 } 4774 dev_cnt = 0; 4775 4776 autorun_devices(part); 4777 } 4778 4779 #endif 4780 4781 static __exit void md_exit(void) 4782 { 4783 mddev_t *mddev; 4784 struct list_head *tmp; 4785 int i; 4786 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4787 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4788 for (i=0; i < MAX_MD_DEVS; i++) 4789 devfs_remove("md/%d", i); 4790 for (i=0; i < MAX_MD_DEVS; i++) 4791 devfs_remove("md/d%d", i); 4792 4793 devfs_remove("md"); 4794 4795 unregister_blkdev(MAJOR_NR,"md"); 4796 unregister_blkdev(mdp_major, "mdp"); 4797 unregister_reboot_notifier(&md_notifier); 4798 unregister_sysctl_table(raid_table_header); 4799 remove_proc_entry("mdstat", NULL); 4800 ITERATE_MDDEV(mddev,tmp) { 4801 struct gendisk *disk = mddev->gendisk; 4802 if (!disk) 4803 continue; 4804 export_array(mddev); 4805 del_gendisk(disk); 4806 put_disk(disk); 4807 mddev->gendisk = NULL; 4808 mddev_put(mddev); 4809 } 4810 } 4811 4812 module_init(md_init) 4813 module_exit(md_exit) 4814 4815 static int get_ro(char *buffer, struct kernel_param *kp) 4816 { 4817 return sprintf(buffer, "%d", start_readonly); 4818 } 4819 static int set_ro(const char *val, struct kernel_param *kp) 4820 { 4821 char *e; 4822 int num = simple_strtoul(val, &e, 10); 4823 if (*val && (*e == '\0' || *e == '\n')) 
{ 4824 start_readonly = num; 4825 return 0; 4826 } 4827 return -EINVAL; 4828 } 4829 4830 module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 4831 module_param(start_dirty_degraded, int, 0644); 4832 4833 4834 EXPORT_SYMBOL(register_md_personality); 4835 EXPORT_SYMBOL(unregister_md_personality); 4836 EXPORT_SYMBOL(md_error); 4837 EXPORT_SYMBOL(md_done_sync); 4838 EXPORT_SYMBOL(md_write_start); 4839 EXPORT_SYMBOL(md_write_end); 4840 EXPORT_SYMBOL(md_register_thread); 4841 EXPORT_SYMBOL(md_unregister_thread); 4842 EXPORT_SYMBOL(md_wakeup_thread); 4843 EXPORT_SYMBOL(md_print_devices); 4844 EXPORT_SYMBOL(md_check_recovery); 4845 MODULE_LICENSE("GPL"); 4846 MODULE_ALIAS("md"); 4847 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4848