1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/devfs_fs_kernel.h> 43 #include <linux/buffer_head.h> /* for invalidate_bdev */ 44 #include <linux/suspend.h> 45 #include <linux/poll.h> 46 47 #include <linux/init.h> 48 49 #include <linux/file.h> 50 51 #ifdef CONFIG_KMOD 52 #include <linux/kmod.h> 53 #endif 54 55 #include <asm/unaligned.h> 56 57 #define MAJOR_NR MD_MAJOR 58 #define MD_DRIVER 59 60 /* 63 partitions with the alternate major number (mdp) */ 61 #define MdpMinorShift 6 62 63 #define DEBUG 0 64 #define dprintk(x...) ((void)(DEBUG && printk(x))) 65 66 67 #ifndef MODULE 68 static void autostart_arrays (int part); 69 #endif 70 71 static LIST_HEAD(pers_list); 72 static DEFINE_SPINLOCK(pers_lock); 73 74 /* 75 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 76 * is 1000 KB/sec, so the extra system load does not show up that much. 77 * Increase it if you want to have more _guaranteed_ speed. Note that 78 * the RAID driver will use the maximum available bandwidth if the IO 79 * subsystem is idle. There is also an 'absolute maximum' reconstruction 80 * speed limit - in case reconstruction slows down your system despite 81 * idle IO detection. 82 * 83 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
84 */ 85 86 static int sysctl_speed_limit_min = 1000; 87 static int sysctl_speed_limit_max = 200000; 88 89 static struct ctl_table_header *raid_table_header; 90 91 static ctl_table raid_table[] = { 92 { 93 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 94 .procname = "speed_limit_min", 95 .data = &sysctl_speed_limit_min, 96 .maxlen = sizeof(int), 97 .mode = 0644, 98 .proc_handler = &proc_dointvec, 99 }, 100 { 101 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 102 .procname = "speed_limit_max", 103 .data = &sysctl_speed_limit_max, 104 .maxlen = sizeof(int), 105 .mode = 0644, 106 .proc_handler = &proc_dointvec, 107 }, 108 { .ctl_name = 0 } 109 }; 110 111 static ctl_table raid_dir_table[] = { 112 { 113 .ctl_name = DEV_RAID, 114 .procname = "raid", 115 .maxlen = 0, 116 .mode = 0555, 117 .child = raid_table, 118 }, 119 { .ctl_name = 0 } 120 }; 121 122 static ctl_table raid_root_table[] = { 123 { 124 .ctl_name = CTL_DEV, 125 .procname = "dev", 126 .maxlen = 0, 127 .mode = 0555, 128 .child = raid_dir_table, 129 }, 130 { .ctl_name = 0 } 131 }; 132 133 static struct block_device_operations md_fops; 134 135 static int start_readonly; 136 137 /* 138 * We have a system wide 'event count' that is incremented 139 * on any 'interesting' event, and readers of /proc/mdstat 140 * can use 'poll' or 'select' to find out when the event 141 * count increases. 142 * 143 * Events are: 144 * start array, stop array, error, add device, remove device, 145 * start build, activate spare 146 */ 147 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 148 static atomic_t md_event_count; 149 static void md_new_event(mddev_t *mddev) 150 { 151 atomic_inc(&md_event_count); 152 wake_up(&md_event_waiters); 153 } 154 155 /* 156 * Enables to iterate over all existing md arrays 157 * all_mddevs_lock protects this list. 158 */ 159 static LIST_HEAD(all_mddevs); 160 static DEFINE_SPINLOCK(all_mddevs_lock); 161 162 163 /* 164 * iterates through all used mddevs in the system. 
165 * We take care to grab the all_mddevs_lock whenever navigating 166 * the list, and to always hold a refcount when unlocked. 167 * Any code which breaks out of this loop while own 168 * a reference to the current mddev and must mddev_put it. 169 */ 170 #define ITERATE_MDDEV(mddev,tmp) \ 171 \ 172 for (({ spin_lock(&all_mddevs_lock); \ 173 tmp = all_mddevs.next; \ 174 mddev = NULL;}); \ 175 ({ if (tmp != &all_mddevs) \ 176 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 177 spin_unlock(&all_mddevs_lock); \ 178 if (mddev) mddev_put(mddev); \ 179 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 180 tmp != &all_mddevs;}); \ 181 ({ spin_lock(&all_mddevs_lock); \ 182 tmp = tmp->next;}) \ 183 ) 184 185 186 static int md_fail_request (request_queue_t *q, struct bio *bio) 187 { 188 bio_io_error(bio, bio->bi_size); 189 return 0; 190 } 191 192 static inline mddev_t *mddev_get(mddev_t *mddev) 193 { 194 atomic_inc(&mddev->active); 195 return mddev; 196 } 197 198 static void mddev_put(mddev_t *mddev) 199 { 200 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 201 return; 202 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 203 list_del(&mddev->all_mddevs); 204 blk_put_queue(mddev->queue); 205 kobject_unregister(&mddev->kobj); 206 } 207 spin_unlock(&all_mddevs_lock); 208 } 209 210 static mddev_t * mddev_find(dev_t unit) 211 { 212 mddev_t *mddev, *new = NULL; 213 214 retry: 215 spin_lock(&all_mddevs_lock); 216 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 217 if (mddev->unit == unit) { 218 mddev_get(mddev); 219 spin_unlock(&all_mddevs_lock); 220 kfree(new); 221 return mddev; 222 } 223 224 if (new) { 225 list_add(&new->all_mddevs, &all_mddevs); 226 spin_unlock(&all_mddevs_lock); 227 return new; 228 } 229 spin_unlock(&all_mddevs_lock); 230 231 new = kzalloc(sizeof(*new), GFP_KERNEL); 232 if (!new) 233 return NULL; 234 235 new->unit = unit; 236 if (MAJOR(unit) == MD_MAJOR) 237 new->md_minor = MINOR(unit); 238 else 239 new->md_minor = MINOR(unit) >> 
MdpMinorShift; 240 241 init_MUTEX(&new->reconfig_sem); 242 INIT_LIST_HEAD(&new->disks); 243 INIT_LIST_HEAD(&new->all_mddevs); 244 init_timer(&new->safemode_timer); 245 atomic_set(&new->active, 1); 246 spin_lock_init(&new->write_lock); 247 init_waitqueue_head(&new->sb_wait); 248 249 new->queue = blk_alloc_queue(GFP_KERNEL); 250 if (!new->queue) { 251 kfree(new); 252 return NULL; 253 } 254 255 blk_queue_make_request(new->queue, md_fail_request); 256 257 goto retry; 258 } 259 260 static inline int mddev_lock(mddev_t * mddev) 261 { 262 return down_interruptible(&mddev->reconfig_sem); 263 } 264 265 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 266 { 267 down(&mddev->reconfig_sem); 268 } 269 270 static inline int mddev_trylock(mddev_t * mddev) 271 { 272 return down_trylock(&mddev->reconfig_sem); 273 } 274 275 static inline void mddev_unlock(mddev_t * mddev) 276 { 277 up(&mddev->reconfig_sem); 278 279 md_wakeup_thread(mddev->thread); 280 } 281 282 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 283 { 284 mdk_rdev_t * rdev; 285 struct list_head *tmp; 286 287 ITERATE_RDEV(mddev,rdev,tmp) { 288 if (rdev->desc_nr == nr) 289 return rdev; 290 } 291 return NULL; 292 } 293 294 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 295 { 296 struct list_head *tmp; 297 mdk_rdev_t *rdev; 298 299 ITERATE_RDEV(mddev,rdev,tmp) { 300 if (rdev->bdev->bd_dev == dev) 301 return rdev; 302 } 303 return NULL; 304 } 305 306 static struct mdk_personality *find_pers(int level, char *clevel) 307 { 308 struct mdk_personality *pers; 309 list_for_each_entry(pers, &pers_list, list) { 310 if (level != LEVEL_NONE && pers->level == level) 311 return pers; 312 if (strcmp(pers->name, clevel)==0) 313 return pers; 314 } 315 return NULL; 316 } 317 318 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 319 { 320 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 321 return MD_NEW_SIZE_BLOCKS(size); 322 } 323 324 static sector_t calc_dev_size(mdk_rdev_t 
*rdev, unsigned chunk_size) 325 { 326 sector_t size; 327 328 size = rdev->sb_offset; 329 330 if (chunk_size) 331 size &= ~((sector_t)chunk_size/1024 - 1); 332 return size; 333 } 334 335 static int alloc_disk_sb(mdk_rdev_t * rdev) 336 { 337 if (rdev->sb_page) 338 MD_BUG(); 339 340 rdev->sb_page = alloc_page(GFP_KERNEL); 341 if (!rdev->sb_page) { 342 printk(KERN_ALERT "md: out of memory.\n"); 343 return -EINVAL; 344 } 345 346 return 0; 347 } 348 349 static void free_disk_sb(mdk_rdev_t * rdev) 350 { 351 if (rdev->sb_page) { 352 put_page(rdev->sb_page); 353 rdev->sb_loaded = 0; 354 rdev->sb_page = NULL; 355 rdev->sb_offset = 0; 356 rdev->size = 0; 357 } 358 } 359 360 361 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 362 { 363 mdk_rdev_t *rdev = bio->bi_private; 364 mddev_t *mddev = rdev->mddev; 365 if (bio->bi_size) 366 return 1; 367 368 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 369 md_error(mddev, rdev); 370 371 if (atomic_dec_and_test(&mddev->pending_writes)) 372 wake_up(&mddev->sb_wait); 373 bio_put(bio); 374 return 0; 375 } 376 377 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 378 { 379 struct bio *bio2 = bio->bi_private; 380 mdk_rdev_t *rdev = bio2->bi_private; 381 mddev_t *mddev = rdev->mddev; 382 if (bio->bi_size) 383 return 1; 384 385 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 386 error == -EOPNOTSUPP) { 387 unsigned long flags; 388 /* barriers don't appear to be supported :-( */ 389 set_bit(BarriersNotsupp, &rdev->flags); 390 mddev->barriers_work = 0; 391 spin_lock_irqsave(&mddev->write_lock, flags); 392 bio2->bi_next = mddev->biolist; 393 mddev->biolist = bio2; 394 spin_unlock_irqrestore(&mddev->write_lock, flags); 395 wake_up(&mddev->sb_wait); 396 bio_put(bio); 397 return 0; 398 } 399 bio_put(bio2); 400 bio->bi_private = rdev; 401 return super_written(bio, bytes_done, error); 402 } 403 404 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 405 sector_t sector, int 
size, struct page *page) 406 { 407 /* write first size bytes of page to sector of rdev 408 * Increment mddev->pending_writes before returning 409 * and decrement it on completion, waking up sb_wait 410 * if zero is reached. 411 * If an error occurred, call md_error 412 * 413 * As we might need to resubmit the request if BIO_RW_BARRIER 414 * causes ENOTSUPP, we allocate a spare bio... 415 */ 416 struct bio *bio = bio_alloc(GFP_NOIO, 1); 417 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 418 419 bio->bi_bdev = rdev->bdev; 420 bio->bi_sector = sector; 421 bio_add_page(bio, page, size, 0); 422 bio->bi_private = rdev; 423 bio->bi_end_io = super_written; 424 bio->bi_rw = rw; 425 426 atomic_inc(&mddev->pending_writes); 427 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 428 struct bio *rbio; 429 rw |= (1<<BIO_RW_BARRIER); 430 rbio = bio_clone(bio, GFP_NOIO); 431 rbio->bi_private = bio; 432 rbio->bi_end_io = super_written_barrier; 433 submit_bio(rw, rbio); 434 } else 435 submit_bio(rw, bio); 436 } 437 438 void md_super_wait(mddev_t *mddev) 439 { 440 /* wait for all superblock writes that were scheduled to complete. 
441 * if any had to be retried (due to BARRIER problems), retry them 442 */ 443 DEFINE_WAIT(wq); 444 for(;;) { 445 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 446 if (atomic_read(&mddev->pending_writes)==0) 447 break; 448 while (mddev->biolist) { 449 struct bio *bio; 450 spin_lock_irq(&mddev->write_lock); 451 bio = mddev->biolist; 452 mddev->biolist = bio->bi_next ; 453 bio->bi_next = NULL; 454 spin_unlock_irq(&mddev->write_lock); 455 submit_bio(bio->bi_rw, bio); 456 } 457 schedule(); 458 } 459 finish_wait(&mddev->sb_wait, &wq); 460 } 461 462 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 463 { 464 if (bio->bi_size) 465 return 1; 466 467 complete((struct completion*)bio->bi_private); 468 return 0; 469 } 470 471 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 472 struct page *page, int rw) 473 { 474 struct bio *bio = bio_alloc(GFP_NOIO, 1); 475 struct completion event; 476 int ret; 477 478 rw |= (1 << BIO_RW_SYNC); 479 480 bio->bi_bdev = bdev; 481 bio->bi_sector = sector; 482 bio_add_page(bio, page, size, 0); 483 init_completion(&event); 484 bio->bi_private = &event; 485 bio->bi_end_io = bi_complete; 486 submit_bio(rw, bio); 487 wait_for_completion(&event); 488 489 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 490 bio_put(bio); 491 return ret; 492 } 493 EXPORT_SYMBOL_GPL(sync_page_io); 494 495 static int read_disk_sb(mdk_rdev_t * rdev, int size) 496 { 497 char b[BDEVNAME_SIZE]; 498 if (!rdev->sb_page) { 499 MD_BUG(); 500 return -EINVAL; 501 } 502 if (rdev->sb_loaded) 503 return 0; 504 505 506 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 507 goto fail; 508 rdev->sb_loaded = 1; 509 return 0; 510 511 fail: 512 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 513 bdevname(rdev->bdev,b)); 514 return -EINVAL; 515 } 516 517 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 518 { 519 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 520 
(sb1->set_uuid1 == sb2->set_uuid1) && 521 (sb1->set_uuid2 == sb2->set_uuid2) && 522 (sb1->set_uuid3 == sb2->set_uuid3)) 523 524 return 1; 525 526 return 0; 527 } 528 529 530 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 531 { 532 int ret; 533 mdp_super_t *tmp1, *tmp2; 534 535 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 536 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 537 538 if (!tmp1 || !tmp2) { 539 ret = 0; 540 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 541 goto abort; 542 } 543 544 *tmp1 = *sb1; 545 *tmp2 = *sb2; 546 547 /* 548 * nr_disks is not constant 549 */ 550 tmp1->nr_disks = 0; 551 tmp2->nr_disks = 0; 552 553 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 554 ret = 0; 555 else 556 ret = 1; 557 558 abort: 559 kfree(tmp1); 560 kfree(tmp2); 561 return ret; 562 } 563 564 static unsigned int calc_sb_csum(mdp_super_t * sb) 565 { 566 unsigned int disk_csum, csum; 567 568 disk_csum = sb->sb_csum; 569 sb->sb_csum = 0; 570 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 571 sb->sb_csum = disk_csum; 572 return csum; 573 } 574 575 576 /* 577 * Handle superblock details. 578 * We want to be able to handle multiple superblock formats 579 * so we have a common interface to them all, and an array of 580 * different handlers. 581 * We rely on user-space to write the initial superblock, and support 582 * reading and updating of superblocks. 583 * Interface methods are: 584 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 585 * loads and validates a superblock on dev. 586 * if refdev != NULL, compare superblocks on both devices 587 * Return: 588 * 0 - dev has a superblock that is compatible with refdev 589 * 1 - dev has a superblock that is compatible and newer than refdev 590 * so dev should be used as the refdev in future 591 * -EINVAL superblock incompatible or invalid 592 * -othererror e.g. -EIO 593 * 594 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 595 * Verify that dev is acceptable into mddev. 
596 * The first time, mddev->raid_disks will be 0, and data from 597 * dev should be merged in. Subsequent calls check that dev 598 * is new enough. Return 0 or -EINVAL 599 * 600 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 601 * Update the superblock for rdev with data in mddev 602 * This does not write to disc. 603 * 604 */ 605 606 struct super_type { 607 char *name; 608 struct module *owner; 609 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 610 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 611 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 612 }; 613 614 /* 615 * load_super for 0.90.0 616 */ 617 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 618 { 619 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 620 mdp_super_t *sb; 621 int ret; 622 sector_t sb_offset; 623 624 /* 625 * Calculate the position of the superblock, 626 * it's at the end of the disk. 627 * 628 * It also happens to be a multiple of 4Kb. 629 */ 630 sb_offset = calc_dev_sboffset(rdev->bdev); 631 rdev->sb_offset = sb_offset; 632 633 ret = read_disk_sb(rdev, MD_SB_BYTES); 634 if (ret) return ret; 635 636 ret = -EINVAL; 637 638 bdevname(rdev->bdev, b); 639 sb = (mdp_super_t*)page_address(rdev->sb_page); 640 641 if (sb->md_magic != MD_SB_MAGIC) { 642 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 643 b); 644 goto abort; 645 } 646 647 if (sb->major_version != 0 || 648 sb->minor_version != 90) { 649 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 650 sb->major_version, sb->minor_version, 651 b); 652 goto abort; 653 } 654 655 if (sb->raid_disks <= 0) 656 goto abort; 657 658 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 659 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 660 b); 661 goto abort; 662 } 663 664 rdev->preferred_minor = sb->md_minor; 665 rdev->data_offset = 0; 666 rdev->sb_size = MD_SB_BYTES; 667 668 if (sb->level == LEVEL_MULTIPATH) 669 rdev->desc_nr = -1; 670 
else 671 rdev->desc_nr = sb->this_disk.number; 672 673 if (refdev == 0) 674 ret = 1; 675 else { 676 __u64 ev1, ev2; 677 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 678 if (!uuid_equal(refsb, sb)) { 679 printk(KERN_WARNING "md: %s has different UUID to %s\n", 680 b, bdevname(refdev->bdev,b2)); 681 goto abort; 682 } 683 if (!sb_equal(refsb, sb)) { 684 printk(KERN_WARNING "md: %s has same UUID" 685 " but different superblock to %s\n", 686 b, bdevname(refdev->bdev, b2)); 687 goto abort; 688 } 689 ev1 = md_event(sb); 690 ev2 = md_event(refsb); 691 if (ev1 > ev2) 692 ret = 1; 693 else 694 ret = 0; 695 } 696 rdev->size = calc_dev_size(rdev, sb->chunk_size); 697 698 if (rdev->size < sb->size && sb->level > 1) 699 /* "this cannot possibly happen" ... */ 700 ret = -EINVAL; 701 702 abort: 703 return ret; 704 } 705 706 /* 707 * validate_super for 0.90.0 708 */ 709 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 710 { 711 mdp_disk_t *desc; 712 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 713 714 rdev->raid_disk = -1; 715 rdev->flags = 0; 716 if (mddev->raid_disks == 0) { 717 mddev->major_version = 0; 718 mddev->minor_version = sb->minor_version; 719 mddev->patch_version = sb->patch_version; 720 mddev->persistent = ! 
sb->not_persistent; 721 mddev->chunk_size = sb->chunk_size; 722 mddev->ctime = sb->ctime; 723 mddev->utime = sb->utime; 724 mddev->level = sb->level; 725 mddev->clevel[0] = 0; 726 mddev->layout = sb->layout; 727 mddev->raid_disks = sb->raid_disks; 728 mddev->size = sb->size; 729 mddev->events = md_event(sb); 730 mddev->bitmap_offset = 0; 731 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 732 733 if (sb->state & (1<<MD_SB_CLEAN)) 734 mddev->recovery_cp = MaxSector; 735 else { 736 if (sb->events_hi == sb->cp_events_hi && 737 sb->events_lo == sb->cp_events_lo) { 738 mddev->recovery_cp = sb->recovery_cp; 739 } else 740 mddev->recovery_cp = 0; 741 } 742 743 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 744 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 745 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 746 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 747 748 mddev->max_disks = MD_SB_DISKS; 749 750 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 751 mddev->bitmap_file == NULL) { 752 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 753 && mddev->level != 10) { 754 /* FIXME use a better test */ 755 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 756 return -EINVAL; 757 } 758 mddev->bitmap_offset = mddev->default_bitmap_offset; 759 } 760 761 } else if (mddev->pers == NULL) { 762 /* Insist on good event counter while assembling */ 763 __u64 ev1 = md_event(sb); 764 ++ev1; 765 if (ev1 < mddev->events) 766 return -EINVAL; 767 } else if (mddev->bitmap) { 768 /* if adding to array with a bitmap, then we can accept an 769 * older device ... but not too old. 
770 */ 771 __u64 ev1 = md_event(sb); 772 if (ev1 < mddev->bitmap->events_cleared) 773 return 0; 774 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 775 return 0; 776 777 if (mddev->level != LEVEL_MULTIPATH) { 778 desc = sb->disks + rdev->desc_nr; 779 780 if (desc->state & (1<<MD_DISK_FAULTY)) 781 set_bit(Faulty, &rdev->flags); 782 else if (desc->state & (1<<MD_DISK_SYNC) && 783 desc->raid_disk < mddev->raid_disks) { 784 set_bit(In_sync, &rdev->flags); 785 rdev->raid_disk = desc->raid_disk; 786 } 787 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 788 set_bit(WriteMostly, &rdev->flags); 789 } else /* MULTIPATH are always insync */ 790 set_bit(In_sync, &rdev->flags); 791 return 0; 792 } 793 794 /* 795 * sync_super for 0.90.0 796 */ 797 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 798 { 799 mdp_super_t *sb; 800 struct list_head *tmp; 801 mdk_rdev_t *rdev2; 802 int next_spare = mddev->raid_disks; 803 804 805 /* make rdev->sb match mddev data.. 806 * 807 * 1/ zero out disks 808 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 809 * 3/ any empty disks < next_spare become removed 810 * 811 * disks[0] gets initialised to REMOVED because 812 * we cannot be sure from other fields if it has 813 * been initialised or not. 
814 */ 815 int i; 816 int active=0, working=0,failed=0,spare=0,nr_disks=0; 817 818 rdev->sb_size = MD_SB_BYTES; 819 820 sb = (mdp_super_t*)page_address(rdev->sb_page); 821 822 memset(sb, 0, sizeof(*sb)); 823 824 sb->md_magic = MD_SB_MAGIC; 825 sb->major_version = mddev->major_version; 826 sb->minor_version = mddev->minor_version; 827 sb->patch_version = mddev->patch_version; 828 sb->gvalid_words = 0; /* ignored */ 829 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 830 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 831 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 832 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 833 834 sb->ctime = mddev->ctime; 835 sb->level = mddev->level; 836 sb->size = mddev->size; 837 sb->raid_disks = mddev->raid_disks; 838 sb->md_minor = mddev->md_minor; 839 sb->not_persistent = !mddev->persistent; 840 sb->utime = mddev->utime; 841 sb->state = 0; 842 sb->events_hi = (mddev->events>>32); 843 sb->events_lo = (u32)mddev->events; 844 845 if (mddev->in_sync) 846 { 847 sb->recovery_cp = mddev->recovery_cp; 848 sb->cp_events_hi = (mddev->events>>32); 849 sb->cp_events_lo = (u32)mddev->events; 850 if (mddev->recovery_cp == MaxSector) 851 sb->state = (1<< MD_SB_CLEAN); 852 } else 853 sb->recovery_cp = 0; 854 855 sb->layout = mddev->layout; 856 sb->chunk_size = mddev->chunk_size; 857 858 if (mddev->bitmap && mddev->bitmap_file == NULL) 859 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 860 861 sb->disks[0].state = (1<<MD_DISK_REMOVED); 862 ITERATE_RDEV(mddev,rdev2,tmp) { 863 mdp_disk_t *d; 864 int desc_nr; 865 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 866 && !test_bit(Faulty, &rdev2->flags)) 867 desc_nr = rdev2->raid_disk; 868 else 869 desc_nr = next_spare++; 870 rdev2->desc_nr = desc_nr; 871 d = &sb->disks[rdev2->desc_nr]; 872 nr_disks++; 873 d->number = rdev2->desc_nr; 874 d->major = MAJOR(rdev2->bdev->bd_dev); 875 d->minor = MINOR(rdev2->bdev->bd_dev); 876 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 877 && !test_bit(Faulty, 
&rdev2->flags)) 878 d->raid_disk = rdev2->raid_disk; 879 else 880 d->raid_disk = rdev2->desc_nr; /* compatibility */ 881 if (test_bit(Faulty, &rdev2->flags)) { 882 d->state = (1<<MD_DISK_FAULTY); 883 failed++; 884 } else if (test_bit(In_sync, &rdev2->flags)) { 885 d->state = (1<<MD_DISK_ACTIVE); 886 d->state |= (1<<MD_DISK_SYNC); 887 active++; 888 working++; 889 } else { 890 d->state = 0; 891 spare++; 892 working++; 893 } 894 if (test_bit(WriteMostly, &rdev2->flags)) 895 d->state |= (1<<MD_DISK_WRITEMOSTLY); 896 } 897 /* now set the "removed" and "faulty" bits on any missing devices */ 898 for (i=0 ; i < mddev->raid_disks ; i++) { 899 mdp_disk_t *d = &sb->disks[i]; 900 if (d->state == 0 && d->number == 0) { 901 d->number = i; 902 d->raid_disk = i; 903 d->state = (1<<MD_DISK_REMOVED); 904 d->state |= (1<<MD_DISK_FAULTY); 905 failed++; 906 } 907 } 908 sb->nr_disks = nr_disks; 909 sb->active_disks = active; 910 sb->working_disks = working; 911 sb->failed_disks = failed; 912 sb->spare_disks = spare; 913 914 sb->this_disk = sb->disks[rdev->desc_nr]; 915 sb->sb_csum = calc_sb_csum(sb); 916 } 917 918 /* 919 * version 1 superblock 920 */ 921 922 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 923 { 924 unsigned int disk_csum, csum; 925 unsigned long long newcsum; 926 int size = 256 + le32_to_cpu(sb->max_dev)*2; 927 unsigned int *isuper = (unsigned int*)sb; 928 int i; 929 930 disk_csum = sb->sb_csum; 931 sb->sb_csum = 0; 932 newcsum = 0; 933 for (i=0; size>=4; size -= 4 ) 934 newcsum += le32_to_cpu(*isuper++); 935 936 if (size == 2) 937 newcsum += le16_to_cpu(*(unsigned short*) isuper); 938 939 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 940 sb->sb_csum = disk_csum; 941 return cpu_to_le32(csum); 942 } 943 944 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 945 { 946 struct mdp_superblock_1 *sb; 947 int ret; 948 sector_t sb_offset; 949 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 950 int bmask; 951 952 /* 953 * Calculate 
the position of the superblock. 954 * It is always aligned to a 4K boundary and 955 * depeding on minor_version, it can be: 956 * 0: At least 8K, but less than 12K, from end of device 957 * 1: At start of device 958 * 2: 4K from start of device. 959 */ 960 switch(minor_version) { 961 case 0: 962 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 963 sb_offset -= 8*2; 964 sb_offset &= ~(sector_t)(4*2-1); 965 /* convert from sectors to K */ 966 sb_offset /= 2; 967 break; 968 case 1: 969 sb_offset = 0; 970 break; 971 case 2: 972 sb_offset = 4; 973 break; 974 default: 975 return -EINVAL; 976 } 977 rdev->sb_offset = sb_offset; 978 979 /* superblock is rarely larger than 1K, but it can be larger, 980 * and it is safe to read 4k, so we do that 981 */ 982 ret = read_disk_sb(rdev, 4096); 983 if (ret) return ret; 984 985 986 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 987 988 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 989 sb->major_version != cpu_to_le32(1) || 990 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 991 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 992 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 993 return -EINVAL; 994 995 if (calc_sb_1_csum(sb) != sb->sb_csum) { 996 printk("md: invalid superblock checksum on %s\n", 997 bdevname(rdev->bdev,b)); 998 return -EINVAL; 999 } 1000 if (le64_to_cpu(sb->data_size) < 10) { 1001 printk("md: data_size too small on %s\n", 1002 bdevname(rdev->bdev,b)); 1003 return -EINVAL; 1004 } 1005 rdev->preferred_minor = 0xffff; 1006 rdev->data_offset = le64_to_cpu(sb->data_offset); 1007 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1008 1009 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1010 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1011 if (rdev->sb_size & bmask) 1012 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1013 1014 if (refdev == 0) 1015 return 1; 1016 else { 1017 __u64 ev1, ev2; 1018 struct mdp_superblock_1 *refsb = 1019 (struct 
mdp_superblock_1*)page_address(refdev->sb_page); 1020 1021 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1022 sb->level != refsb->level || 1023 sb->layout != refsb->layout || 1024 sb->chunksize != refsb->chunksize) { 1025 printk(KERN_WARNING "md: %s has strangely different" 1026 " superblock to %s\n", 1027 bdevname(rdev->bdev,b), 1028 bdevname(refdev->bdev,b2)); 1029 return -EINVAL; 1030 } 1031 ev1 = le64_to_cpu(sb->events); 1032 ev2 = le64_to_cpu(refsb->events); 1033 1034 if (ev1 > ev2) 1035 return 1; 1036 } 1037 if (minor_version) 1038 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1039 else 1040 rdev->size = rdev->sb_offset; 1041 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1042 return -EINVAL; 1043 rdev->size = le64_to_cpu(sb->data_size)/2; 1044 if (le32_to_cpu(sb->chunksize)) 1045 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1046 1047 if (le32_to_cpu(sb->size) > rdev->size*2) 1048 return -EINVAL; 1049 return 0; 1050 } 1051 1052 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1053 { 1054 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1055 1056 rdev->raid_disk = -1; 1057 rdev->flags = 0; 1058 if (mddev->raid_disks == 0) { 1059 mddev->major_version = 1; 1060 mddev->patch_version = 0; 1061 mddev->persistent = 1; 1062 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1063 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1064 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1065 mddev->level = le32_to_cpu(sb->level); 1066 mddev->clevel[0] = 0; 1067 mddev->layout = le32_to_cpu(sb->layout); 1068 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1069 mddev->size = le64_to_cpu(sb->size)/2; 1070 mddev->events = le64_to_cpu(sb->events); 1071 mddev->bitmap_offset = 0; 1072 mddev->default_bitmap_offset = 1024; 1073 1074 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1075 memcpy(mddev->uuid, sb->set_uuid, 16); 1076 1077 
mddev->max_disks = (4096-256)/2; 1078 1079 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1080 mddev->bitmap_file == NULL ) { 1081 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1082 && mddev->level != 10) { 1083 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1084 return -EINVAL; 1085 } 1086 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1087 } 1088 } else if (mddev->pers == NULL) { 1089 /* Insist of good event counter while assembling */ 1090 __u64 ev1 = le64_to_cpu(sb->events); 1091 ++ev1; 1092 if (ev1 < mddev->events) 1093 return -EINVAL; 1094 } else if (mddev->bitmap) { 1095 /* If adding to array with a bitmap, then we can accept an 1096 * older device, but not too old. 1097 */ 1098 __u64 ev1 = le64_to_cpu(sb->events); 1099 if (ev1 < mddev->bitmap->events_cleared) 1100 return 0; 1101 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1102 return 0; 1103 1104 if (mddev->level != LEVEL_MULTIPATH) { 1105 int role; 1106 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1107 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1108 switch(role) { 1109 case 0xffff: /* spare */ 1110 break; 1111 case 0xfffe: /* faulty */ 1112 set_bit(Faulty, &rdev->flags); 1113 break; 1114 default: 1115 set_bit(In_sync, &rdev->flags); 1116 rdev->raid_disk = role; 1117 break; 1118 } 1119 if (sb->devflags & WriteMostly1) 1120 set_bit(WriteMostly, &rdev->flags); 1121 } else /* MULTIPATH are always insync */ 1122 set_bit(In_sync, &rdev->flags); 1123 1124 return 0; 1125 } 1126 1127 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1128 { 1129 struct mdp_superblock_1 *sb; 1130 struct list_head *tmp; 1131 mdk_rdev_t *rdev2; 1132 int max_dev, i; 1133 /* make rdev->sb match mddev and rdev data. 
*/ 1134 1135 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1136 1137 sb->feature_map = 0; 1138 sb->pad0 = 0; 1139 memset(sb->pad1, 0, sizeof(sb->pad1)); 1140 memset(sb->pad2, 0, sizeof(sb->pad2)); 1141 memset(sb->pad3, 0, sizeof(sb->pad3)); 1142 1143 sb->utime = cpu_to_le64((__u64)mddev->utime); 1144 sb->events = cpu_to_le64(mddev->events); 1145 if (mddev->in_sync) 1146 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1147 else 1148 sb->resync_offset = cpu_to_le64(0); 1149 1150 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); 1151 1152 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1153 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1154 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1155 } 1156 1157 max_dev = 0; 1158 ITERATE_RDEV(mddev,rdev2,tmp) 1159 if (rdev2->desc_nr+1 > max_dev) 1160 max_dev = rdev2->desc_nr+1; 1161 1162 sb->max_dev = cpu_to_le32(max_dev); 1163 for (i=0; i<max_dev;i++) 1164 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1165 1166 ITERATE_RDEV(mddev,rdev2,tmp) { 1167 i = rdev2->desc_nr; 1168 if (test_bit(Faulty, &rdev2->flags)) 1169 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1170 else if (test_bit(In_sync, &rdev2->flags)) 1171 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1172 else 1173 sb->dev_roles[i] = cpu_to_le16(0xffff); 1174 } 1175 1176 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1177 sb->sb_csum = calc_sb_1_csum(sb); 1178 } 1179 1180 1181 static struct super_type super_types[] = { 1182 [0] = { 1183 .name = "0.90.0", 1184 .owner = THIS_MODULE, 1185 .load_super = super_90_load, 1186 .validate_super = super_90_validate, 1187 .sync_super = super_90_sync, 1188 }, 1189 [1] = { 1190 .name = "md-1", 1191 .owner = THIS_MODULE, 1192 .load_super = super_1_load, 1193 .validate_super = super_1_validate, 1194 .sync_super = super_1_sync, 1195 }, 1196 }; 1197 1198 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1199 { 1200 struct list_head *tmp; 1201 mdk_rdev_t 
*rdev; 1202 1203 ITERATE_RDEV(mddev,rdev,tmp) 1204 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1205 return rdev; 1206 1207 return NULL; 1208 } 1209 1210 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1211 { 1212 struct list_head *tmp; 1213 mdk_rdev_t *rdev; 1214 1215 ITERATE_RDEV(mddev1,rdev,tmp) 1216 if (match_dev_unit(mddev2, rdev)) 1217 return 1; 1218 1219 return 0; 1220 } 1221 1222 static LIST_HEAD(pending_raid_disks); 1223 1224 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1225 { 1226 mdk_rdev_t *same_pdev; 1227 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1228 struct kobject *ko; 1229 1230 if (rdev->mddev) { 1231 MD_BUG(); 1232 return -EINVAL; 1233 } 1234 /* make sure rdev->size exceeds mddev->size */ 1235 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1236 if (mddev->pers) 1237 /* Cannot change size, so fail */ 1238 return -ENOSPC; 1239 else 1240 mddev->size = rdev->size; 1241 } 1242 same_pdev = match_dev_unit(mddev, rdev); 1243 if (same_pdev) 1244 printk(KERN_WARNING 1245 "%s: WARNING: %s appears to be on the same physical" 1246 " disk as %s. True\n protection against single-disk" 1247 " failure might be compromised.\n", 1248 mdname(mddev), bdevname(rdev->bdev,b), 1249 bdevname(same_pdev->bdev,b2)); 1250 1251 /* Verify rdev->desc_nr is unique. 
1252 * If it is -1, assign a free number, else 1253 * check number is not in use 1254 */ 1255 if (rdev->desc_nr < 0) { 1256 int choice = 0; 1257 if (mddev->pers) choice = mddev->raid_disks; 1258 while (find_rdev_nr(mddev, choice)) 1259 choice++; 1260 rdev->desc_nr = choice; 1261 } else { 1262 if (find_rdev_nr(mddev, rdev->desc_nr)) 1263 return -EBUSY; 1264 } 1265 bdevname(rdev->bdev,b); 1266 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1267 return -ENOMEM; 1268 1269 list_add(&rdev->same_set, &mddev->disks); 1270 rdev->mddev = mddev; 1271 printk(KERN_INFO "md: bind<%s>\n", b); 1272 1273 rdev->kobj.parent = &mddev->kobj; 1274 kobject_add(&rdev->kobj); 1275 1276 if (rdev->bdev->bd_part) 1277 ko = &rdev->bdev->bd_part->kobj; 1278 else 1279 ko = &rdev->bdev->bd_disk->kobj; 1280 sysfs_create_link(&rdev->kobj, ko, "block"); 1281 return 0; 1282 } 1283 1284 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1285 { 1286 char b[BDEVNAME_SIZE]; 1287 if (!rdev->mddev) { 1288 MD_BUG(); 1289 return; 1290 } 1291 list_del_init(&rdev->same_set); 1292 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1293 rdev->mddev = NULL; 1294 sysfs_remove_link(&rdev->kobj, "block"); 1295 kobject_del(&rdev->kobj); 1296 } 1297 1298 /* 1299 * prevent the device from being mounted, repartitioned or 1300 * otherwise reused by a RAID array (or any other kernel 1301 * subsystem), by bd_claiming the device. 
1302 */ 1303 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1304 { 1305 int err = 0; 1306 struct block_device *bdev; 1307 char b[BDEVNAME_SIZE]; 1308 1309 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1310 if (IS_ERR(bdev)) { 1311 printk(KERN_ERR "md: could not open %s.\n", 1312 __bdevname(dev, b)); 1313 return PTR_ERR(bdev); 1314 } 1315 err = bd_claim(bdev, rdev); 1316 if (err) { 1317 printk(KERN_ERR "md: could not bd_claim %s.\n", 1318 bdevname(bdev, b)); 1319 blkdev_put(bdev); 1320 return err; 1321 } 1322 rdev->bdev = bdev; 1323 return err; 1324 } 1325 1326 static void unlock_rdev(mdk_rdev_t *rdev) 1327 { 1328 struct block_device *bdev = rdev->bdev; 1329 rdev->bdev = NULL; 1330 if (!bdev) 1331 MD_BUG(); 1332 bd_release(bdev); 1333 blkdev_put(bdev); 1334 } 1335 1336 void md_autodetect_dev(dev_t dev); 1337 1338 static void export_rdev(mdk_rdev_t * rdev) 1339 { 1340 char b[BDEVNAME_SIZE]; 1341 printk(KERN_INFO "md: export_rdev(%s)\n", 1342 bdevname(rdev->bdev,b)); 1343 if (rdev->mddev) 1344 MD_BUG(); 1345 free_disk_sb(rdev); 1346 list_del_init(&rdev->same_set); 1347 #ifndef MODULE 1348 md_autodetect_dev(rdev->bdev->bd_dev); 1349 #endif 1350 unlock_rdev(rdev); 1351 kobject_put(&rdev->kobj); 1352 } 1353 1354 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1355 { 1356 unbind_rdev_from_array(rdev); 1357 export_rdev(rdev); 1358 } 1359 1360 static void export_array(mddev_t *mddev) 1361 { 1362 struct list_head *tmp; 1363 mdk_rdev_t *rdev; 1364 1365 ITERATE_RDEV(mddev,rdev,tmp) { 1366 if (!rdev->mddev) { 1367 MD_BUG(); 1368 continue; 1369 } 1370 kick_rdev_from_array(rdev); 1371 } 1372 if (!list_empty(&mddev->disks)) 1373 MD_BUG(); 1374 mddev->raid_disks = 0; 1375 mddev->major_version = 0; 1376 } 1377 1378 static void print_desc(mdp_disk_t *desc) 1379 { 1380 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1381 desc->major,desc->minor,desc->raid_disk,desc->state); 1382 } 1383 1384 static void print_sb(mdp_super_t *sb) 1385 { 1386 int i; 1387 1388 
printk(KERN_INFO 1389 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1390 sb->major_version, sb->minor_version, sb->patch_version, 1391 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1392 sb->ctime); 1393 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1394 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1395 sb->md_minor, sb->layout, sb->chunk_size); 1396 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1397 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1398 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1399 sb->failed_disks, sb->spare_disks, 1400 sb->sb_csum, (unsigned long)sb->events_lo); 1401 1402 printk(KERN_INFO); 1403 for (i = 0; i < MD_SB_DISKS; i++) { 1404 mdp_disk_t *desc; 1405 1406 desc = sb->disks + i; 1407 if (desc->number || desc->major || desc->minor || 1408 desc->raid_disk || (desc->state && (desc->state != 4))) { 1409 printk(" D %2d: ", i); 1410 print_desc(desc); 1411 } 1412 } 1413 printk(KERN_INFO "md: THIS: "); 1414 print_desc(&sb->this_disk); 1415 1416 } 1417 1418 static void print_rdev(mdk_rdev_t *rdev) 1419 { 1420 char b[BDEVNAME_SIZE]; 1421 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1422 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1423 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1424 rdev->desc_nr); 1425 if (rdev->sb_loaded) { 1426 printk(KERN_INFO "md: rdev superblock:\n"); 1427 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1428 } else 1429 printk(KERN_INFO "md: no rdev superblock!\n"); 1430 } 1431 1432 void md_print_devices(void) 1433 { 1434 struct list_head *tmp, *tmp2; 1435 mdk_rdev_t *rdev; 1436 mddev_t *mddev; 1437 char b[BDEVNAME_SIZE]; 1438 1439 printk("\n"); 1440 printk("md: **********************************\n"); 1441 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1442 printk("md: **********************************\n"); 1443 ITERATE_MDDEV(mddev,tmp) { 1444 1445 if (mddev->bitmap) 1446 bitmap_print_sb(mddev->bitmap); 1447 
else 1448 printk("%s: ", mdname(mddev)); 1449 ITERATE_RDEV(mddev,rdev,tmp2) 1450 printk("<%s>", bdevname(rdev->bdev,b)); 1451 printk("\n"); 1452 1453 ITERATE_RDEV(mddev,rdev,tmp2) 1454 print_rdev(rdev); 1455 } 1456 printk("md: **********************************\n"); 1457 printk("\n"); 1458 } 1459 1460 1461 static void sync_sbs(mddev_t * mddev) 1462 { 1463 mdk_rdev_t *rdev; 1464 struct list_head *tmp; 1465 1466 ITERATE_RDEV(mddev,rdev,tmp) { 1467 super_types[mddev->major_version]. 1468 sync_super(mddev, rdev); 1469 rdev->sb_loaded = 1; 1470 } 1471 } 1472 1473 static void md_update_sb(mddev_t * mddev) 1474 { 1475 int err; 1476 struct list_head *tmp; 1477 mdk_rdev_t *rdev; 1478 int sync_req; 1479 1480 repeat: 1481 spin_lock_irq(&mddev->write_lock); 1482 sync_req = mddev->in_sync; 1483 mddev->utime = get_seconds(); 1484 mddev->events ++; 1485 1486 if (!mddev->events) { 1487 /* 1488 * oops, this 64-bit counter should never wrap. 1489 * Either we are in around ~1 trillion A.C., assuming 1490 * 1 reboot per second, or we have a bug: 1491 */ 1492 MD_BUG(); 1493 mddev->events --; 1494 } 1495 mddev->sb_dirty = 2; 1496 sync_sbs(mddev); 1497 1498 /* 1499 * do not write anything to disk if using 1500 * nonpersistent superblocks 1501 */ 1502 if (!mddev->persistent) { 1503 mddev->sb_dirty = 0; 1504 spin_unlock_irq(&mddev->write_lock); 1505 wake_up(&mddev->sb_wait); 1506 return; 1507 } 1508 spin_unlock_irq(&mddev->write_lock); 1509 1510 dprintk(KERN_INFO 1511 "md: updating %s RAID superblock on device (in sync %d)\n", 1512 mdname(mddev),mddev->in_sync); 1513 1514 err = bitmap_update_sb(mddev->bitmap); 1515 ITERATE_RDEV(mddev,rdev,tmp) { 1516 char b[BDEVNAME_SIZE]; 1517 dprintk(KERN_INFO "md: "); 1518 if (test_bit(Faulty, &rdev->flags)) 1519 dprintk("(skipping faulty "); 1520 1521 dprintk("%s ", bdevname(rdev->bdev,b)); 1522 if (!test_bit(Faulty, &rdev->flags)) { 1523 md_super_write(mddev,rdev, 1524 rdev->sb_offset<<1, rdev->sb_size, 1525 rdev->sb_page); 1526 dprintk(KERN_INFO 
/*
 * Words written to sysfs files may or may not be '\n' terminated, and
 * both forms should be accepted.  cmd_match() does that comparison.
 *
 * @cmd: text written into the sysfs file.
 * @str: keyword to compare against.
 *
 * Returns 1 if @cmd equals @str, optionally followed by exactly one
 * trailing newline; returns 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* step over the common prefix of the two strings */
	for (; *cmd != '\0' && *cmd == *str; cmd++, str++)
		;

	/* tolerate a single newline left over in the command */
	if (*cmd == '\n')
		cmd++;

	/* a match requires both strings to be fully consumed */
	return *cmd == '\0' && *str == '\0';
}
__ATTR_RO(state); 1601 1602 static ssize_t 1603 super_show(mdk_rdev_t *rdev, char *page) 1604 { 1605 if (rdev->sb_loaded && rdev->sb_size) { 1606 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1607 return rdev->sb_size; 1608 } else 1609 return 0; 1610 } 1611 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1612 1613 static ssize_t 1614 errors_show(mdk_rdev_t *rdev, char *page) 1615 { 1616 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1617 } 1618 1619 static ssize_t 1620 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1621 { 1622 char *e; 1623 unsigned long n = simple_strtoul(buf, &e, 10); 1624 if (*buf && (*e == 0 || *e == '\n')) { 1625 atomic_set(&rdev->corrected_errors, n); 1626 return len; 1627 } 1628 return -EINVAL; 1629 } 1630 static struct rdev_sysfs_entry rdev_errors = 1631 __ATTR(errors, 0644, errors_show, errors_store); 1632 1633 static struct attribute *rdev_default_attrs[] = { 1634 &rdev_state.attr, 1635 &rdev_super.attr, 1636 &rdev_errors.attr, 1637 NULL, 1638 }; 1639 static ssize_t 1640 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1641 { 1642 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1643 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1644 1645 if (!entry->show) 1646 return -EIO; 1647 return entry->show(rdev, page); 1648 } 1649 1650 static ssize_t 1651 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1652 const char *page, size_t length) 1653 { 1654 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1655 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1656 1657 if (!entry->store) 1658 return -EIO; 1659 return entry->store(rdev, page, length); 1660 } 1661 1662 static void rdev_free(struct kobject *ko) 1663 { 1664 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1665 kfree(rdev); 1666 } 1667 static struct sysfs_ops rdev_sysfs_ops = { 1668 .show = 
rdev_attr_show, 1669 .store = rdev_attr_store, 1670 }; 1671 static struct kobj_type rdev_ktype = { 1672 .release = rdev_free, 1673 .sysfs_ops = &rdev_sysfs_ops, 1674 .default_attrs = rdev_default_attrs, 1675 }; 1676 1677 /* 1678 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1679 * 1680 * mark the device faulty if: 1681 * 1682 * - the device is nonexistent (zero size) 1683 * - the device has no valid superblock 1684 * 1685 * a faulty rdev _never_ has rdev->sb set. 1686 */ 1687 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1688 { 1689 char b[BDEVNAME_SIZE]; 1690 int err; 1691 mdk_rdev_t *rdev; 1692 sector_t size; 1693 1694 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1695 if (!rdev) { 1696 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1697 return ERR_PTR(-ENOMEM); 1698 } 1699 1700 if ((err = alloc_disk_sb(rdev))) 1701 goto abort_free; 1702 1703 err = lock_rdev(rdev, newdev); 1704 if (err) 1705 goto abort_free; 1706 1707 rdev->kobj.parent = NULL; 1708 rdev->kobj.ktype = &rdev_ktype; 1709 kobject_init(&rdev->kobj); 1710 1711 rdev->desc_nr = -1; 1712 rdev->flags = 0; 1713 rdev->data_offset = 0; 1714 atomic_set(&rdev->nr_pending, 0); 1715 atomic_set(&rdev->read_errors, 0); 1716 atomic_set(&rdev->corrected_errors, 0); 1717 1718 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1719 if (!size) { 1720 printk(KERN_WARNING 1721 "md: %s has zero or unknown size, marking faulty!\n", 1722 bdevname(rdev->bdev,b)); 1723 err = -EINVAL; 1724 goto abort_free; 1725 } 1726 1727 if (super_format >= 0) { 1728 err = super_types[super_format]. 
1729 load_super(rdev, NULL, super_minor); 1730 if (err == -EINVAL) { 1731 printk(KERN_WARNING 1732 "md: %s has invalid sb, not importing!\n", 1733 bdevname(rdev->bdev,b)); 1734 goto abort_free; 1735 } 1736 if (err < 0) { 1737 printk(KERN_WARNING 1738 "md: could not read %s's sb, not importing!\n", 1739 bdevname(rdev->bdev,b)); 1740 goto abort_free; 1741 } 1742 } 1743 INIT_LIST_HEAD(&rdev->same_set); 1744 1745 return rdev; 1746 1747 abort_free: 1748 if (rdev->sb_page) { 1749 if (rdev->bdev) 1750 unlock_rdev(rdev); 1751 free_disk_sb(rdev); 1752 } 1753 kfree(rdev); 1754 return ERR_PTR(err); 1755 } 1756 1757 /* 1758 * Check a full RAID array for plausibility 1759 */ 1760 1761 1762 static void analyze_sbs(mddev_t * mddev) 1763 { 1764 int i; 1765 struct list_head *tmp; 1766 mdk_rdev_t *rdev, *freshest; 1767 char b[BDEVNAME_SIZE]; 1768 1769 freshest = NULL; 1770 ITERATE_RDEV(mddev,rdev,tmp) 1771 switch (super_types[mddev->major_version]. 1772 load_super(rdev, freshest, mddev->minor_version)) { 1773 case 1: 1774 freshest = rdev; 1775 break; 1776 case 0: 1777 break; 1778 default: 1779 printk( KERN_ERR \ 1780 "md: fatal superblock inconsistency in %s" 1781 " -- removing from array\n", 1782 bdevname(rdev->bdev,b)); 1783 kick_rdev_from_array(rdev); 1784 } 1785 1786 1787 super_types[mddev->major_version]. 1788 validate_super(mddev, freshest); 1789 1790 i = 0; 1791 ITERATE_RDEV(mddev,rdev,tmp) { 1792 if (rdev != freshest) 1793 if (super_types[mddev->major_version]. 
1794 validate_super(mddev, rdev)) { 1795 printk(KERN_WARNING "md: kicking non-fresh %s" 1796 " from array!\n", 1797 bdevname(rdev->bdev,b)); 1798 kick_rdev_from_array(rdev); 1799 continue; 1800 } 1801 if (mddev->level == LEVEL_MULTIPATH) { 1802 rdev->desc_nr = i++; 1803 rdev->raid_disk = rdev->desc_nr; 1804 set_bit(In_sync, &rdev->flags); 1805 } 1806 } 1807 1808 1809 1810 if (mddev->recovery_cp != MaxSector && 1811 mddev->level >= 1) 1812 printk(KERN_ERR "md: %s: raid array is not clean" 1813 " -- starting background reconstruction\n", 1814 mdname(mddev)); 1815 1816 } 1817 1818 static ssize_t 1819 level_show(mddev_t *mddev, char *page) 1820 { 1821 struct mdk_personality *p = mddev->pers; 1822 if (p) 1823 return sprintf(page, "%s\n", p->name); 1824 else if (mddev->clevel[0]) 1825 return sprintf(page, "%s\n", mddev->clevel); 1826 else if (mddev->level != LEVEL_NONE) 1827 return sprintf(page, "%d\n", mddev->level); 1828 else 1829 return 0; 1830 } 1831 1832 static ssize_t 1833 level_store(mddev_t *mddev, const char *buf, size_t len) 1834 { 1835 int rv = len; 1836 if (mddev->pers) 1837 return -EBUSY; 1838 if (len == 0) 1839 return 0; 1840 if (len >= sizeof(mddev->clevel)) 1841 return -ENOSPC; 1842 strncpy(mddev->clevel, buf, len); 1843 if (mddev->clevel[len-1] == '\n') 1844 len--; 1845 mddev->clevel[len] = 0; 1846 mddev->level = LEVEL_NONE; 1847 return rv; 1848 } 1849 1850 static struct md_sysfs_entry md_level = 1851 __ATTR(level, 0644, level_show, level_store); 1852 1853 static ssize_t 1854 raid_disks_show(mddev_t *mddev, char *page) 1855 { 1856 if (mddev->raid_disks == 0) 1857 return 0; 1858 return sprintf(page, "%d\n", mddev->raid_disks); 1859 } 1860 1861 static int update_raid_disks(mddev_t *mddev, int raid_disks); 1862 1863 static ssize_t 1864 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 1865 { 1866 /* can only set raid_disks if array is not yet active */ 1867 char *e; 1868 int rv = 0; 1869 unsigned long n = simple_strtoul(buf, &e, 10); 1870 1871 
if (!*buf || (*e && *e != '\n')) 1872 return -EINVAL; 1873 1874 if (mddev->pers) 1875 rv = update_raid_disks(mddev, n); 1876 else 1877 mddev->raid_disks = n; 1878 return rv ? rv : len; 1879 } 1880 static struct md_sysfs_entry md_raid_disks = 1881 __ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); 1882 1883 static ssize_t 1884 chunk_size_show(mddev_t *mddev, char *page) 1885 { 1886 return sprintf(page, "%d\n", mddev->chunk_size); 1887 } 1888 1889 static ssize_t 1890 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 1891 { 1892 /* can only set chunk_size if array is not yet active */ 1893 char *e; 1894 unsigned long n = simple_strtoul(buf, &e, 10); 1895 1896 if (mddev->pers) 1897 return -EBUSY; 1898 if (!*buf || (*e && *e != '\n')) 1899 return -EINVAL; 1900 1901 mddev->chunk_size = n; 1902 return len; 1903 } 1904 static struct md_sysfs_entry md_chunk_size = 1905 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); 1906 1907 1908 static ssize_t 1909 size_show(mddev_t *mddev, char *page) 1910 { 1911 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 1912 } 1913 1914 static int update_size(mddev_t *mddev, unsigned long size); 1915 1916 static ssize_t 1917 size_store(mddev_t *mddev, const char *buf, size_t len) 1918 { 1919 /* If array is inactive, we can reduce the component size, but 1920 * not increase it (except from 0). 1921 * If array is active, we can try an on-line resize 1922 */ 1923 char *e; 1924 int err = 0; 1925 unsigned long long size = simple_strtoull(buf, &e, 10); 1926 if (!*buf || *buf == '\n' || 1927 (*e && *e != '\n')) 1928 return -EINVAL; 1929 1930 if (mddev->pers) { 1931 err = update_size(mddev, size); 1932 md_update_sb(mddev); 1933 } else { 1934 if (mddev->size == 0 || 1935 mddev->size > size) 1936 mddev->size = size; 1937 else 1938 err = -ENOSPC; 1939 } 1940 return err ? 
err : len; 1941 } 1942 1943 static struct md_sysfs_entry md_size = 1944 __ATTR(component_size, 0644, size_show, size_store); 1945 1946 1947 /* Metdata version. 1948 * This is either 'none' for arrays with externally managed metadata, 1949 * or N.M for internally known formats 1950 */ 1951 static ssize_t 1952 metadata_show(mddev_t *mddev, char *page) 1953 { 1954 if (mddev->persistent) 1955 return sprintf(page, "%d.%d\n", 1956 mddev->major_version, mddev->minor_version); 1957 else 1958 return sprintf(page, "none\n"); 1959 } 1960 1961 static ssize_t 1962 metadata_store(mddev_t *mddev, const char *buf, size_t len) 1963 { 1964 int major, minor; 1965 char *e; 1966 if (!list_empty(&mddev->disks)) 1967 return -EBUSY; 1968 1969 if (cmd_match(buf, "none")) { 1970 mddev->persistent = 0; 1971 mddev->major_version = 0; 1972 mddev->minor_version = 90; 1973 return len; 1974 } 1975 major = simple_strtoul(buf, &e, 10); 1976 if (e==buf || *e != '.') 1977 return -EINVAL; 1978 buf = e+1; 1979 minor = simple_strtoul(buf, &e, 10); 1980 if (e==buf || *e != '\n') 1981 return -EINVAL; 1982 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 1983 super_types[major].name == NULL) 1984 return -ENOENT; 1985 mddev->major_version = major; 1986 mddev->minor_version = minor; 1987 mddev->persistent = 1; 1988 return len; 1989 } 1990 1991 static struct md_sysfs_entry md_metadata = 1992 __ATTR(metadata_version, 0644, metadata_show, metadata_store); 1993 1994 static ssize_t 1995 action_show(mddev_t *mddev, char *page) 1996 { 1997 char *type = "idle"; 1998 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1999 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2000 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2001 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2002 type = "resync"; 2003 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2004 type = "check"; 2005 else 2006 type = "repair"; 2007 } else 2008 type = "recover"; 2009 } 2010 return sprintf(page, "%s\n", type); 
2011 } 2012 2013 static ssize_t 2014 action_store(mddev_t *mddev, const char *page, size_t len) 2015 { 2016 if (!mddev->pers || !mddev->pers->sync_request) 2017 return -EINVAL; 2018 2019 if (cmd_match(page, "idle")) { 2020 if (mddev->sync_thread) { 2021 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2022 md_unregister_thread(mddev->sync_thread); 2023 mddev->sync_thread = NULL; 2024 mddev->recovery = 0; 2025 } 2026 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2027 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2028 return -EBUSY; 2029 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2030 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2031 else { 2032 if (cmd_match(page, "check")) 2033 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2034 else if (cmd_match(page, "repair")) 2035 return -EINVAL; 2036 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2037 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2038 } 2039 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2040 md_wakeup_thread(mddev->thread); 2041 return len; 2042 } 2043 2044 static ssize_t 2045 mismatch_cnt_show(mddev_t *mddev, char *page) 2046 { 2047 return sprintf(page, "%llu\n", 2048 (unsigned long long) mddev->resync_mismatches); 2049 } 2050 2051 static struct md_sysfs_entry 2052 md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2053 2054 2055 static struct md_sysfs_entry 2056 md_mismatches = __ATTR_RO(mismatch_cnt); 2057 2058 static struct attribute *md_default_attrs[] = { 2059 &md_level.attr, 2060 &md_raid_disks.attr, 2061 &md_chunk_size.attr, 2062 &md_size.attr, 2063 &md_metadata.attr, 2064 NULL, 2065 }; 2066 2067 static struct attribute *md_redundancy_attrs[] = { 2068 &md_scan_mode.attr, 2069 &md_mismatches.attr, 2070 NULL, 2071 }; 2072 static struct attribute_group md_redundancy_group = { 2073 .name = NULL, 2074 .attrs = md_redundancy_attrs, 2075 }; 2076 2077 2078 static ssize_t 2079 md_attr_show(struct kobject *kobj, struct attribute *attr, char 
*page) 2080 { 2081 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2082 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2083 ssize_t rv; 2084 2085 if (!entry->show) 2086 return -EIO; 2087 mddev_lock(mddev); 2088 rv = entry->show(mddev, page); 2089 mddev_unlock(mddev); 2090 return rv; 2091 } 2092 2093 static ssize_t 2094 md_attr_store(struct kobject *kobj, struct attribute *attr, 2095 const char *page, size_t length) 2096 { 2097 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2098 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2099 ssize_t rv; 2100 2101 if (!entry->store) 2102 return -EIO; 2103 mddev_lock(mddev); 2104 rv = entry->store(mddev, page, length); 2105 mddev_unlock(mddev); 2106 return rv; 2107 } 2108 2109 static void md_free(struct kobject *ko) 2110 { 2111 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2112 kfree(mddev); 2113 } 2114 2115 static struct sysfs_ops md_sysfs_ops = { 2116 .show = md_attr_show, 2117 .store = md_attr_store, 2118 }; 2119 static struct kobj_type md_ktype = { 2120 .release = md_free, 2121 .sysfs_ops = &md_sysfs_ops, 2122 .default_attrs = md_default_attrs, 2123 }; 2124 2125 int mdp_major = 0; 2126 2127 static struct kobject *md_probe(dev_t dev, int *part, void *data) 2128 { 2129 static DECLARE_MUTEX(disks_sem); 2130 mddev_t *mddev = mddev_find(dev); 2131 struct gendisk *disk; 2132 int partitioned = (MAJOR(dev) != MD_MAJOR); 2133 int shift = partitioned ? 
/*
 * do_md_run - start an array whose member devices have been bound.
 *
 * Validates the configuration, looks up and attaches the personality
 * for the array's level, runs it, creates the bitmap where the
 * personality supports resync, publishes the sysfs links and finally
 * switches the request queue over to the personality.
 *
 * Returns 0 on success, or:
 *   -EINVAL  no member devices, invalid chunk size, a member smaller
 *            than one chunk, or no loaded personality for the level
 *   -EBUSY   the array already has a personality
 *   -ENOMEM  no gendisk is available after md_probe()
 *   or the error returned by pers->run() / bitmap_create().
 */
static int do_md_run(mddev_t * mddev)
{
	int err;
	int chunk_size;
	struct list_head *tmp;
	mdk_rdev_t *rdev;
	struct gendisk *disk;
	struct mdk_personality *pers;
	char b[BDEVNAME_SIZE];

	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s); only done when raid_disks has
	 * not been set yet.
	 */
	if (!mddev->raid_disks)
		analyze_sbs(mddev);

	chunk_size = mddev->chunk_size;

	/* a chunk_size of 0 skips all chunk checks */
	if (chunk_size) {
		if (chunk_size > MAX_CHUNK_SIZE) {
			printk(KERN_ERR "too big chunk_size: %d > %d\n",
				chunk_size, MAX_CHUNK_SIZE);
			return -EINVAL;
		}
		/*
		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
		 */
		if ( (1 << ffz(~chunk_size)) != chunk_size) {
			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
			return -EINVAL;
		}
		if (chunk_size < PAGE_SIZE) {
			printk(KERN_ERR "too small chunk_size: %d < %ld\n",
				chunk_size, PAGE_SIZE);
			return -EINVAL;
		}

		/* devices must have minimum size of one chunk */
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (test_bit(Faulty, &rdev->flags))
				continue;
			/* chunk_size / 1024 is the chunk size in KB, as printed below */
			if (rdev->size < chunk_size / 1024) {
				printk(KERN_WARNING
					"md: Dev %s smaller than chunk_size:"
					" %lluk < %dk\n",
					bdevname(rdev->bdev,b),
					(unsigned long long)rdev->size,
					chunk_size / 1024);
				return -EINVAL;
			}
		}
	}

#ifdef CONFIG_KMOD
	/* try to load the personality module, by number or by name */
	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);
#endif

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * Faulty members are skipped.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev, 0);
	}

	/* make sure the array has a gendisk before going any further */
	md_probe(mddev->unit, NULL, NULL);
	disk = mddev->gendisk;
	if (!disk)
		return -ENOMEM;

	/* look up the personality and pin its module under pers_lock */
	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		if (mddev->level != LEVEL_NONE)
			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
			       mddev->level);
		else
			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
			       mddev->clevel);
		return -EINVAL;
	}
	mddev->pers = pers;
	spin_unlock(&pers_lock);
	/* record both the numeric level and the name of the personality found */
	mddev->level = pers->level;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

	mddev->recovery = 0;
	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
	mddev->barriers_work = 1;
	mddev->ok_start_degraded = start_dirty_degraded;

	if (start_readonly)
		mddev->ro = 2; /* read-only, but switch on first write */

	err = mddev->pers->run(mddev);
	/* a bitmap is only created for personalities that can resync */
	if (!err && mddev->pers->sync_request) {
		err = bitmap_create(mddev);
		if (err) {
			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
			       mdname(mddev), err);
			/* run() succeeded, so stop the personality again */
			mddev->pers->stop(mddev);
		}
	}
	if (err) {
		/* reached for a failed run() and for a failed bitmap_create() */
		printk(KERN_ERR "md: pers->run() failed ...\n");
		module_put(mddev->pers->owner);
		mddev->pers = NULL;
		bitmap_destroy(mddev);
		return err;
	}
	if (mddev->pers->sync_request)
		sysfs_create_group(&mddev->kobj, &md_redundancy_group);
	else if (mddev->ro == 2) /* auto-readonly not meaningful */
		mddev->ro = 0;

	atomic_set(&mddev->writes_pending,0);
	mddev->safemode = 0;
	mddev->safemode_timer.function = md_safemode_timeout;
	mddev->safemode_timer.data = (unsigned long) mddev;
	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
	mddev->in_sync = 1;

	/* publish an "rd<N>" link for every member that holds a raid slot */
	ITERATE_RDEV(mddev,rdev,tmp)
		if (rdev->raid_disk >= 0) {
			char nm[20];
			sprintf(nm, "rd%d", rdev->raid_disk);
			sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
		}

	/* flag recovery as needed and wake the array's thread */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	if (mddev->sb_dirty)
		md_update_sb(mddev);

	/* capacity is set to twice array_size */
	set_capacity(disk, mddev->array_size<<1);

	/* If we call blk_queue_make_request here, it will
	 * re-initialise max_sectors etc which may have been
	 * refined inside -> run.  So just set the bits we need to set.
	 * Most initialisation happened when we called
	 * blk_queue_make_request(..., md_fail_request)
	 * earlier.
	 */
	mddev->queue->queuedata = mddev;
	mddev->queue->make_request_fn = mddev->pers->make_request;

	mddev->changed = 1;
	md_new_event(mddev);
	return 0;
}
(mddev->pers) { 2399 if (atomic_read(&mddev->active)>2) { 2400 printk("md: %s still in use.\n",mdname(mddev)); 2401 return -EBUSY; 2402 } 2403 2404 if (mddev->sync_thread) { 2405 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2406 md_unregister_thread(mddev->sync_thread); 2407 mddev->sync_thread = NULL; 2408 } 2409 2410 del_timer_sync(&mddev->safemode_timer); 2411 2412 invalidate_partition(disk, 0); 2413 2414 if (ro) { 2415 err = -ENXIO; 2416 if (mddev->ro==1) 2417 goto out; 2418 mddev->ro = 1; 2419 } else { 2420 bitmap_flush(mddev); 2421 md_super_wait(mddev); 2422 if (mddev->ro) 2423 set_disk_ro(disk, 0); 2424 blk_queue_make_request(mddev->queue, md_fail_request); 2425 mddev->pers->stop(mddev); 2426 if (mddev->pers->sync_request) 2427 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 2428 2429 module_put(mddev->pers->owner); 2430 mddev->pers = NULL; 2431 if (mddev->ro) 2432 mddev->ro = 0; 2433 } 2434 if (!mddev->in_sync) { 2435 /* mark array as shutdown cleanly */ 2436 mddev->in_sync = 1; 2437 md_update_sb(mddev); 2438 } 2439 if (ro) 2440 set_disk_ro(disk, 1); 2441 } 2442 2443 bitmap_destroy(mddev); 2444 if (mddev->bitmap_file) { 2445 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2446 fput(mddev->bitmap_file); 2447 mddev->bitmap_file = NULL; 2448 } 2449 mddev->bitmap_offset = 0; 2450 2451 /* 2452 * Free resources if final stop 2453 */ 2454 if (!ro) { 2455 mdk_rdev_t *rdev; 2456 struct list_head *tmp; 2457 struct gendisk *disk; 2458 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2459 2460 ITERATE_RDEV(mddev,rdev,tmp) 2461 if (rdev->raid_disk >= 0) { 2462 char nm[20]; 2463 sprintf(nm, "rd%d", rdev->raid_disk); 2464 sysfs_remove_link(&mddev->kobj, nm); 2465 } 2466 2467 export_array(mddev); 2468 2469 mddev->array_size = 0; 2470 disk = mddev->gendisk; 2471 if (disk) 2472 set_capacity(disk, 0); 2473 mddev->changed = 1; 2474 } else 2475 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2476 mdname(mddev)); 2477 err = 0; 2478 
md_new_event(mddev); 2479 out: 2480 return err; 2481 } 2482 2483 static void autorun_array(mddev_t *mddev) 2484 { 2485 mdk_rdev_t *rdev; 2486 struct list_head *tmp; 2487 int err; 2488 2489 if (list_empty(&mddev->disks)) 2490 return; 2491 2492 printk(KERN_INFO "md: running: "); 2493 2494 ITERATE_RDEV(mddev,rdev,tmp) { 2495 char b[BDEVNAME_SIZE]; 2496 printk("<%s>", bdevname(rdev->bdev,b)); 2497 } 2498 printk("\n"); 2499 2500 err = do_md_run (mddev); 2501 if (err) { 2502 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2503 do_md_stop (mddev, 0); 2504 } 2505 } 2506 2507 /* 2508 * lets try to run arrays based on all disks that have arrived 2509 * until now. (those are in pending_raid_disks) 2510 * 2511 * the method: pick the first pending disk, collect all disks with 2512 * the same UUID, remove all from the pending list and put them into 2513 * the 'same_array' list. Then order this list based on superblock 2514 * update time (freshest comes first), kick out 'old' disks and 2515 * compare superblocks. If everything's fine then run it. 2516 * 2517 * If "unit" is allocated, then bump its reference count 2518 */ 2519 static void autorun_devices(int part) 2520 { 2521 struct list_head candidates; 2522 struct list_head *tmp; 2523 mdk_rdev_t *rdev0, *rdev; 2524 mddev_t *mddev; 2525 char b[BDEVNAME_SIZE]; 2526 2527 printk(KERN_INFO "md: autorun ...\n"); 2528 while (!list_empty(&pending_raid_disks)) { 2529 dev_t dev; 2530 rdev0 = list_entry(pending_raid_disks.next, 2531 mdk_rdev_t, same_set); 2532 2533 printk(KERN_INFO "md: considering %s ...\n", 2534 bdevname(rdev0->bdev,b)); 2535 INIT_LIST_HEAD(&candidates); 2536 ITERATE_RDEV_PENDING(rdev,tmp) 2537 if (super_90_load(rdev, rdev0, 0) >= 0) { 2538 printk(KERN_INFO "md: adding %s ...\n", 2539 bdevname(rdev->bdev,b)); 2540 list_move(&rdev->same_set, &candidates); 2541 } 2542 /* 2543 * now we have a set of devices, with all of them having 2544 * mostly sane superblocks. It's time to allocate the 2545 * mddev. 
2546 */ 2547 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2548 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2549 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2550 break; 2551 } 2552 if (part) 2553 dev = MKDEV(mdp_major, 2554 rdev0->preferred_minor << MdpMinorShift); 2555 else 2556 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2557 2558 md_probe(dev, NULL, NULL); 2559 mddev = mddev_find(dev); 2560 if (!mddev) { 2561 printk(KERN_ERR 2562 "md: cannot allocate memory for md drive.\n"); 2563 break; 2564 } 2565 if (mddev_lock(mddev)) 2566 printk(KERN_WARNING "md: %s locked, cannot run\n", 2567 mdname(mddev)); 2568 else if (mddev->raid_disks || mddev->major_version 2569 || !list_empty(&mddev->disks)) { 2570 printk(KERN_WARNING 2571 "md: %s already running, cannot run %s\n", 2572 mdname(mddev), bdevname(rdev0->bdev,b)); 2573 mddev_unlock(mddev); 2574 } else { 2575 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2576 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2577 list_del_init(&rdev->same_set); 2578 if (bind_rdev_to_array(rdev, mddev)) 2579 export_rdev(rdev); 2580 } 2581 autorun_array(mddev); 2582 mddev_unlock(mddev); 2583 } 2584 /* on success, candidates will be empty, on error 2585 * it won't... 2586 */ 2587 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2588 export_rdev(rdev); 2589 mddev_put(mddev); 2590 } 2591 printk(KERN_INFO "md: ... autorun DONE.\n"); 2592 } 2593 2594 /* 2595 * import RAID devices based on one partition 2596 * if possible, the array gets run as well. 
2597 */ 2598 2599 static int autostart_array(dev_t startdev) 2600 { 2601 char b[BDEVNAME_SIZE]; 2602 int err = -EINVAL, i; 2603 mdp_super_t *sb = NULL; 2604 mdk_rdev_t *start_rdev = NULL, *rdev; 2605 2606 start_rdev = md_import_device(startdev, 0, 0); 2607 if (IS_ERR(start_rdev)) 2608 return err; 2609 2610 2611 /* NOTE: this can only work for 0.90.0 superblocks */ 2612 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2613 if (sb->major_version != 0 || 2614 sb->minor_version != 90 ) { 2615 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2616 export_rdev(start_rdev); 2617 return err; 2618 } 2619 2620 if (test_bit(Faulty, &start_rdev->flags)) { 2621 printk(KERN_WARNING 2622 "md: can not autostart based on faulty %s!\n", 2623 bdevname(start_rdev->bdev,b)); 2624 export_rdev(start_rdev); 2625 return err; 2626 } 2627 list_add(&start_rdev->same_set, &pending_raid_disks); 2628 2629 for (i = 0; i < MD_SB_DISKS; i++) { 2630 mdp_disk_t *desc = sb->disks + i; 2631 dev_t dev = MKDEV(desc->major, desc->minor); 2632 2633 if (!dev) 2634 continue; 2635 if (dev == startdev) 2636 continue; 2637 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2638 continue; 2639 rdev = md_import_device(dev, 0, 0); 2640 if (IS_ERR(rdev)) 2641 continue; 2642 2643 list_add(&rdev->same_set, &pending_raid_disks); 2644 } 2645 2646 /* 2647 * possibly return codes 2648 */ 2649 autorun_devices(0); 2650 return 0; 2651 2652 } 2653 2654 2655 static int get_version(void __user * arg) 2656 { 2657 mdu_version_t ver; 2658 2659 ver.major = MD_MAJOR_VERSION; 2660 ver.minor = MD_MINOR_VERSION; 2661 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2662 2663 if (copy_to_user(arg, &ver, sizeof(ver))) 2664 return -EFAULT; 2665 2666 return 0; 2667 } 2668 2669 static int get_array_info(mddev_t * mddev, void __user * arg) 2670 { 2671 mdu_array_info_t info; 2672 int nr,working,active,failed,spare; 2673 mdk_rdev_t *rdev; 2674 struct list_head *tmp; 2675 2676 nr=working=active=failed=spare=0; 2677 
ITERATE_RDEV(mddev,rdev,tmp) { 2678 nr++; 2679 if (test_bit(Faulty, &rdev->flags)) 2680 failed++; 2681 else { 2682 working++; 2683 if (test_bit(In_sync, &rdev->flags)) 2684 active++; 2685 else 2686 spare++; 2687 } 2688 } 2689 2690 info.major_version = mddev->major_version; 2691 info.minor_version = mddev->minor_version; 2692 info.patch_version = MD_PATCHLEVEL_VERSION; 2693 info.ctime = mddev->ctime; 2694 info.level = mddev->level; 2695 info.size = mddev->size; 2696 info.nr_disks = nr; 2697 info.raid_disks = mddev->raid_disks; 2698 info.md_minor = mddev->md_minor; 2699 info.not_persistent= !mddev->persistent; 2700 2701 info.utime = mddev->utime; 2702 info.state = 0; 2703 if (mddev->in_sync) 2704 info.state = (1<<MD_SB_CLEAN); 2705 if (mddev->bitmap && mddev->bitmap_offset) 2706 info.state = (1<<MD_SB_BITMAP_PRESENT); 2707 info.active_disks = active; 2708 info.working_disks = working; 2709 info.failed_disks = failed; 2710 info.spare_disks = spare; 2711 2712 info.layout = mddev->layout; 2713 info.chunk_size = mddev->chunk_size; 2714 2715 if (copy_to_user(arg, &info, sizeof(info))) 2716 return -EFAULT; 2717 2718 return 0; 2719 } 2720 2721 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2722 { 2723 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2724 char *ptr, *buf = NULL; 2725 int err = -ENOMEM; 2726 2727 file = kmalloc(sizeof(*file), GFP_KERNEL); 2728 if (!file) 2729 goto out; 2730 2731 /* bitmap disabled, zero the first byte and copy out */ 2732 if (!mddev->bitmap || !mddev->bitmap->file) { 2733 file->pathname[0] = '\0'; 2734 goto copy_out; 2735 } 2736 2737 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2738 if (!buf) 2739 goto out; 2740 2741 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2742 if (!ptr) 2743 goto out; 2744 2745 strcpy(file->pathname, ptr); 2746 2747 copy_out: 2748 err = 0; 2749 if (copy_to_user(arg, file, sizeof(*file))) 2750 err = -EFAULT; 2751 out: 2752 kfree(buf); 2753 kfree(file); 2754 
return err; 2755 } 2756 2757 static int get_disk_info(mddev_t * mddev, void __user * arg) 2758 { 2759 mdu_disk_info_t info; 2760 unsigned int nr; 2761 mdk_rdev_t *rdev; 2762 2763 if (copy_from_user(&info, arg, sizeof(info))) 2764 return -EFAULT; 2765 2766 nr = info.number; 2767 2768 rdev = find_rdev_nr(mddev, nr); 2769 if (rdev) { 2770 info.major = MAJOR(rdev->bdev->bd_dev); 2771 info.minor = MINOR(rdev->bdev->bd_dev); 2772 info.raid_disk = rdev->raid_disk; 2773 info.state = 0; 2774 if (test_bit(Faulty, &rdev->flags)) 2775 info.state |= (1<<MD_DISK_FAULTY); 2776 else if (test_bit(In_sync, &rdev->flags)) { 2777 info.state |= (1<<MD_DISK_ACTIVE); 2778 info.state |= (1<<MD_DISK_SYNC); 2779 } 2780 if (test_bit(WriteMostly, &rdev->flags)) 2781 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2782 } else { 2783 info.major = info.minor = 0; 2784 info.raid_disk = -1; 2785 info.state = (1<<MD_DISK_REMOVED); 2786 } 2787 2788 if (copy_to_user(arg, &info, sizeof(info))) 2789 return -EFAULT; 2790 2791 return 0; 2792 } 2793 2794 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2795 { 2796 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2797 mdk_rdev_t *rdev; 2798 dev_t dev = MKDEV(info->major,info->minor); 2799 2800 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2801 return -EOVERFLOW; 2802 2803 if (!mddev->raid_disks) { 2804 int err; 2805 /* expecting a device which has a superblock */ 2806 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2807 if (IS_ERR(rdev)) { 2808 printk(KERN_WARNING 2809 "md: md_import_device returned %ld\n", 2810 PTR_ERR(rdev)); 2811 return PTR_ERR(rdev); 2812 } 2813 if (!list_empty(&mddev->disks)) { 2814 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2815 mdk_rdev_t, same_set); 2816 int err = super_types[mddev->major_version] 2817 .load_super(rdev, rdev0, mddev->minor_version); 2818 if (err < 0) { 2819 printk(KERN_WARNING 2820 "md: %s has different UUID to %s\n", 2821 bdevname(rdev->bdev,b), 2822 
bdevname(rdev0->bdev,b2)); 2823 export_rdev(rdev); 2824 return -EINVAL; 2825 } 2826 } 2827 err = bind_rdev_to_array(rdev, mddev); 2828 if (err) 2829 export_rdev(rdev); 2830 return err; 2831 } 2832 2833 /* 2834 * add_new_disk can be used once the array is assembled 2835 * to add "hot spares". They must already have a superblock 2836 * written 2837 */ 2838 if (mddev->pers) { 2839 int err; 2840 if (!mddev->pers->hot_add_disk) { 2841 printk(KERN_WARNING 2842 "%s: personality does not support diskops!\n", 2843 mdname(mddev)); 2844 return -EINVAL; 2845 } 2846 if (mddev->persistent) 2847 rdev = md_import_device(dev, mddev->major_version, 2848 mddev->minor_version); 2849 else 2850 rdev = md_import_device(dev, -1, -1); 2851 if (IS_ERR(rdev)) { 2852 printk(KERN_WARNING 2853 "md: md_import_device returned %ld\n", 2854 PTR_ERR(rdev)); 2855 return PTR_ERR(rdev); 2856 } 2857 /* set save_raid_disk if appropriate */ 2858 if (!mddev->persistent) { 2859 if (info->state & (1<<MD_DISK_SYNC) && 2860 info->raid_disk < mddev->raid_disks) 2861 rdev->raid_disk = info->raid_disk; 2862 else 2863 rdev->raid_disk = -1; 2864 } else 2865 super_types[mddev->major_version]. 
2866 validate_super(mddev, rdev); 2867 rdev->saved_raid_disk = rdev->raid_disk; 2868 2869 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 2870 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2871 set_bit(WriteMostly, &rdev->flags); 2872 2873 rdev->raid_disk = -1; 2874 err = bind_rdev_to_array(rdev, mddev); 2875 if (err) 2876 export_rdev(rdev); 2877 2878 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2879 md_wakeup_thread(mddev->thread); 2880 return err; 2881 } 2882 2883 /* otherwise, add_new_disk is only allowed 2884 * for major_version==0 superblocks 2885 */ 2886 if (mddev->major_version != 0) { 2887 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2888 mdname(mddev)); 2889 return -EINVAL; 2890 } 2891 2892 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2893 int err; 2894 rdev = md_import_device (dev, -1, 0); 2895 if (IS_ERR(rdev)) { 2896 printk(KERN_WARNING 2897 "md: error, md_import_device() returned %ld\n", 2898 PTR_ERR(rdev)); 2899 return PTR_ERR(rdev); 2900 } 2901 rdev->desc_nr = info->number; 2902 if (info->raid_disk < mddev->raid_disks) 2903 rdev->raid_disk = info->raid_disk; 2904 else 2905 rdev->raid_disk = -1; 2906 2907 rdev->flags = 0; 2908 2909 if (rdev->raid_disk < mddev->raid_disks) 2910 if (info->state & (1<<MD_DISK_SYNC)) 2911 set_bit(In_sync, &rdev->flags); 2912 2913 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2914 set_bit(WriteMostly, &rdev->flags); 2915 2916 if (!mddev->persistent) { 2917 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2918 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2919 } else 2920 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2921 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2922 2923 err = bind_rdev_to_array(rdev, mddev); 2924 if (err) { 2925 export_rdev(rdev); 2926 return err; 2927 } 2928 } 2929 2930 return 0; 2931 } 2932 2933 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2934 { 2935 char b[BDEVNAME_SIZE]; 2936 mdk_rdev_t *rdev; 2937 2938 if (!mddev->pers) 2939 return 
-ENODEV; 2940 2941 rdev = find_rdev(mddev, dev); 2942 if (!rdev) 2943 return -ENXIO; 2944 2945 if (rdev->raid_disk >= 0) 2946 goto busy; 2947 2948 kick_rdev_from_array(rdev); 2949 md_update_sb(mddev); 2950 md_new_event(mddev); 2951 2952 return 0; 2953 busy: 2954 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 2955 bdevname(rdev->bdev,b), mdname(mddev)); 2956 return -EBUSY; 2957 } 2958 2959 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2960 { 2961 char b[BDEVNAME_SIZE]; 2962 int err; 2963 unsigned int size; 2964 mdk_rdev_t *rdev; 2965 2966 if (!mddev->pers) 2967 return -ENODEV; 2968 2969 if (mddev->major_version != 0) { 2970 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2971 " version-0 superblocks.\n", 2972 mdname(mddev)); 2973 return -EINVAL; 2974 } 2975 if (!mddev->pers->hot_add_disk) { 2976 printk(KERN_WARNING 2977 "%s: personality does not support diskops!\n", 2978 mdname(mddev)); 2979 return -EINVAL; 2980 } 2981 2982 rdev = md_import_device (dev, -1, 0); 2983 if (IS_ERR(rdev)) { 2984 printk(KERN_WARNING 2985 "md: error, md_import_device() returned %ld\n", 2986 PTR_ERR(rdev)); 2987 return -EINVAL; 2988 } 2989 2990 if (mddev->persistent) 2991 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2992 else 2993 rdev->sb_offset = 2994 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2995 2996 size = calc_dev_size(rdev, mddev->chunk_size); 2997 rdev->size = size; 2998 2999 if (test_bit(Faulty, &rdev->flags)) { 3000 printk(KERN_WARNING 3001 "md: can not hot-add faulty %s disk to %s!\n", 3002 bdevname(rdev->bdev,b), mdname(mddev)); 3003 err = -EINVAL; 3004 goto abort_export; 3005 } 3006 clear_bit(In_sync, &rdev->flags); 3007 rdev->desc_nr = -1; 3008 err = bind_rdev_to_array(rdev, mddev); 3009 if (err) 3010 goto abort_export; 3011 3012 /* 3013 * The rest should better be atomic, we can have disk failures 3014 * noticed in interrupt contexts ... 
3015 */ 3016 3017 if (rdev->desc_nr == mddev->max_disks) { 3018 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3019 mdname(mddev)); 3020 err = -EBUSY; 3021 goto abort_unbind_export; 3022 } 3023 3024 rdev->raid_disk = -1; 3025 3026 md_update_sb(mddev); 3027 3028 /* 3029 * Kick recovery, maybe this spare has to be added to the 3030 * array immediately. 3031 */ 3032 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3033 md_wakeup_thread(mddev->thread); 3034 md_new_event(mddev); 3035 return 0; 3036 3037 abort_unbind_export: 3038 unbind_rdev_from_array(rdev); 3039 3040 abort_export: 3041 export_rdev(rdev); 3042 return err; 3043 } 3044 3045 /* similar to deny_write_access, but accounts for our holding a reference 3046 * to the file ourselves */ 3047 static int deny_bitmap_write_access(struct file * file) 3048 { 3049 struct inode *inode = file->f_mapping->host; 3050 3051 spin_lock(&inode->i_lock); 3052 if (atomic_read(&inode->i_writecount) > 1) { 3053 spin_unlock(&inode->i_lock); 3054 return -ETXTBSY; 3055 } 3056 atomic_set(&inode->i_writecount, -1); 3057 spin_unlock(&inode->i_lock); 3058 3059 return 0; 3060 } 3061 3062 static int set_bitmap_file(mddev_t *mddev, int fd) 3063 { 3064 int err; 3065 3066 if (mddev->pers) { 3067 if (!mddev->pers->quiesce) 3068 return -EBUSY; 3069 if (mddev->recovery || mddev->sync_thread) 3070 return -EBUSY; 3071 /* we should be able to change the bitmap.. 
*/ 3072 } 3073 3074 3075 if (fd >= 0) { 3076 if (mddev->bitmap) 3077 return -EEXIST; /* cannot add when bitmap is present */ 3078 mddev->bitmap_file = fget(fd); 3079 3080 if (mddev->bitmap_file == NULL) { 3081 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3082 mdname(mddev)); 3083 return -EBADF; 3084 } 3085 3086 err = deny_bitmap_write_access(mddev->bitmap_file); 3087 if (err) { 3088 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3089 mdname(mddev)); 3090 fput(mddev->bitmap_file); 3091 mddev->bitmap_file = NULL; 3092 return err; 3093 } 3094 mddev->bitmap_offset = 0; /* file overrides offset */ 3095 } else if (mddev->bitmap == NULL) 3096 return -ENOENT; /* cannot remove what isn't there */ 3097 err = 0; 3098 if (mddev->pers) { 3099 mddev->pers->quiesce(mddev, 1); 3100 if (fd >= 0) 3101 err = bitmap_create(mddev); 3102 if (fd < 0 || err) 3103 bitmap_destroy(mddev); 3104 mddev->pers->quiesce(mddev, 0); 3105 } else if (fd < 0) { 3106 if (mddev->bitmap_file) 3107 fput(mddev->bitmap_file); 3108 mddev->bitmap_file = NULL; 3109 } 3110 3111 return err; 3112 } 3113 3114 /* 3115 * set_array_info is used two different ways 3116 * The original usage is when creating a new array. 3117 * In this usage, raid_disks is > 0 and it together with 3118 * level, size, not_persistent,layout,chunksize determine the 3119 * shape of the array. 3120 * This will always create an array with a type-0.90.0 superblock. 3121 * The newer usage is when assembling an array. 3122 * In this case raid_disks will be 0, and the major_version field is 3123 * use to determine which style super-blocks are to be found on the devices. 3124 * The minor and patch _version numbers are also kept incase the 3125 * super_block handler wishes to interpret them. 
3126 */ 3127 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3128 { 3129 3130 if (info->raid_disks == 0) { 3131 /* just setting version number for superblock loading */ 3132 if (info->major_version < 0 || 3133 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3134 super_types[info->major_version].name == NULL) { 3135 /* maybe try to auto-load a module? */ 3136 printk(KERN_INFO 3137 "md: superblock version %d not known\n", 3138 info->major_version); 3139 return -EINVAL; 3140 } 3141 mddev->major_version = info->major_version; 3142 mddev->minor_version = info->minor_version; 3143 mddev->patch_version = info->patch_version; 3144 return 0; 3145 } 3146 mddev->major_version = MD_MAJOR_VERSION; 3147 mddev->minor_version = MD_MINOR_VERSION; 3148 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3149 mddev->ctime = get_seconds(); 3150 3151 mddev->level = info->level; 3152 mddev->size = info->size; 3153 mddev->raid_disks = info->raid_disks; 3154 /* don't set md_minor, it is determined by which /dev/md* was 3155 * openned 3156 */ 3157 if (info->state & (1<<MD_SB_CLEAN)) 3158 mddev->recovery_cp = MaxSector; 3159 else 3160 mddev->recovery_cp = 0; 3161 mddev->persistent = ! info->not_persistent; 3162 3163 mddev->layout = info->layout; 3164 mddev->chunk_size = info->chunk_size; 3165 3166 mddev->max_disks = MD_SB_DISKS; 3167 3168 mddev->sb_dirty = 1; 3169 3170 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 3171 mddev->bitmap_offset = 0; 3172 3173 /* 3174 * Generate a 128 bit UUID 3175 */ 3176 get_random_bytes(mddev->uuid, 16); 3177 3178 return 0; 3179 } 3180 3181 static int update_size(mddev_t *mddev, unsigned long size) 3182 { 3183 mdk_rdev_t * rdev; 3184 int rv; 3185 struct list_head *tmp; 3186 3187 if (mddev->pers->resize == NULL) 3188 return -EINVAL; 3189 /* The "size" is the amount of each device that is used. 3190 * This can only make sense for arrays with redundancy. 
3191 * linear and raid0 always use whatever space is available 3192 * We can only consider changing the size if no resync 3193 * or reconstruction is happening, and if the new size 3194 * is acceptable. It must fit before the sb_offset or, 3195 * if that is <data_offset, it must fit before the 3196 * size of each device. 3197 * If size is zero, we find the largest size that fits. 3198 */ 3199 if (mddev->sync_thread) 3200 return -EBUSY; 3201 ITERATE_RDEV(mddev,rdev,tmp) { 3202 sector_t avail; 3203 int fit = (size == 0); 3204 if (rdev->sb_offset > rdev->data_offset) 3205 avail = (rdev->sb_offset*2) - rdev->data_offset; 3206 else 3207 avail = get_capacity(rdev->bdev->bd_disk) 3208 - rdev->data_offset; 3209 if (fit && (size == 0 || size > avail/2)) 3210 size = avail/2; 3211 if (avail < ((sector_t)size << 1)) 3212 return -ENOSPC; 3213 } 3214 rv = mddev->pers->resize(mddev, (sector_t)size *2); 3215 if (!rv) { 3216 struct block_device *bdev; 3217 3218 bdev = bdget_disk(mddev->gendisk, 0); 3219 if (bdev) { 3220 down(&bdev->bd_inode->i_sem); 3221 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3222 up(&bdev->bd_inode->i_sem); 3223 bdput(bdev); 3224 } 3225 } 3226 return rv; 3227 } 3228 3229 static int update_raid_disks(mddev_t *mddev, int raid_disks) 3230 { 3231 int rv; 3232 /* change the number of raid disks */ 3233 if (mddev->pers->reshape == NULL) 3234 return -EINVAL; 3235 if (raid_disks <= 0 || 3236 raid_disks >= mddev->max_disks) 3237 return -EINVAL; 3238 if (mddev->sync_thread) 3239 return -EBUSY; 3240 rv = mddev->pers->reshape(mddev, raid_disks); 3241 if (!rv) { 3242 struct block_device *bdev; 3243 3244 bdev = bdget_disk(mddev->gendisk, 0); 3245 if (bdev) { 3246 down(&bdev->bd_inode->i_sem); 3247 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3248 up(&bdev->bd_inode->i_sem); 3249 bdput(bdev); 3250 } 3251 } 3252 return rv; 3253 } 3254 3255 3256 /* 3257 * update_array_info is used to change the configuration of an 3258 * on-line array. 
3259 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 3260 * fields in the info are checked against the array. 3261 * Any differences that cannot be handled will cause an error. 3262 * Normally, only one change can be managed at a time. 3263 */ 3264 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 3265 { 3266 int rv = 0; 3267 int cnt = 0; 3268 int state = 0; 3269 3270 /* calculate expected state,ignoring low bits */ 3271 if (mddev->bitmap && mddev->bitmap_offset) 3272 state |= (1 << MD_SB_BITMAP_PRESENT); 3273 3274 if (mddev->major_version != info->major_version || 3275 mddev->minor_version != info->minor_version || 3276 /* mddev->patch_version != info->patch_version || */ 3277 mddev->ctime != info->ctime || 3278 mddev->level != info->level || 3279 /* mddev->layout != info->layout || */ 3280 !mddev->persistent != info->not_persistent|| 3281 mddev->chunk_size != info->chunk_size || 3282 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 3283 ((state^info->state) & 0xfffffe00) 3284 ) 3285 return -EINVAL; 3286 /* Check there is only one change */ 3287 if (mddev->size != info->size) cnt++; 3288 if (mddev->raid_disks != info->raid_disks) cnt++; 3289 if (mddev->layout != info->layout) cnt++; 3290 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 3291 if (cnt == 0) return 0; 3292 if (cnt > 1) return -EINVAL; 3293 3294 if (mddev->layout != info->layout) { 3295 /* Change layout 3296 * we don't need to do anything at the md level, the 3297 * personality will take care of it all. 
3298 */ 3299 if (mddev->pers->reconfig == NULL) 3300 return -EINVAL; 3301 else 3302 return mddev->pers->reconfig(mddev, info->layout, -1); 3303 } 3304 if (mddev->size != info->size) 3305 rv = update_size(mddev, info->size); 3306 3307 if (mddev->raid_disks != info->raid_disks) 3308 rv = update_raid_disks(mddev, info->raid_disks); 3309 3310 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3311 if (mddev->pers->quiesce == NULL) 3312 return -EINVAL; 3313 if (mddev->recovery || mddev->sync_thread) 3314 return -EBUSY; 3315 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 3316 /* add the bitmap */ 3317 if (mddev->bitmap) 3318 return -EEXIST; 3319 if (mddev->default_bitmap_offset == 0) 3320 return -EINVAL; 3321 mddev->bitmap_offset = mddev->default_bitmap_offset; 3322 mddev->pers->quiesce(mddev, 1); 3323 rv = bitmap_create(mddev); 3324 if (rv) 3325 bitmap_destroy(mddev); 3326 mddev->pers->quiesce(mddev, 0); 3327 } else { 3328 /* remove the bitmap */ 3329 if (!mddev->bitmap) 3330 return -ENOENT; 3331 if (mddev->bitmap->file) 3332 return -EINVAL; 3333 mddev->pers->quiesce(mddev, 1); 3334 bitmap_destroy(mddev); 3335 mddev->pers->quiesce(mddev, 0); 3336 mddev->bitmap_offset = 0; 3337 } 3338 } 3339 md_update_sb(mddev); 3340 return rv; 3341 } 3342 3343 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3344 { 3345 mdk_rdev_t *rdev; 3346 3347 if (mddev->pers == NULL) 3348 return -ENODEV; 3349 3350 rdev = find_rdev(mddev, dev); 3351 if (!rdev) 3352 return -ENODEV; 3353 3354 md_error(mddev, rdev); 3355 return 0; 3356 } 3357 3358 static int md_ioctl(struct inode *inode, struct file *file, 3359 unsigned int cmd, unsigned long arg) 3360 { 3361 int err = 0; 3362 void __user *argp = (void __user *)arg; 3363 struct hd_geometry __user *loc = argp; 3364 mddev_t *mddev = NULL; 3365 3366 if (!capable(CAP_SYS_ADMIN)) 3367 return -EACCES; 3368 3369 /* 3370 * Commands dealing with the RAID driver but not any 3371 * particular array: 3372 */ 3373 switch (cmd) 3374 { 3375 case 
RAID_VERSION: 3376 err = get_version(argp); 3377 goto done; 3378 3379 case PRINT_RAID_DEBUG: 3380 err = 0; 3381 md_print_devices(); 3382 goto done; 3383 3384 #ifndef MODULE 3385 case RAID_AUTORUN: 3386 err = 0; 3387 autostart_arrays(arg); 3388 goto done; 3389 #endif 3390 default:; 3391 } 3392 3393 /* 3394 * Commands creating/starting a new array: 3395 */ 3396 3397 mddev = inode->i_bdev->bd_disk->private_data; 3398 3399 if (!mddev) { 3400 BUG(); 3401 goto abort; 3402 } 3403 3404 3405 if (cmd == START_ARRAY) { 3406 /* START_ARRAY doesn't need to lock the array as autostart_array 3407 * does the locking, and it could even be a different array 3408 */ 3409 static int cnt = 3; 3410 if (cnt > 0 ) { 3411 printk(KERN_WARNING 3412 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3413 "This will not be supported beyond July 2006\n", 3414 current->comm, current->pid); 3415 cnt--; 3416 } 3417 err = autostart_array(new_decode_dev(arg)); 3418 if (err) { 3419 printk(KERN_WARNING "md: autostart failed!\n"); 3420 goto abort; 3421 } 3422 goto done; 3423 } 3424 3425 err = mddev_lock(mddev); 3426 if (err) { 3427 printk(KERN_INFO 3428 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3429 err, cmd); 3430 goto abort; 3431 } 3432 3433 switch (cmd) 3434 { 3435 case SET_ARRAY_INFO: 3436 { 3437 mdu_array_info_t info; 3438 if (!arg) 3439 memset(&info, 0, sizeof(info)); 3440 else if (copy_from_user(&info, argp, sizeof(info))) { 3441 err = -EFAULT; 3442 goto abort_unlock; 3443 } 3444 if (mddev->pers) { 3445 err = update_array_info(mddev, &info); 3446 if (err) { 3447 printk(KERN_WARNING "md: couldn't update" 3448 " array info. 
%d\n", err); 3449 goto abort_unlock; 3450 } 3451 goto done_unlock; 3452 } 3453 if (!list_empty(&mddev->disks)) { 3454 printk(KERN_WARNING 3455 "md: array %s already has disks!\n", 3456 mdname(mddev)); 3457 err = -EBUSY; 3458 goto abort_unlock; 3459 } 3460 if (mddev->raid_disks) { 3461 printk(KERN_WARNING 3462 "md: array %s already initialised!\n", 3463 mdname(mddev)); 3464 err = -EBUSY; 3465 goto abort_unlock; 3466 } 3467 err = set_array_info(mddev, &info); 3468 if (err) { 3469 printk(KERN_WARNING "md: couldn't set" 3470 " array info. %d\n", err); 3471 goto abort_unlock; 3472 } 3473 } 3474 goto done_unlock; 3475 3476 default:; 3477 } 3478 3479 /* 3480 * Commands querying/configuring an existing array: 3481 */ 3482 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3483 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3484 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3485 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3486 err = -ENODEV; 3487 goto abort_unlock; 3488 } 3489 3490 /* 3491 * Commands even a read-only array can execute: 3492 */ 3493 switch (cmd) 3494 { 3495 case GET_ARRAY_INFO: 3496 err = get_array_info(mddev, argp); 3497 goto done_unlock; 3498 3499 case GET_BITMAP_FILE: 3500 err = get_bitmap_file(mddev, argp); 3501 goto done_unlock; 3502 3503 case GET_DISK_INFO: 3504 err = get_disk_info(mddev, argp); 3505 goto done_unlock; 3506 3507 case RESTART_ARRAY_RW: 3508 err = restart_array(mddev); 3509 goto done_unlock; 3510 3511 case STOP_ARRAY: 3512 err = do_md_stop (mddev, 0); 3513 goto done_unlock; 3514 3515 case STOP_ARRAY_RO: 3516 err = do_md_stop (mddev, 1); 3517 goto done_unlock; 3518 3519 /* 3520 * We have a problem here : there is no easy way to give a CHS 3521 * virtual geometry. We currently pretend that we have a 2 heads 3522 * 4 sectors (with a BIG number of cylinders...). This drives 3523 * dosfs just mad... 
;-) 3524 */ 3525 case HDIO_GETGEO: 3526 if (!loc) { 3527 err = -EINVAL; 3528 goto abort_unlock; 3529 } 3530 err = put_user (2, (char __user *) &loc->heads); 3531 if (err) 3532 goto abort_unlock; 3533 err = put_user (4, (char __user *) &loc->sectors); 3534 if (err) 3535 goto abort_unlock; 3536 err = put_user(get_capacity(mddev->gendisk)/8, 3537 (short __user *) &loc->cylinders); 3538 if (err) 3539 goto abort_unlock; 3540 err = put_user (get_start_sect(inode->i_bdev), 3541 (long __user *) &loc->start); 3542 goto done_unlock; 3543 } 3544 3545 /* 3546 * The remaining ioctls are changing the state of the 3547 * superblock, so we do not allow them on read-only arrays. 3548 * However non-MD ioctls (e.g. get-size) will still come through 3549 * here and hit the 'default' below, so only disallow 3550 * 'md' ioctls, and switch to rw mode if started auto-readonly. 3551 */ 3552 if (_IOC_TYPE(cmd) == MD_MAJOR && 3553 mddev->ro && mddev->pers) { 3554 if (mddev->ro == 2) { 3555 mddev->ro = 0; 3556 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3557 md_wakeup_thread(mddev->thread); 3558 3559 } else { 3560 err = -EROFS; 3561 goto abort_unlock; 3562 } 3563 } 3564 3565 switch (cmd) 3566 { 3567 case ADD_NEW_DISK: 3568 { 3569 mdu_disk_info_t info; 3570 if (copy_from_user(&info, argp, sizeof(info))) 3571 err = -EFAULT; 3572 else 3573 err = add_new_disk(mddev, &info); 3574 goto done_unlock; 3575 } 3576 3577 case HOT_REMOVE_DISK: 3578 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3579 goto done_unlock; 3580 3581 case HOT_ADD_DISK: 3582 err = hot_add_disk(mddev, new_decode_dev(arg)); 3583 goto done_unlock; 3584 3585 case SET_DISK_FAULTY: 3586 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3587 goto done_unlock; 3588 3589 case RUN_ARRAY: 3590 err = do_md_run (mddev); 3591 goto done_unlock; 3592 3593 case SET_BITMAP_FILE: 3594 err = set_bitmap_file(mddev, (int)arg); 3595 goto done_unlock; 3596 3597 default: 3598 if (_IOC_TYPE(cmd) == MD_MAJOR) 3599 printk(KERN_WARNING "md: %s(pid 
%d) used" 3600 " obsolete MD ioctl, upgrade your" 3601 " software to use new ictls.\n", 3602 current->comm, current->pid); 3603 err = -EINVAL; 3604 goto abort_unlock; 3605 } 3606 3607 done_unlock: 3608 abort_unlock: 3609 mddev_unlock(mddev); 3610 3611 return err; 3612 done: 3613 if (err) 3614 MD_BUG(); 3615 abort: 3616 return err; 3617 } 3618 3619 static int md_open(struct inode *inode, struct file *file) 3620 { 3621 /* 3622 * Succeed if we can lock the mddev, which confirms that 3623 * it isn't being stopped right now. 3624 */ 3625 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3626 int err; 3627 3628 if ((err = mddev_lock(mddev))) 3629 goto out; 3630 3631 err = 0; 3632 mddev_get(mddev); 3633 mddev_unlock(mddev); 3634 3635 check_disk_change(inode->i_bdev); 3636 out: 3637 return err; 3638 } 3639 3640 static int md_release(struct inode *inode, struct file * file) 3641 { 3642 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3643 3644 if (!mddev) 3645 BUG(); 3646 mddev_put(mddev); 3647 3648 return 0; 3649 } 3650 3651 static int md_media_changed(struct gendisk *disk) 3652 { 3653 mddev_t *mddev = disk->private_data; 3654 3655 return mddev->changed; 3656 } 3657 3658 static int md_revalidate(struct gendisk *disk) 3659 { 3660 mddev_t *mddev = disk->private_data; 3661 3662 mddev->changed = 0; 3663 return 0; 3664 } 3665 static struct block_device_operations md_fops = 3666 { 3667 .owner = THIS_MODULE, 3668 .open = md_open, 3669 .release = md_release, 3670 .ioctl = md_ioctl, 3671 .media_changed = md_media_changed, 3672 .revalidate_disk= md_revalidate, 3673 }; 3674 3675 static int md_thread(void * arg) 3676 { 3677 mdk_thread_t *thread = arg; 3678 3679 /* 3680 * md_thread is a 'system-thread', it's priority should be very 3681 * high. We avoid resource deadlocks individually in each 3682 * raid personality. (RAID5 does preallocation) We also use RR and 3683 * the very same RT priority as kswapd, thus we will never get 3684 * into a priority inversion deadlock. 
3685 * 3686 * we definitely have to have equal or higher priority than 3687 * bdflush, otherwise bdflush will deadlock if there are too 3688 * many dirty RAID5 blocks. 3689 */ 3690 3691 allow_signal(SIGKILL); 3692 while (!kthread_should_stop()) { 3693 3694 /* We need to wait INTERRUPTIBLE so that 3695 * we don't add to the load-average. 3696 * That means we need to be sure no signals are 3697 * pending 3698 */ 3699 if (signal_pending(current)) 3700 flush_signals(current); 3701 3702 wait_event_interruptible_timeout 3703 (thread->wqueue, 3704 test_bit(THREAD_WAKEUP, &thread->flags) 3705 || kthread_should_stop(), 3706 thread->timeout); 3707 try_to_freeze(); 3708 3709 clear_bit(THREAD_WAKEUP, &thread->flags); 3710 3711 thread->run(thread->mddev); 3712 } 3713 3714 return 0; 3715 } 3716 3717 void md_wakeup_thread(mdk_thread_t *thread) 3718 { 3719 if (thread) { 3720 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3721 set_bit(THREAD_WAKEUP, &thread->flags); 3722 wake_up(&thread->wqueue); 3723 } 3724 } 3725 3726 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3727 const char *name) 3728 { 3729 mdk_thread_t *thread; 3730 3731 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3732 if (!thread) 3733 return NULL; 3734 3735 init_waitqueue_head(&thread->wqueue); 3736 3737 thread->run = run; 3738 thread->mddev = mddev; 3739 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3740 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3741 if (IS_ERR(thread->tsk)) { 3742 kfree(thread); 3743 return NULL; 3744 } 3745 return thread; 3746 } 3747 3748 void md_unregister_thread(mdk_thread_t *thread) 3749 { 3750 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3751 3752 kthread_stop(thread->tsk); 3753 kfree(thread); 3754 } 3755 3756 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3757 { 3758 if (!mddev) { 3759 MD_BUG(); 3760 return; 3761 } 3762 3763 if (!rdev || test_bit(Faulty, &rdev->flags)) 3764 return; 3765 /* 3766 
dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3767 mdname(mddev), 3768 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3769 __builtin_return_address(0),__builtin_return_address(1), 3770 __builtin_return_address(2),__builtin_return_address(3)); 3771 */ 3772 if (!mddev->pers->error_handler) 3773 return; 3774 mddev->pers->error_handler(mddev,rdev); 3775 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3776 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3777 md_wakeup_thread(mddev->thread); 3778 md_new_event(mddev); 3779 } 3780 3781 /* seq_file implementation /proc/mdstat */ 3782 3783 static void status_unused(struct seq_file *seq) 3784 { 3785 int i = 0; 3786 mdk_rdev_t *rdev; 3787 struct list_head *tmp; 3788 3789 seq_printf(seq, "unused devices: "); 3790 3791 ITERATE_RDEV_PENDING(rdev,tmp) { 3792 char b[BDEVNAME_SIZE]; 3793 i++; 3794 seq_printf(seq, "%s ", 3795 bdevname(rdev->bdev,b)); 3796 } 3797 if (!i) 3798 seq_printf(seq, "<none>"); 3799 3800 seq_printf(seq, "\n"); 3801 } 3802 3803 3804 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3805 { 3806 unsigned long max_blocks, resync, res, dt, db, rt; 3807 3808 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3809 3810 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3811 max_blocks = mddev->resync_max_sectors >> 1; 3812 else 3813 max_blocks = mddev->size; 3814 3815 /* 3816 * Should not happen. 3817 */ 3818 if (!max_blocks) { 3819 MD_BUG(); 3820 return; 3821 } 3822 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3823 { 3824 int i, x = res/50, y = 20-x; 3825 seq_printf(seq, "["); 3826 for (i = 0; i < x; i++) 3827 seq_printf(seq, "="); 3828 seq_printf(seq, ">"); 3829 for (i = 0; i < y; i++) 3830 seq_printf(seq, "."); 3831 seq_printf(seq, "] "); 3832 } 3833 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3834 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
3835 "resync" : "recovery"), 3836 res/10, res % 10, resync, max_blocks); 3837 3838 /* 3839 * We do not want to overflow, so the order of operands and 3840 * the * 100 / 100 trick are important. We do a +1 to be 3841 * safe against division by zero. We only estimate anyway. 3842 * 3843 * dt: time from mark until now 3844 * db: blocks written from mark until now 3845 * rt: remaining time 3846 */ 3847 dt = ((jiffies - mddev->resync_mark) / HZ); 3848 if (!dt) dt++; 3849 db = resync - (mddev->resync_mark_cnt/2); 3850 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3851 3852 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3853 3854 seq_printf(seq, " speed=%ldK/sec", db/dt); 3855 } 3856 3857 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3858 { 3859 struct list_head *tmp; 3860 loff_t l = *pos; 3861 mddev_t *mddev; 3862 3863 if (l >= 0x10000) 3864 return NULL; 3865 if (!l--) 3866 /* header */ 3867 return (void*)1; 3868 3869 spin_lock(&all_mddevs_lock); 3870 list_for_each(tmp,&all_mddevs) 3871 if (!l--) { 3872 mddev = list_entry(tmp, mddev_t, all_mddevs); 3873 mddev_get(mddev); 3874 spin_unlock(&all_mddevs_lock); 3875 return mddev; 3876 } 3877 spin_unlock(&all_mddevs_lock); 3878 if (!l--) 3879 return (void*)2;/* tail */ 3880 return NULL; 3881 } 3882 3883 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3884 { 3885 struct list_head *tmp; 3886 mddev_t *next_mddev, *mddev = v; 3887 3888 ++*pos; 3889 if (v == (void*)2) 3890 return NULL; 3891 3892 spin_lock(&all_mddevs_lock); 3893 if (v == (void*)1) 3894 tmp = all_mddevs.next; 3895 else 3896 tmp = mddev->all_mddevs.next; 3897 if (tmp != &all_mddevs) 3898 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3899 else { 3900 next_mddev = (void*)2; 3901 *pos = 0x10000; 3902 } 3903 spin_unlock(&all_mddevs_lock); 3904 3905 if (v != (void*)1) 3906 mddev_put(mddev); 3907 return next_mddev; 3908 3909 } 3910 3911 static void md_seq_stop(struct seq_file *seq, void *v) 3912 { 3913 
mddev_t *mddev = v; 3914 3915 if (mddev && v != (void*)1 && v != (void*)2) 3916 mddev_put(mddev); 3917 } 3918 3919 struct mdstat_info { 3920 int event; 3921 }; 3922 3923 static int md_seq_show(struct seq_file *seq, void *v) 3924 { 3925 mddev_t *mddev = v; 3926 sector_t size; 3927 struct list_head *tmp2; 3928 mdk_rdev_t *rdev; 3929 struct mdstat_info *mi = seq->private; 3930 struct bitmap *bitmap; 3931 3932 if (v == (void*)1) { 3933 struct mdk_personality *pers; 3934 seq_printf(seq, "Personalities : "); 3935 spin_lock(&pers_lock); 3936 list_for_each_entry(pers, &pers_list, list) 3937 seq_printf(seq, "[%s] ", pers->name); 3938 3939 spin_unlock(&pers_lock); 3940 seq_printf(seq, "\n"); 3941 mi->event = atomic_read(&md_event_count); 3942 return 0; 3943 } 3944 if (v == (void*)2) { 3945 status_unused(seq); 3946 return 0; 3947 } 3948 3949 if (mddev_lock(mddev)!=0) 3950 return -EINTR; 3951 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3952 seq_printf(seq, "%s : %sactive", mdname(mddev), 3953 mddev->pers ? 
"" : "in"); 3954 if (mddev->pers) { 3955 if (mddev->ro==1) 3956 seq_printf(seq, " (read-only)"); 3957 if (mddev->ro==2) 3958 seq_printf(seq, "(auto-read-only)"); 3959 seq_printf(seq, " %s", mddev->pers->name); 3960 } 3961 3962 size = 0; 3963 ITERATE_RDEV(mddev,rdev,tmp2) { 3964 char b[BDEVNAME_SIZE]; 3965 seq_printf(seq, " %s[%d]", 3966 bdevname(rdev->bdev,b), rdev->desc_nr); 3967 if (test_bit(WriteMostly, &rdev->flags)) 3968 seq_printf(seq, "(W)"); 3969 if (test_bit(Faulty, &rdev->flags)) { 3970 seq_printf(seq, "(F)"); 3971 continue; 3972 } else if (rdev->raid_disk < 0) 3973 seq_printf(seq, "(S)"); /* spare */ 3974 size += rdev->size; 3975 } 3976 3977 if (!list_empty(&mddev->disks)) { 3978 if (mddev->pers) 3979 seq_printf(seq, "\n %llu blocks", 3980 (unsigned long long)mddev->array_size); 3981 else 3982 seq_printf(seq, "\n %llu blocks", 3983 (unsigned long long)size); 3984 } 3985 if (mddev->persistent) { 3986 if (mddev->major_version != 0 || 3987 mddev->minor_version != 90) { 3988 seq_printf(seq," super %d.%d", 3989 mddev->major_version, 3990 mddev->minor_version); 3991 } 3992 } else 3993 seq_printf(seq, " super non-persistent"); 3994 3995 if (mddev->pers) { 3996 mddev->pers->status (seq, mddev); 3997 seq_printf(seq, "\n "); 3998 if (mddev->pers->sync_request) { 3999 if (mddev->curr_resync > 2) { 4000 status_resync (seq, mddev); 4001 seq_printf(seq, "\n "); 4002 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4003 seq_printf(seq, "\tresync=DELAYED\n "); 4004 else if (mddev->recovery_cp < MaxSector) 4005 seq_printf(seq, "\tresync=PENDING\n "); 4006 } 4007 } else 4008 seq_printf(seq, "\n "); 4009 4010 if ((bitmap = mddev->bitmap)) { 4011 unsigned long chunk_kb; 4012 unsigned long flags; 4013 spin_lock_irqsave(&bitmap->lock, flags); 4014 chunk_kb = bitmap->chunksize >> 10; 4015 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4016 "%lu%s chunk", 4017 bitmap->pages - bitmap->missing_pages, 4018 bitmap->pages, 4019 (bitmap->pages - 
bitmap->missing_pages) 4020 << (PAGE_SHIFT - 10), 4021 chunk_kb ? chunk_kb : bitmap->chunksize, 4022 chunk_kb ? "KB" : "B"); 4023 if (bitmap->file) { 4024 seq_printf(seq, ", file: "); 4025 seq_path(seq, bitmap->file->f_vfsmnt, 4026 bitmap->file->f_dentry," \t\n"); 4027 } 4028 4029 seq_printf(seq, "\n"); 4030 spin_unlock_irqrestore(&bitmap->lock, flags); 4031 } 4032 4033 seq_printf(seq, "\n"); 4034 } 4035 mddev_unlock(mddev); 4036 4037 return 0; 4038 } 4039 4040 static struct seq_operations md_seq_ops = { 4041 .start = md_seq_start, 4042 .next = md_seq_next, 4043 .stop = md_seq_stop, 4044 .show = md_seq_show, 4045 }; 4046 4047 static int md_seq_open(struct inode *inode, struct file *file) 4048 { 4049 int error; 4050 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4051 if (mi == NULL) 4052 return -ENOMEM; 4053 4054 error = seq_open(file, &md_seq_ops); 4055 if (error) 4056 kfree(mi); 4057 else { 4058 struct seq_file *p = file->private_data; 4059 p->private = mi; 4060 mi->event = atomic_read(&md_event_count); 4061 } 4062 return error; 4063 } 4064 4065 static int md_seq_release(struct inode *inode, struct file *file) 4066 { 4067 struct seq_file *m = file->private_data; 4068 struct mdstat_info *mi = m->private; 4069 m->private = NULL; 4070 kfree(mi); 4071 return seq_release(inode, file); 4072 } 4073 4074 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4075 { 4076 struct seq_file *m = filp->private_data; 4077 struct mdstat_info *mi = m->private; 4078 int mask; 4079 4080 poll_wait(filp, &md_event_waiters, wait); 4081 4082 /* always allow read */ 4083 mask = POLLIN | POLLRDNORM; 4084 4085 if (mi->event != atomic_read(&md_event_count)) 4086 mask |= POLLERR | POLLPRI; 4087 return mask; 4088 } 4089 4090 static struct file_operations md_seq_fops = { 4091 .open = md_seq_open, 4092 .read = seq_read, 4093 .llseek = seq_lseek, 4094 .release = md_seq_release, 4095 .poll = mdstat_poll, 4096 }; 4097 4098 int register_md_personality(struct 
mdk_personality *p) 4099 { 4100 spin_lock(&pers_lock); 4101 list_add_tail(&p->list, &pers_list); 4102 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 4103 spin_unlock(&pers_lock); 4104 return 0; 4105 } 4106 4107 int unregister_md_personality(struct mdk_personality *p) 4108 { 4109 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 4110 spin_lock(&pers_lock); 4111 list_del_init(&p->list); 4112 spin_unlock(&pers_lock); 4113 return 0; 4114 } 4115 4116 static int is_mddev_idle(mddev_t *mddev) 4117 { 4118 mdk_rdev_t * rdev; 4119 struct list_head *tmp; 4120 int idle; 4121 unsigned long curr_events; 4122 4123 idle = 1; 4124 ITERATE_RDEV(mddev,rdev,tmp) { 4125 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 4126 curr_events = disk_stat_read(disk, sectors[0]) + 4127 disk_stat_read(disk, sectors[1]) - 4128 atomic_read(&disk->sync_io); 4129 /* The difference between curr_events and last_events 4130 * will be affected by any new non-sync IO (making 4131 * curr_events bigger) and any difference in the amount of 4132 * in-flight syncio (making current_events bigger or smaller) 4133 * The amount in-flight is currently limited to 4134 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 4135 * which is at most 4096 sectors. 4136 * These numbers are fairly fragile and should be made 4137 * more robust, probably by enforcing the 4138 * 'window size' that md_do_sync sort-of uses. 4139 * 4140 * Note: the following is an unsigned comparison. 
4141 */ 4142 if ((curr_events - rdev->last_events + 4096) > 8192) { 4143 rdev->last_events = curr_events; 4144 idle = 0; 4145 } 4146 } 4147 return idle; 4148 } 4149 4150 void md_done_sync(mddev_t *mddev, int blocks, int ok) 4151 { 4152 /* another "blocks" (512byte) blocks have been synced */ 4153 atomic_sub(blocks, &mddev->recovery_active); 4154 wake_up(&mddev->recovery_wait); 4155 if (!ok) { 4156 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4157 md_wakeup_thread(mddev->thread); 4158 // stop recovery, signal do_sync .... 4159 } 4160 } 4161 4162 4163 /* md_write_start(mddev, bi) 4164 * If we need to update some array metadata (e.g. 'active' flag 4165 * in superblock) before writing, schedule a superblock update 4166 * and wait for it to complete. 4167 */ 4168 void md_write_start(mddev_t *mddev, struct bio *bi) 4169 { 4170 if (bio_data_dir(bi) != WRITE) 4171 return; 4172 4173 BUG_ON(mddev->ro == 1); 4174 if (mddev->ro == 2) { 4175 /* need to switch to read/write */ 4176 mddev->ro = 0; 4177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4178 md_wakeup_thread(mddev->thread); 4179 } 4180 atomic_inc(&mddev->writes_pending); 4181 if (mddev->in_sync) { 4182 spin_lock_irq(&mddev->write_lock); 4183 if (mddev->in_sync) { 4184 mddev->in_sync = 0; 4185 mddev->sb_dirty = 1; 4186 md_wakeup_thread(mddev->thread); 4187 } 4188 spin_unlock_irq(&mddev->write_lock); 4189 } 4190 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 4191 } 4192 4193 void md_write_end(mddev_t *mddev) 4194 { 4195 if (atomic_dec_and_test(&mddev->writes_pending)) { 4196 if (mddev->safemode == 2) 4197 md_wakeup_thread(mddev->thread); 4198 else 4199 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 4200 } 4201 } 4202 4203 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 4204 4205 #define SYNC_MARKS 10 4206 #define SYNC_MARK_STEP (3*HZ) 4207 static void md_do_sync(mddev_t *mddev) 4208 { 4209 mddev_t *mddev2; 4210 unsigned int currspeed = 0, 4211 window; 4212 sector_t max_sectors,j, io_sectors; 4213 
unsigned long mark[SYNC_MARKS]; 4214 sector_t mark_cnt[SYNC_MARKS]; 4215 int last_mark,m; 4216 struct list_head *tmp; 4217 sector_t last_check; 4218 int skipped = 0; 4219 4220 /* just incase thread restarts... */ 4221 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 4222 return; 4223 4224 /* we overload curr_resync somewhat here. 4225 * 0 == not engaged in resync at all 4226 * 2 == checking that there is no conflict with another sync 4227 * 1 == like 2, but have yielded to allow conflicting resync to 4228 * commense 4229 * other == active in resync - this many blocks 4230 * 4231 * Before starting a resync we must have set curr_resync to 4232 * 2, and then checked that every "conflicting" array has curr_resync 4233 * less than ours. When we find one that is the same or higher 4234 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 4235 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 4236 * This will mean we have to start checking from the beginning again. 
4237 * 4238 */ 4239 4240 do { 4241 mddev->curr_resync = 2; 4242 4243 try_again: 4244 if (kthread_should_stop()) { 4245 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4246 goto skip; 4247 } 4248 ITERATE_MDDEV(mddev2,tmp) { 4249 if (mddev2 == mddev) 4250 continue; 4251 if (mddev2->curr_resync && 4252 match_mddev_units(mddev,mddev2)) { 4253 DEFINE_WAIT(wq); 4254 if (mddev < mddev2 && mddev->curr_resync == 2) { 4255 /* arbitrarily yield */ 4256 mddev->curr_resync = 1; 4257 wake_up(&resync_wait); 4258 } 4259 if (mddev > mddev2 && mddev->curr_resync == 1) 4260 /* no need to wait here, we can wait the next 4261 * time 'round when curr_resync == 2 4262 */ 4263 continue; 4264 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 4265 if (!kthread_should_stop() && 4266 mddev2->curr_resync >= mddev->curr_resync) { 4267 printk(KERN_INFO "md: delaying resync of %s" 4268 " until %s has finished resync (they" 4269 " share one or more physical units)\n", 4270 mdname(mddev), mdname(mddev2)); 4271 mddev_put(mddev2); 4272 schedule(); 4273 finish_wait(&resync_wait, &wq); 4274 goto try_again; 4275 } 4276 finish_wait(&resync_wait, &wq); 4277 } 4278 } 4279 } while (mddev->curr_resync < 2); 4280 4281 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4282 /* resync follows the size requested by the personality, 4283 * which defaults to physical size, but can be virtual size 4284 */ 4285 max_sectors = mddev->resync_max_sectors; 4286 mddev->resync_mismatches = 0; 4287 } else 4288 /* recovery follows the physical size of devices */ 4289 max_sectors = mddev->size << 1; 4290 4291 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 4292 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 4293 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 4294 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 4295 "(but not more than %d KB/sec) for reconstruction.\n", 4296 sysctl_speed_limit_max); 4297 4298 is_mddev_idle(mddev); /* this also initializes IO event 
counters */ 4299 /* we don't use the checkpoint if there's a bitmap */ 4300 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap 4301 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 4302 j = mddev->recovery_cp; 4303 else 4304 j = 0; 4305 io_sectors = 0; 4306 for (m = 0; m < SYNC_MARKS; m++) { 4307 mark[m] = jiffies; 4308 mark_cnt[m] = io_sectors; 4309 } 4310 last_mark = 0; 4311 mddev->resync_mark = mark[last_mark]; 4312 mddev->resync_mark_cnt = mark_cnt[last_mark]; 4313 4314 /* 4315 * Tune reconstruction: 4316 */ 4317 window = 32*(PAGE_SIZE/512); 4318 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 4319 window/2,(unsigned long long) max_sectors/2); 4320 4321 atomic_set(&mddev->recovery_active, 0); 4322 init_waitqueue_head(&mddev->recovery_wait); 4323 last_check = 0; 4324 4325 if (j>2) { 4326 printk(KERN_INFO 4327 "md: resuming recovery of %s from checkpoint.\n", 4328 mdname(mddev)); 4329 mddev->curr_resync = j; 4330 } 4331 4332 while (j < max_sectors) { 4333 sector_t sectors; 4334 4335 skipped = 0; 4336 sectors = mddev->pers->sync_request(mddev, j, &skipped, 4337 currspeed < sysctl_speed_limit_min); 4338 if (sectors == 0) { 4339 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4340 goto out; 4341 } 4342 4343 if (!skipped) { /* actual IO requested */ 4344 io_sectors += sectors; 4345 atomic_add(sectors, &mddev->recovery_active); 4346 } 4347 4348 j += sectors; 4349 if (j>1) mddev->curr_resync = j; 4350 if (last_check == 0) 4351 /* this is the earliers that rebuilt will be 4352 * visible in /proc/mdstat 4353 */ 4354 md_new_event(mddev); 4355 4356 if (last_check + window > io_sectors || j == max_sectors) 4357 continue; 4358 4359 last_check = io_sectors; 4360 4361 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 4362 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 4363 break; 4364 4365 repeat: 4366 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 4367 /* step marks */ 4368 int next = (last_mark+1) % SYNC_MARKS; 
4369 4370 mddev->resync_mark = mark[next]; 4371 mddev->resync_mark_cnt = mark_cnt[next]; 4372 mark[next] = jiffies; 4373 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 4374 last_mark = next; 4375 } 4376 4377 4378 if (kthread_should_stop()) { 4379 /* 4380 * got a signal, exit. 4381 */ 4382 printk(KERN_INFO 4383 "md: md_do_sync() got signal ... exiting\n"); 4384 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4385 goto out; 4386 } 4387 4388 /* 4389 * this loop exits only if either when we are slower than 4390 * the 'hard' speed limit, or the system was IO-idle for 4391 * a jiffy. 4392 * the system might be non-idle CPU-wise, but we only care 4393 * about not overloading the IO subsystem. (things like an 4394 * e2fsck being done on the RAID array should execute fast) 4395 */ 4396 mddev->queue->unplug_fn(mddev->queue); 4397 cond_resched(); 4398 4399 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 4400 /((jiffies-mddev->resync_mark)/HZ +1) +1; 4401 4402 if (currspeed > sysctl_speed_limit_min) { 4403 if ((currspeed > sysctl_speed_limit_max) || 4404 !is_mddev_idle(mddev)) { 4405 msleep(500); 4406 goto repeat; 4407 } 4408 } 4409 } 4410 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 4411 /* 4412 * this also signals 'finished resyncing' to md_stop 4413 */ 4414 out: 4415 mddev->queue->unplug_fn(mddev->queue); 4416 4417 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 4418 4419 /* tell personality that we are finished */ 4420 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 4421 4422 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4423 mddev->curr_resync > 2 && 4424 mddev->curr_resync >= mddev->recovery_cp) { 4425 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4426 printk(KERN_INFO 4427 "md: checkpointing recovery of %s.\n", 4428 mdname(mddev)); 4429 mddev->recovery_cp = mddev->curr_resync; 4430 } else 4431 mddev->recovery_cp = MaxSector; 4432 } 4433 4434 skip: 4435 mddev->curr_resync = 
0; 4436 wake_up(&resync_wait); 4437 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 4438 md_wakeup_thread(mddev->thread); 4439 } 4440 4441 4442 /* 4443 * This routine is regularly called by all per-raid-array threads to 4444 * deal with generic issues like resync and super-block update. 4445 * Raid personalities that don't have a thread (linear/raid0) do not 4446 * need this as they never do any recovery or update the superblock. 4447 * 4448 * It does not do any resync itself, but rather "forks" off other threads 4449 * to do that as needed. 4450 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 4451 * "->recovery" and create a thread at ->sync_thread. 4452 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 4453 * and wakeups up this thread which will reap the thread and finish up. 4454 * This thread also removes any faulty devices (with nr_pending == 0). 4455 * 4456 * The overall approach is: 4457 * 1/ if the superblock needs updating, update it. 4458 * 2/ If a recovery thread is running, don't do anything else. 4459 * 3/ If recovery has finished, clean up, possibly marking spares active. 4460 * 4/ If there are any faulty devices, remove them. 4461 * 5/ If array is degraded, try to add spares devices 4462 * 6/ If array has spares or is not in-sync, start a resync thread. 4463 */ 4464 void md_check_recovery(mddev_t *mddev) 4465 { 4466 mdk_rdev_t *rdev; 4467 struct list_head *rtmp; 4468 4469 4470 if (mddev->bitmap) 4471 bitmap_daemon_work(mddev->bitmap); 4472 4473 if (mddev->ro) 4474 return; 4475 4476 if (signal_pending(current)) { 4477 if (mddev->pers->sync_request) { 4478 printk(KERN_INFO "md: %s in immediate safe mode\n", 4479 mdname(mddev)); 4480 mddev->safemode = 2; 4481 } 4482 flush_signals(current); 4483 } 4484 4485 if ( ! 
( 4486 mddev->sb_dirty || 4487 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 4488 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 4489 (mddev->safemode == 1) || 4490 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 4491 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 4492 )) 4493 return; 4494 4495 if (mddev_trylock(mddev)==0) { 4496 int spares =0; 4497 4498 spin_lock_irq(&mddev->write_lock); 4499 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 4500 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 4501 mddev->in_sync = 1; 4502 mddev->sb_dirty = 1; 4503 } 4504 if (mddev->safemode == 1) 4505 mddev->safemode = 0; 4506 spin_unlock_irq(&mddev->write_lock); 4507 4508 if (mddev->sb_dirty) 4509 md_update_sb(mddev); 4510 4511 4512 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4513 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 4514 /* resync/recovery still happening */ 4515 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4516 goto unlock; 4517 } 4518 if (mddev->sync_thread) { 4519 /* resync has finished, collect result */ 4520 md_unregister_thread(mddev->sync_thread); 4521 mddev->sync_thread = NULL; 4522 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4523 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4524 /* success...*/ 4525 /* activate any spares */ 4526 mddev->pers->spare_active(mddev); 4527 } 4528 md_update_sb(mddev); 4529 4530 /* if array is no-longer degraded, then any saved_raid_disk 4531 * information must be scrapped 4532 */ 4533 if (!mddev->degraded) 4534 ITERATE_RDEV(mddev,rdev,rtmp) 4535 rdev->saved_raid_disk = -1; 4536 4537 mddev->recovery = 0; 4538 /* flag recovery needed just to double check */ 4539 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4540 md_new_event(mddev); 4541 goto unlock; 4542 } 4543 /* Clear some bits that don't mean anything, but 4544 * might be left set 4545 */ 4546 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4547 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 4548 
clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 4549 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4550 4551 /* no recovery is running. 4552 * remove any failed drives, then 4553 * add spares if possible. 4554 * Spare are also removed and re-added, to allow 4555 * the personality to fail the re-add. 4556 */ 4557 ITERATE_RDEV(mddev,rdev,rtmp) 4558 if (rdev->raid_disk >= 0 && 4559 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 4560 atomic_read(&rdev->nr_pending)==0) { 4561 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 4562 char nm[20]; 4563 sprintf(nm,"rd%d", rdev->raid_disk); 4564 sysfs_remove_link(&mddev->kobj, nm); 4565 rdev->raid_disk = -1; 4566 } 4567 } 4568 4569 if (mddev->degraded) { 4570 ITERATE_RDEV(mddev,rdev,rtmp) 4571 if (rdev->raid_disk < 0 4572 && !test_bit(Faulty, &rdev->flags)) { 4573 if (mddev->pers->hot_add_disk(mddev,rdev)) { 4574 char nm[20]; 4575 sprintf(nm, "rd%d", rdev->raid_disk); 4576 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 4577 spares++; 4578 md_new_event(mddev); 4579 } else 4580 break; 4581 } 4582 } 4583 4584 if (spares) { 4585 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4586 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4587 } else if (mddev->recovery_cp < MaxSector) { 4588 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4589 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4590 /* nothing to be done ... */ 4591 goto unlock; 4592 4593 if (mddev->pers->sync_request) { 4594 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4595 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 4596 /* We are adding a device or devices to an array 4597 * which has the bitmap stored on all devices. 
4598 * So make sure all bitmap pages get written 4599 */ 4600 bitmap_write_all(mddev->bitmap); 4601 } 4602 mddev->sync_thread = md_register_thread(md_do_sync, 4603 mddev, 4604 "%s_resync"); 4605 if (!mddev->sync_thread) { 4606 printk(KERN_ERR "%s: could not start resync" 4607 " thread...\n", 4608 mdname(mddev)); 4609 /* leave the spares where they are, it shouldn't hurt */ 4610 mddev->recovery = 0; 4611 } else 4612 md_wakeup_thread(mddev->sync_thread); 4613 md_new_event(mddev); 4614 } 4615 unlock: 4616 mddev_unlock(mddev); 4617 } 4618 } 4619 4620 static int md_notify_reboot(struct notifier_block *this, 4621 unsigned long code, void *x) 4622 { 4623 struct list_head *tmp; 4624 mddev_t *mddev; 4625 4626 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 4627 4628 printk(KERN_INFO "md: stopping all md devices.\n"); 4629 4630 ITERATE_MDDEV(mddev,tmp) 4631 if (mddev_trylock(mddev)==0) 4632 do_md_stop (mddev, 1); 4633 /* 4634 * certain more exotic SCSI devices are known to be 4635 * volatile wrt too early system reboots. While the 4636 * right place to handle this issue is the given 4637 * driver, we do want to have a safe RAID driver ... 
4638 */ 4639 mdelay(1000*1); 4640 } 4641 return NOTIFY_DONE; 4642 } 4643 4644 static struct notifier_block md_notifier = { 4645 .notifier_call = md_notify_reboot, 4646 .next = NULL, 4647 .priority = INT_MAX, /* before any real devices */ 4648 }; 4649 4650 static void md_geninit(void) 4651 { 4652 struct proc_dir_entry *p; 4653 4654 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 4655 4656 p = create_proc_entry("mdstat", S_IRUGO, NULL); 4657 if (p) 4658 p->proc_fops = &md_seq_fops; 4659 } 4660 4661 static int __init md_init(void) 4662 { 4663 int minor; 4664 4665 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 4666 " MD_SB_DISKS=%d\n", 4667 MD_MAJOR_VERSION, MD_MINOR_VERSION, 4668 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 4669 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 4670 BITMAP_MINOR); 4671 4672 if (register_blkdev(MAJOR_NR, "md")) 4673 return -1; 4674 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 4675 unregister_blkdev(MAJOR_NR, "md"); 4676 return -1; 4677 } 4678 devfs_mk_dir("md"); 4679 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 4680 md_probe, NULL, NULL); 4681 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 4682 md_probe, NULL, NULL); 4683 4684 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4685 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 4686 S_IFBLK|S_IRUSR|S_IWUSR, 4687 "md/%d", minor); 4688 4689 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4690 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 4691 S_IFBLK|S_IRUSR|S_IWUSR, 4692 "md/mdp%d", minor); 4693 4694 4695 register_reboot_notifier(&md_notifier); 4696 raid_table_header = register_sysctl_table(raid_root_table, 1); 4697 4698 md_geninit(); 4699 return (0); 4700 } 4701 4702 4703 #ifndef MODULE 4704 4705 /* 4706 * Searches all registered partitions for autorun RAID arrays 4707 * at boot time. 
4708 */ 4709 static dev_t detected_devices[128]; 4710 static int dev_cnt; 4711 4712 void md_autodetect_dev(dev_t dev) 4713 { 4714 if (dev_cnt >= 0 && dev_cnt < 127) 4715 detected_devices[dev_cnt++] = dev; 4716 } 4717 4718 4719 static void autostart_arrays(int part) 4720 { 4721 mdk_rdev_t *rdev; 4722 int i; 4723 4724 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4725 4726 for (i = 0; i < dev_cnt; i++) { 4727 dev_t dev = detected_devices[i]; 4728 4729 rdev = md_import_device(dev,0, 0); 4730 if (IS_ERR(rdev)) 4731 continue; 4732 4733 if (test_bit(Faulty, &rdev->flags)) { 4734 MD_BUG(); 4735 continue; 4736 } 4737 list_add(&rdev->same_set, &pending_raid_disks); 4738 } 4739 dev_cnt = 0; 4740 4741 autorun_devices(part); 4742 } 4743 4744 #endif 4745 4746 static __exit void md_exit(void) 4747 { 4748 mddev_t *mddev; 4749 struct list_head *tmp; 4750 int i; 4751 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4752 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4753 for (i=0; i < MAX_MD_DEVS; i++) 4754 devfs_remove("md/%d", i); 4755 for (i=0; i < MAX_MD_DEVS; i++) 4756 devfs_remove("md/d%d", i); 4757 4758 devfs_remove("md"); 4759 4760 unregister_blkdev(MAJOR_NR,"md"); 4761 unregister_blkdev(mdp_major, "mdp"); 4762 unregister_reboot_notifier(&md_notifier); 4763 unregister_sysctl_table(raid_table_header); 4764 remove_proc_entry("mdstat", NULL); 4765 ITERATE_MDDEV(mddev,tmp) { 4766 struct gendisk *disk = mddev->gendisk; 4767 if (!disk) 4768 continue; 4769 export_array(mddev); 4770 del_gendisk(disk); 4771 put_disk(disk); 4772 mddev->gendisk = NULL; 4773 mddev_put(mddev); 4774 } 4775 } 4776 4777 module_init(md_init) 4778 module_exit(md_exit) 4779 4780 static int get_ro(char *buffer, struct kernel_param *kp) 4781 { 4782 return sprintf(buffer, "%d", start_readonly); 4783 } 4784 static int set_ro(const char *val, struct kernel_param *kp) 4785 { 4786 char *e; 4787 int num = simple_strtoul(val, &e, 10); 4788 if (*val && (*e == '\0' || *e == '\n')) 
{ 4789 start_readonly = num; 4790 return 0; 4791 } 4792 return -EINVAL; 4793 } 4794 4795 module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 4796 module_param(start_dirty_degraded, int, 0644); 4797 4798 4799 EXPORT_SYMBOL(register_md_personality); 4800 EXPORT_SYMBOL(unregister_md_personality); 4801 EXPORT_SYMBOL(md_error); 4802 EXPORT_SYMBOL(md_done_sync); 4803 EXPORT_SYMBOL(md_write_start); 4804 EXPORT_SYMBOL(md_write_end); 4805 EXPORT_SYMBOL(md_register_thread); 4806 EXPORT_SYMBOL(md_unregister_thread); 4807 EXPORT_SYMBOL(md_wakeup_thread); 4808 EXPORT_SYMBOL(md_print_devices); 4809 EXPORT_SYMBOL(md_check_recovery); 4810 MODULE_LICENSE("GPL"); 4811 MODULE_ALIAS("md"); 4812 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4813