/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>
#include <linux/poll.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
static void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )


static int md_fail_request (request_queue_t *q, struct bio *bio)
{
        bio_io_error(bio, bio->bi_size);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                blk_put_queue(mddev->queue);
                kobject_unregister(&mddev->kobj);
        }
        spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        init_MUTEX(&new->reconfig_sem);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
        return down_interruptible(&mddev->reconfig_sem);
}

static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
        down(&mddev->reconfig_sem);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return down_trylock(&mddev->reconfig_sem);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        up(&mddev->reconfig_sem);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

static struct mdk_personality *find_pers(int level)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list)
                if (pers->level == level)
                        return pers;
        return NULL;
}

static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}
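
/*
 * Editorial worked example (assuming the usual MD_NEW_SIZE_BLOCKS
 * definition from md_p.h, (x & ~(MD_RESERVED_BLOCKS - 1)) -
 * MD_RESERVED_BLOCKS with MD_RESERVED_BLOCKS = 64 one-K blocks):
 * for a 10000 KB device,
 *
 *      size = 10000;           (1K blocks)
 *      10000 & ~63 = 9984;     (round down to a 64K multiple)
 *      9984 - 64   = 9920;     (back off one reserved 64K block)
 *
 * so calc_dev_sboffset() puts the 0.90 superblock at offset 9920 KB,
 * i.e. in the last complete 64K-aligned 64K chunk of the device.
 */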

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}


static int super_written(struct bio *bio, unsigned int bytes_done, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
        if (bio->bi_size)
                return 1;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
                md_error(mddev, rdev);

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
        return 0;
}

static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;
        if (bio->bi_size)
                return 1;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
                return 0;
        }
        bio_put(bio2);
        bio->bi_private = rdev;
        return super_written(bio, bytes_done, error);
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}
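
/*
 * Editorial sketch, not part of the driver: a minimal caller of the
 * pair above/below. md_update_sb() further down is the real user; this
 * only illustrates the contract - md_super_write() raises
 * pending_writes per submitted write, and md_super_wait() blocks until
 * they complete, resubmitting any barrier write that failed with
 * -EOPNOTSUPP as a plain (sync) write.
 */
#if 0
static void example_write_one_sb(mddev_t *mddev, mdk_rdev_t *rdev)
{
        md_super_write(mddev, rdev, rdev->sb_offset << 1,
                       rdev->sb_size, rdev->sb_page);
        md_super_wait(mddev);   /* returns once the write (or retry) is done */
}
#endif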
void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}

static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
        if (bio->bi_size)
                return 1;

        complete((struct completion*)bio->bi_private);
        return 0;
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                 struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
                (sb1->set_uuid1 == sb2->set_uuid1) &&
                (sb1->set_uuid2 == sb2->set_uuid2) &&
                (sb1->set_uuid3 == sb2->set_uuid3))

                return 1;

        return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        kfree(tmp1);
        kfree(tmp2);
        return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
        sb->sb_csum = disk_csum;
        return csum;
}
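
/*
 * Editorial sketch, not part of the driver: how read_disk_sb() above
 * uses sync_page_io() - a synchronous one-page transfer that returns
 * 1 on success and 0 on failure. The helper name is illustrative.
 */
#if 0
static int example_read_first_4k(struct block_device *bdev, struct page *page)
{
        /* read 4096 bytes starting at sector 0 into 'page' */
        return sync_page_io(bdev, 0, 4096, page, READ) ? 0 : -EIO;
}
#endif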


/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *      The first time, mddev->raid_disks will be 0, and data from
 *      dev should be merged in. Subsequent calls check that dev
 *      is new enough. Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type {
        char                *name;
        struct module       *owner;
        int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version != 90) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

 abort:
        return ret;
}
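
/*
 * Editorial note on the return convention above: the 0.90 event counter
 * decides freshness. If, say, sb->events is 42 on this device and 40 on
 * the current refdev, ev1 > ev2 and super_90_load() returns 1, so
 * analyze_sbs() below promotes this device to be the new reference
 * superblock.
 */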
/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->persistent = ! sb->not_persistent;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = md_event(sb);
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                            sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL) {
                        if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
                            && mddev->level != 10) {
                                /* FIXME use a better test */
                                printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = mddev->default_bitmap_offset;
                }

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                __u64 ev1 = md_event(sb);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
                else if (desc->state & (1<<MD_DISK_SYNC) &&
                         desc->raid_disk < mddev->raid_disks) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
        return 0;
}
/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;


        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        rdev->sb_size = MD_SB_BYTES;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->minor_version = mddev->minor_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = !mddev->persistent;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        desc_nr = rdev2->raid_disk;
                else
                        desc_nr = next_spare++;
                rdev2->desc_nr = desc_nr;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (test_bit(Faulty, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_FAULTY);
                        failed++;
                } else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
                if (test_bit(WriteMostly, &rdev2->flags))
                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
        }
        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}
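
/*
 * Editorial note on calc_sb_1_csum() below: the superblock is summed as
 * little-endian 32-bit words into a 64-bit accumulator (plus one
 * trailing 16-bit word when max_dev is odd), and the carry half is
 * folded back in once. E.g. newcsum = 0x123456789 gives
 *
 *      csum = 0x23456789 + 0x1 = 0x2345678a.
 */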
/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        unsigned int disk_csum, csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        unsigned int *isuper = (unsigned int*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(unsigned short*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        int bmask;

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        /* superblock is rarely larger than 1K, but it can be larger,
         * and it is safe to read 4k, so we do that
         */
        ret = read_disk_sb(rdev, 4096);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);

        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
        if (rdev->sb_size & bmask)
                rdev->sb_size = (rdev->sb_size | bmask)+1;

        if (refdev == 0)
                return 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        return 1;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
        return 0;
}
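
/*
 * Editorial worked example for the minor_version 0 case above: a
 * device of 20480 sectors (10 MB) gives
 *
 *      sb_offset = 20480 - 16 = 20464; (step 8K back from the end)
 *      20464 & ~7 = 20464;             (round down to a 4K boundary)
 *      20464 / 2  = 10232;             (sectors -> K)
 *
 * so the superblock sits at 10232 KB, 8K before the end of the device.
 */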
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = le64_to_cpu(sb->events);
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024;

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks = (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                    mddev->bitmap_file == NULL ) {
                        if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
                            && mddev->level != 10) {
                                printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
                                return -EINVAL;
                        }
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                }
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                __u64 ev1 = le64_to_cpu(sb->events);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else /* just a hot-add of a new device, leave raid_disk at -1 */
                return 0;

        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                rdev->desc_nr = le32_to_cpu(sb->dev_number);
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        break;
                case 0xfffe: /* faulty */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);

        return 0;
}
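
/*
 * Editorial note on the role table used above and in super_1_sync()
 * below: sb->dev_roles[] holds one little-endian 16-bit entry per
 * device slot -
 *
 *      0xffff  spare (no role yet)
 *      0xfffe  faulty
 *      other   active slot number in the array
 */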
static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }

        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        ITERATE_RDEV(mddev,rdev2,tmp) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
        sb->sb_csum = calc_sb_1_csum(sb);
}


static struct super_type super_types[] = {
        [0] = {
                .name           = "0.90.0",
                .owner          = THIS_MODULE,
                .load_super     = super_90_load,
                .validate_super = super_90_validate,
                .sync_super     = super_90_sync,
        },
        [1] = {
                .name           = "md-1",
                .owner          = THIS_MODULE,
                .load_super     = super_1_load,
                .validate_super = super_1_validate,
                .sync_super     = super_1_sync,
        },
};

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp)
                if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
                        return rdev;

        return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev1,rdev,tmp)
                if (match_dev_unit(mddev2, rdev))
                        return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);
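
/*
 * Editorial note: two rdevs that are partitions of one physical disk
 * (e.g. /dev/sda1 and /dev/sda2) share bd_contains, so match_dev_unit()
 * reports them as the same unit and bind_rdev_to_array() below warns
 * that protection against single-disk failure may be compromised.
 */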
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        mdk_rdev_t *same_pdev;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        struct kobject *ko;

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }
        same_pdev = match_dev_unit(mddev, rdev);
        if (same_pdev)
                printk(KERN_WARNING
                        "%s: WARNING: %s appears to be on the same physical"
                        " disk as %s. True protection against single-disk"
                        " failure might be compromised.\n",
                        mdname(mddev), bdevname(rdev->bdev,b),
                        bdevname(same_pdev->bdev,b2));

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
        if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
                return -ENOMEM;

        list_add(&rdev->same_set, &mddev->disks);
        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);

        rdev->kobj.parent = &mddev->kobj;
        kobject_add(&rdev->kobj);

        if (rdev->bdev->bd_part)
                ko = &rdev->bdev->bd_part->kobj;
        else
                ko = &rdev->bdev->bd_disk->kobj;
        sysfs_create_link(&rdev->kobj, ko, "block");
        return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->mddev) {
                MD_BUG();
                return;
        }
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
        sysfs_remove_link(&rdev->kobj, "block");
        kobject_del(&rdev->kobj);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
        int err = 0;
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];

        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
        err = bd_claim(bdev, rdev);
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
                blkdev_put(bdev);
                return err;
        }
        rdev->bdev = bdev;
        return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
        struct block_device *bdev = rdev->bdev;
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
        blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: export_rdev(%s)\n",
                bdevname(rdev->bdev,b));
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
        list_del_init(&rdev->same_set);
#ifndef MODULE
        md_autodetect_dev(rdev->bdev->bd_dev);
#endif
        unlock_rdev(rdev);
        kobject_put(&rdev->kobj);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
                }
                kick_rdev_from_array(rdev);
        }
        if (!list_empty(&mddev->disks))
                MD_BUG();
        mddev->raid_disks = 0;
        mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}
static void print_sb(mdp_super_t *sb)
{
        int i;

        printk(KERN_INFO
                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
                sb->major_version, sb->minor_version, sb->patch_version,
                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
                sb->ctime);
        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
                sb->md_minor, sb->layout, sb->chunk_size);
        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
                sb->utime, sb->state, sb->active_disks, sb->working_disks,
                sb->failed_disks, sb->spare_disks,
                sb->sb_csum, (unsigned long)sb->events_lo);

        printk(KERN_INFO);
        for (i = 0; i < MD_SB_DISKS; i++) {
                mdp_disk_t *desc;

                desc = sb->disks + i;
                if (desc->number || desc->major || desc->minor ||
                    desc->raid_disk || (desc->state && (desc->state != 4))) {
                        printk("     D %2d: ", i);
                        print_desc(desc);
                }
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
                printk(KERN_INFO "md: rdev superblock:\n");
                print_sb((mdp_super_t*)page_address(rdev->sb_page));
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];

        printk("\n");
        printk("md:     **********************************\n");
        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
        printk("md:     **********************************\n");
        ITERATE_MDDEV(mddev,tmp) {

                if (mddev->bitmap)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
                ITERATE_RDEV(mddev,rdev,tmp2)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");

                ITERATE_RDEV(mddev,rdev,tmp2)
                        print_rdev(rdev);
        }
        printk("md:     **********************************\n");
        printk("\n");
}


static void sync_sbs(mddev_t * mddev)
{
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                super_types[mddev->major_version].
                        sync_super(mddev, rdev);
                rdev->sb_loaded = 1;
        }
}
static void md_update_sb(mddev_t * mddev)
{
        int err;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;

repeat:
        spin_lock_irq(&mddev->write_lock);
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
        mddev->events ++;

        if (!mddev->events) {
                /*
                 * oops, this 64-bit counter should never wrap.
                 * Either we are in around ~1 trillion A.C., assuming
                 * 1 reboot per second, or we have a bug:
                 */
                MD_BUG();
                mddev->events --;
        }
        mddev->sb_dirty = 2;
        sync_sbs(mddev);

        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock_irq(&mddev->write_lock);

        dprintk(KERN_INFO
                "md: updating %s RAID superblock on device (in sync %d)\n",
                mdname(mddev),mddev->in_sync);

        err = bitmap_update_sb(mddev->bitmap);
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
                if (test_bit(Faulty, &rdev->flags))
                        dprintk("(skipping faulty ");

                dprintk("%s ", bdevname(rdev->bdev,b));
                if (!test_bit(Faulty, &rdev->flags)) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_offset<<1, rdev->sb_size,
                                       rdev->sb_page);
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);

                } else
                        dprintk(")\n");
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
                        break;
        }
        md_super_wait(mddev);
        /* if there was a failure, sb_dirty was set to 1, and we re-write super */

        spin_lock_irq(&mddev->write_lock);
        if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
                /* have to write it out again */
                spin_unlock_irq(&mddev->write_lock);
                goto repeat;
        }
        mddev->sb_dirty = 0;
        spin_unlock_irq(&mddev->write_lock);
        wake_up(&mddev->sb_wait);

}
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
        /* See if cmd, written into a sysfs file, matches
         * str.  They must either be the same, or cmd can
         * have a trailing newline
         */
        while (*cmd && *str && *cmd == *str) {
                cmd++;
                str++;
        }
        if (*cmd == '\n')
                cmd++;
        if (*str || *cmd)
                return 0;
        return 1;
}

struct rdev_sysfs_entry {
        struct attribute attr;
        ssize_t (*show)(mdk_rdev_t *, char *);
        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
};

static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
        char *sep = "";
        int len=0;

        if (test_bit(Faulty, &rdev->flags)) {
                len+= sprintf(page+len, "%sfaulty",sep);
                sep = ",";
        }
        if (test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
                sep = ",";
        }
        return len+sprintf(page+len, "\n");
}

static struct rdev_sysfs_entry
rdev_state = __ATTR_RO(state);

static ssize_t
super_show(mdk_rdev_t *rdev, char *page)
{
        if (rdev->sb_loaded && rdev->sb_size) {
                memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
                return rdev->sb_size;
        } else
                return 0;
}
static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);

static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
        &rdev_super.attr,
        NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);

        if (!entry->show)
                return -EIO;
        return entry->show(rdev, page);
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
                const char *page, size_t length)
{
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);

        if (!entry->store)
                return -EIO;
        return entry->store(rdev, page, length);
}

static void rdev_free(struct kobject *ko)
{
        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
        kfree(rdev);
}
static struct sysfs_ops rdev_sysfs_ops = {
        .show           = rdev_attr_show,
        .store          = rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
        .release        = rdev_free,
        .sysfs_ops      = &rdev_sysfs_ops,
        .default_attrs  = rdev_default_attrs,
};
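
/*
 * Editorial usage note (the path assumes the kobject layout set up in
 * md_probe() and bind_rdev_to_array(): the mddev kobject is named "md"
 * under the gendisk, and each rdev is named "dev-<bdevname>"):
 *
 *      $ cat /sys/block/md0/md/dev-sda1/state
 *      in_sync
 *
 * state_show() above emits a comma-separated subset of "faulty",
 * "in_sync" and "spare".
 */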
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{
        char b[BDEVNAME_SIZE];
        int err;
        mdk_rdev_t *rdev;
        sector_t size;

        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
        if (!rdev) {
                printk(KERN_ERR "md: could not alloc mem for new device!\n");
                return ERR_PTR(-ENOMEM);
        }

        if ((err = alloc_disk_sb(rdev)))
                goto abort_free;

        err = lock_rdev(rdev, newdev);
        if (err)
                goto abort_free;

        rdev->kobj.parent = NULL;
        rdev->kobj.ktype = &rdev_ktype;
        kobject_init(&rdev->kobj);

        rdev->desc_nr = -1;
        rdev->flags = 0;
        rdev->data_offset = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);

        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        if (!size) {
                printk(KERN_WARNING
                        "md: %s has zero or unknown size, marking faulty!\n",
                        bdevname(rdev->bdev,b));
                err = -EINVAL;
                goto abort_free;
        }

        if (super_format >= 0) {
                err = super_types[super_format].
                        load_super(rdev, NULL, super_minor);
                if (err == -EINVAL) {
                        printk(KERN_WARNING
                                "md: %s has invalid sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
                if (err < 0) {
                        printk(KERN_WARNING
                                "md: could not read %s's sb, not importing!\n",
                                bdevname(rdev->bdev,b));
                        goto abort_free;
                }
        }
        INIT_LIST_HEAD(&rdev->same_set);

        return rdev;

abort_free:
        if (rdev->sb_page) {
                if (rdev->bdev)
                        unlock_rdev(rdev);
                free_disk_sb(rdev);
        }
        kfree(rdev);
        return ERR_PTR(err);
}
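
/*
 * Editorial sketch, not part of the driver: importing one device by
 * dev_t and releasing it again. MKDEV(8, 1) (/dev/sda1) and the 0.90
 * format/minor arguments are illustrative; the autorun path and the
 * ADD_NEW_DISK ioctl are the real callers.
 */
#if 0
static void example_import(void)
{
        mdk_rdev_t *rdev = md_import_device(MKDEV(8, 1), 0, 0);
        if (!IS_ERR(rdev))
                export_rdev(rdev);      /* free the sb page, drop the bd_claim */
}
#endif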
/*
 * Check a full RAID array for plausibility
 */


static void analyze_sbs(mddev_t * mddev)
{
        int i;
        struct list_head *tmp;
        mdk_rdev_t *rdev, *freshest;
        char b[BDEVNAME_SIZE];

        freshest = NULL;
        ITERATE_RDEV(mddev,rdev,tmp)
                switch (super_types[mddev->major_version].
                        load_super(rdev, freshest, mddev->minor_version)) {
                case 1:
                        freshest = rdev;
                        break;
                case 0:
                        break;
                default:
                        printk(KERN_ERR
                                "md: fatal superblock inconsistency in %s"
                                " -- removing from array\n",
                                bdevname(rdev->bdev,b));
                        kick_rdev_from_array(rdev);
                }


        super_types[mddev->major_version].
                validate_super(mddev, freshest);

        i = 0;
        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev != freshest)
                        if (super_types[mddev->major_version].
                            validate_super(mddev, rdev)) {
                                printk(KERN_WARNING "md: kicking non-fresh %s"
                                        " from array!\n",
                                        bdevname(rdev->bdev,b));
                                kick_rdev_from_array(rdev);
                                continue;
                        }
                if (mddev->level == LEVEL_MULTIPATH) {
                        rdev->desc_nr = i++;
                        rdev->raid_disk = rdev->desc_nr;
                        set_bit(In_sync, &rdev->flags);
                }
        }



        if (mddev->recovery_cp != MaxSector &&
            mddev->level >= 1)
                printk(KERN_ERR "md: %s: raid array is not clean"
                        " -- starting background reconstruction\n",
                        mdname(mddev));

}

static ssize_t
level_show(mddev_t *mddev, char *page)
{
        struct mdk_personality *p = mddev->pers;
        if (p == NULL && mddev->raid_disks == 0)
                return 0;
        if (mddev->level >= 0)
                return sprintf(page, "raid%d\n", mddev->level);
        else
                return sprintf(page, "%s\n", p->name);
}

static struct md_sysfs_entry md_level = __ATTR_RO(level);

static ssize_t
raid_disks_show(mddev_t *mddev, char *page)
{
        if (mddev->raid_disks == 0)
                return 0;
        return sprintf(page, "%d\n", mddev->raid_disks);
}

static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);

static ssize_t
chunk_size_show(mddev_t *mddev, char *page)
{
        return sprintf(page, "%d\n", mddev->chunk_size);
}

static ssize_t
chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
{
        /* can only set chunk_size if array is not yet active */
        char *e;
        unsigned long n = simple_strtoul(buf, &e, 10);

        if (mddev->pers)
                return -EBUSY;
        if (!*buf || (*e && *e != '\n'))
                return -EINVAL;

        mddev->chunk_size = n;
        return len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);


static ssize_t
action_show(mddev_t *mddev, char *page)
{
        char *type = "idle";
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
            test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                                type = "resync";
                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
                                type = "check";
                        else
                                type = "repair";
                } else
                        type = "recover";
        }
        return sprintf(page, "%s\n", type);
}

static ssize_t
action_store(mddev_t *mddev, const char *page, size_t len)
{
        if (!mddev->pers || !mddev->pers->sync_request)
                return -EINVAL;

        if (cmd_match(page, "idle")) {
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        md_unregister_thread(mddev->sync_thread);
                        mddev->sync_thread = NULL;
                        mddev->recovery = 0;
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
                return -EBUSY;
        else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        else {
                if (cmd_match(page, "check"))
                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                else if (!cmd_match(page, "repair"))
                        return -EINVAL;
                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        }
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        return len;
}
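
/*
 * Editorial usage note for sync_action above (md0 is illustrative):
 *
 *      $ echo check > /sys/block/md0/md/sync_action    # read-only scrub
 *      $ cat /sys/block/md0/md/sync_action
 *      check
 *
 * "check" only counts mismatches; "repair" also rewrites them. Both run
 * as MD_RECOVERY_SYNC with MD_RECOVERY_REQUESTED set.
 */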
static ssize_t
mismatch_cnt_show(mddev_t *mddev, char *page)
{
        return sprintf(page, "%llu\n",
                       (unsigned long long) mddev->resync_mismatches);
}

static struct md_sysfs_entry
md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);


static struct md_sysfs_entry
md_mismatches = __ATTR_RO(mismatch_cnt);

static struct attribute *md_default_attrs[] = {
        &md_level.attr,
        &md_raid_disks.attr,
        &md_chunk_size.attr,
        NULL,
};

static struct attribute *md_redundancy_attrs[] = {
        &md_scan_mode.attr,
        &md_mismatches.attr,
        NULL,
};
static struct attribute_group md_redundancy_group = {
        .name = NULL,
        .attrs = md_redundancy_attrs,
};


static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
        ssize_t rv;

        if (!entry->show)
                return -EIO;
        mddev_lock(mddev);
        rv = entry->show(mddev, page);
        mddev_unlock(mddev);
        return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
              const char *page, size_t length)
{
        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
        ssize_t rv;

        if (!entry->store)
                return -EIO;
        mddev_lock(mddev);
        rv = entry->store(mddev, page, length);
        mddev_unlock(mddev);
        return rv;
}

static void md_free(struct kobject *ko)
{
        mddev_t *mddev = container_of(ko, mddev_t, kobj);
        kfree(mddev);
}

static struct sysfs_ops md_sysfs_ops = {
        .show   = md_attr_show,
        .store  = md_attr_store,
};
static struct kobj_type md_ktype = {
        .release        = md_free,
        .sysfs_ops      = &md_sysfs_ops,
        .default_attrs  = md_default_attrs,
};

int mdp_major = 0;
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
        static DECLARE_MUTEX(disks_sem);
        mddev_t *mddev = mddev_find(dev);
        struct gendisk *disk;
        int partitioned = (MAJOR(dev) != MD_MAJOR);
        int shift = partitioned ? MdpMinorShift : 0;
        int unit = MINOR(dev) >> shift;

        if (!mddev)
                return NULL;

        down(&disks_sem);
        if (mddev->gendisk) {
                up(&disks_sem);
                mddev_put(mddev);
                return NULL;
        }
        disk = alloc_disk(1 << shift);
        if (!disk) {
                up(&disks_sem);
                mddev_put(mddev);
                return NULL;
        }
        disk->major = MAJOR(dev);
        disk->first_minor = unit << shift;
        if (partitioned) {
                sprintf(disk->disk_name, "md_d%d", unit);
                sprintf(disk->devfs_name, "md/d%d", unit);
        } else {
                sprintf(disk->disk_name, "md%d", unit);
                sprintf(disk->devfs_name, "md/%d", unit);
        }
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
        add_disk(disk);
        mddev->gendisk = disk;
        up(&disks_sem);
        mddev->kobj.parent = &disk->kobj;
        mddev->kobj.k_name = NULL;
        snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
        mddev->kobj.ktype = &md_ktype;
        kobject_register(&mddev->kobj);
        return NULL;
}

void md_wakeup_thread(mdk_thread_t *thread);

static void md_safemode_timeout(unsigned long data)
{
        mddev_t *mddev = (mddev_t *) data;

        mddev->safemode = 1;
        md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;

static int do_md_run(mddev_t * mddev)
{
        int err;
        int chunk_size;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        struct gendisk *disk;
        struct mdk_personality *pers;
        char b[BDEVNAME_SIZE];

        if (list_empty(&mddev->disks))
                /* cannot run an array with no devices.. */
                return -EINVAL;

        if (mddev->pers)
                return -EBUSY;

        /*
         * Analyze all RAID superblock(s)
         */
        if (!mddev->raid_disks)
                analyze_sbs(mddev);

        chunk_size = mddev->chunk_size;

        if (chunk_size) {
                if (chunk_size > MAX_CHUNK_SIZE) {
                        printk(KERN_ERR "too big chunk_size: %d > %d\n",
                                chunk_size, MAX_CHUNK_SIZE);
                        return -EINVAL;
                }
                /*
                 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
                 */
                if ( (1 << ffz(~chunk_size)) != chunk_size) {
                        printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
                        return -EINVAL;
                }
                if (chunk_size < PAGE_SIZE) {
                        printk(KERN_ERR "too small chunk_size: %d < %ld\n",
                                chunk_size, PAGE_SIZE);
                        return -EINVAL;
                }

                /* devices must have minimum size of one chunk */
                ITERATE_RDEV(mddev,rdev,tmp) {
                        if (test_bit(Faulty, &rdev->flags))
                                continue;
                        if (rdev->size < chunk_size / 1024) {
                                printk(KERN_WARNING
                                        "md: Dev %s smaller than chunk_size:"
                                        " %lluk < %dk\n",
                                        bdevname(rdev->bdev,b),
                                        (unsigned long long)rdev->size,
                                        chunk_size / 1024);
                                return -EINVAL;
                        }
                }
        }

#ifdef CONFIG_KMOD
        request_module("md-level-%d", mddev->level);
#endif
2080 * Also find largest hardsector size 2081 */ 2082 ITERATE_RDEV(mddev,rdev,tmp) { 2083 if (test_bit(Faulty, &rdev->flags)) 2084 continue; 2085 sync_blockdev(rdev->bdev); 2086 invalidate_bdev(rdev->bdev, 0); 2087 } 2088 2089 md_probe(mddev->unit, NULL, NULL); 2090 disk = mddev->gendisk; 2091 if (!disk) 2092 return -ENOMEM; 2093 2094 spin_lock(&pers_lock); 2095 pers = find_pers(mddev->level); 2096 if (!pers || !try_module_get(pers->owner)) { 2097 spin_unlock(&pers_lock); 2098 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 2099 mddev->level); 2100 return -EINVAL; 2101 } 2102 mddev->pers = pers; 2103 spin_unlock(&pers_lock); 2104 2105 mddev->recovery = 0; 2106 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2107 mddev->barriers_work = 1; 2108 mddev->ok_start_degraded = start_dirty_degraded; 2109 2110 if (start_readonly) 2111 mddev->ro = 2; /* read-only, but switch on first write */ 2112 2113 err = mddev->pers->run(mddev); 2114 if (!err && mddev->pers->sync_request) { 2115 err = bitmap_create(mddev); 2116 if (err) { 2117 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 2118 mdname(mddev), err); 2119 mddev->pers->stop(mddev); 2120 } 2121 } 2122 if (err) { 2123 printk(KERN_ERR "md: pers->run() failed ...\n"); 2124 module_put(mddev->pers->owner); 2125 mddev->pers = NULL; 2126 bitmap_destroy(mddev); 2127 return err; 2128 } 2129 if (mddev->pers->sync_request) 2130 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 2131 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 2132 mddev->ro = 0; 2133 2134 atomic_set(&mddev->writes_pending,0); 2135 mddev->safemode = 0; 2136 mddev->safemode_timer.function = md_safemode_timeout; 2137 mddev->safemode_timer.data = (unsigned long) mddev; 2138 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 2139 mddev->in_sync = 1; 2140 2141 ITERATE_RDEV(mddev,rdev,tmp) 2142 if (rdev->raid_disk >= 0) { 2143 char nm[20]; 2144 sprintf(nm, "rd%d", rdev->raid_disk); 2145 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2146 } 2147 2148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2149 md_wakeup_thread(mddev->thread); 2150 2151 if (mddev->sb_dirty) 2152 md_update_sb(mddev); 2153 2154 set_capacity(disk, mddev->array_size<<1); 2155 2156 /* If we call blk_queue_make_request here, it will 2157 * re-initialise max_sectors etc which may have been 2158 * refined inside ->run(). So just set the bits we need to set. 2159 * Most initialisation happened when we called 2160 * blk_queue_make_request(..., md_fail_request) 2161 * earlier.
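 * (so below we only touch ->queuedata and ->make_request_fn; limits
 * such as max_sectors stay exactly as ->run() left them.)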
2162 */ 2163 mddev->queue->queuedata = mddev; 2164 mddev->queue->make_request_fn = mddev->pers->make_request; 2165 2166 mddev->changed = 1; 2167 md_new_event(mddev); 2168 return 0; 2169 } 2170 2171 static int restart_array(mddev_t *mddev) 2172 { 2173 struct gendisk *disk = mddev->gendisk; 2174 int err; 2175 2176 /* 2177 * Complain if it has no devices 2178 */ 2179 err = -ENXIO; 2180 if (list_empty(&mddev->disks)) 2181 goto out; 2182 2183 if (mddev->pers) { 2184 err = -EBUSY; 2185 if (!mddev->ro) 2186 goto out; 2187 2188 mddev->safemode = 0; 2189 mddev->ro = 0; 2190 set_disk_ro(disk, 0); 2191 2192 printk(KERN_INFO "md: %s switched to read-write mode.\n", 2193 mdname(mddev)); 2194 /* 2195 * Kick recovery or resync if necessary 2196 */ 2197 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2198 md_wakeup_thread(mddev->thread); 2199 err = 0; 2200 } else { 2201 printk(KERN_ERR "md: %s has no personality assigned.\n", 2202 mdname(mddev)); 2203 err = -EINVAL; 2204 } 2205 2206 out: 2207 return err; 2208 } 2209 2210 static int do_md_stop(mddev_t * mddev, int ro) 2211 { 2212 int err = 0; 2213 struct gendisk *disk = mddev->gendisk; 2214 2215 if (mddev->pers) { 2216 if (atomic_read(&mddev->active)>2) { 2217 printk("md: %s still in use.\n",mdname(mddev)); 2218 return -EBUSY; 2219 } 2220 2221 if (mddev->sync_thread) { 2222 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2223 md_unregister_thread(mddev->sync_thread); 2224 mddev->sync_thread = NULL; 2225 } 2226 2227 del_timer_sync(&mddev->safemode_timer); 2228 2229 invalidate_partition(disk, 0); 2230 2231 if (ro) { 2232 err = -ENXIO; 2233 if (mddev->ro==1) 2234 goto out; 2235 mddev->ro = 1; 2236 } else { 2237 bitmap_flush(mddev); 2238 md_super_wait(mddev); 2239 if (mddev->ro) 2240 set_disk_ro(disk, 0); 2241 blk_queue_make_request(mddev->queue, md_fail_request); 2242 mddev->pers->stop(mddev); 2243 if (mddev->pers->sync_request) 2244 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 2245 2246 module_put(mddev->pers->owner); 2247 mddev->pers = NULL; 2248 if (mddev->ro) 2249 mddev->ro = 0; 2250 } 2251 if (!mddev->in_sync) { 2252 /* mark array as shutdown cleanly */ 2253 mddev->in_sync = 1; 2254 md_update_sb(mddev); 2255 } 2256 if (ro) 2257 set_disk_ro(disk, 1); 2258 } 2259 2260 bitmap_destroy(mddev); 2261 if (mddev->bitmap_file) { 2262 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2263 fput(mddev->bitmap_file); 2264 mddev->bitmap_file = NULL; 2265 } 2266 mddev->bitmap_offset = 0; 2267 2268 /* 2269 * Free resources if final stop 2270 */ 2271 if (!ro) { 2272 mdk_rdev_t *rdev; 2273 struct list_head *tmp; 2274 struct gendisk *disk; 2275 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2276 2277 ITERATE_RDEV(mddev,rdev,tmp) 2278 if (rdev->raid_disk >= 0) { 2279 char nm[20]; 2280 sprintf(nm, "rd%d", rdev->raid_disk); 2281 sysfs_remove_link(&mddev->kobj, nm); 2282 } 2283 2284 export_array(mddev); 2285 2286 mddev->array_size = 0; 2287 disk = mddev->gendisk; 2288 if (disk) 2289 set_capacity(disk, 0); 2290 mddev->changed = 1; 2291 } else 2292 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2293 mdname(mddev)); 2294 err = 0; 2295 md_new_event(mddev); 2296 out: 2297 return err; 2298 } 2299 2300 static void autorun_array(mddev_t *mddev) 2301 { 2302 mdk_rdev_t *rdev; 2303 struct list_head *tmp; 2304 int err; 2305 2306 if (list_empty(&mddev->disks)) 2307 return; 2308 2309 printk(KERN_INFO "md: running: "); 2310 2311 ITERATE_RDEV(mddev,rdev,tmp) { 2312 char b[BDEVNAME_SIZE]; 2313 printk("<%s>", bdevname(rdev->bdev,b)); 2314 } 2315 
printk("\n"); 2316 2317 err = do_md_run (mddev); 2318 if (err) { 2319 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2320 do_md_stop (mddev, 0); 2321 } 2322 } 2323 2324 /* 2325 * lets try to run arrays based on all disks that have arrived 2326 * until now. (those are in pending_raid_disks) 2327 * 2328 * the method: pick the first pending disk, collect all disks with 2329 * the same UUID, remove all from the pending list and put them into 2330 * the 'same_array' list. Then order this list based on superblock 2331 * update time (freshest comes first), kick out 'old' disks and 2332 * compare superblocks. If everything's fine then run it. 2333 * 2334 * If "unit" is allocated, then bump its reference count 2335 */ 2336 static void autorun_devices(int part) 2337 { 2338 struct list_head candidates; 2339 struct list_head *tmp; 2340 mdk_rdev_t *rdev0, *rdev; 2341 mddev_t *mddev; 2342 char b[BDEVNAME_SIZE]; 2343 2344 printk(KERN_INFO "md: autorun ...\n"); 2345 while (!list_empty(&pending_raid_disks)) { 2346 dev_t dev; 2347 rdev0 = list_entry(pending_raid_disks.next, 2348 mdk_rdev_t, same_set); 2349 2350 printk(KERN_INFO "md: considering %s ...\n", 2351 bdevname(rdev0->bdev,b)); 2352 INIT_LIST_HEAD(&candidates); 2353 ITERATE_RDEV_PENDING(rdev,tmp) 2354 if (super_90_load(rdev, rdev0, 0) >= 0) { 2355 printk(KERN_INFO "md: adding %s ...\n", 2356 bdevname(rdev->bdev,b)); 2357 list_move(&rdev->same_set, &candidates); 2358 } 2359 /* 2360 * now we have a set of devices, with all of them having 2361 * mostly sane superblocks. It's time to allocate the 2362 * mddev. 2363 */ 2364 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2365 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2366 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2367 break; 2368 } 2369 if (part) 2370 dev = MKDEV(mdp_major, 2371 rdev0->preferred_minor << MdpMinorShift); 2372 else 2373 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2374 2375 md_probe(dev, NULL, NULL); 2376 mddev = mddev_find(dev); 2377 if (!mddev) { 2378 printk(KERN_ERR 2379 "md: cannot allocate memory for md drive.\n"); 2380 break; 2381 } 2382 if (mddev_lock(mddev)) 2383 printk(KERN_WARNING "md: %s locked, cannot run\n", 2384 mdname(mddev)); 2385 else if (mddev->raid_disks || mddev->major_version 2386 || !list_empty(&mddev->disks)) { 2387 printk(KERN_WARNING 2388 "md: %s already running, cannot run %s\n", 2389 mdname(mddev), bdevname(rdev0->bdev,b)); 2390 mddev_unlock(mddev); 2391 } else { 2392 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2393 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2394 list_del_init(&rdev->same_set); 2395 if (bind_rdev_to_array(rdev, mddev)) 2396 export_rdev(rdev); 2397 } 2398 autorun_array(mddev); 2399 mddev_unlock(mddev); 2400 } 2401 /* on success, candidates will be empty, on error 2402 * it won't... 2403 */ 2404 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2405 export_rdev(rdev); 2406 mddev_put(mddev); 2407 } 2408 printk(KERN_INFO "md: ... autorun DONE.\n"); 2409 } 2410 2411 /* 2412 * import RAID devices based on one partition 2413 * if possible, the array gets run as well. 
2414 */ 2415 2416 static int autostart_array(dev_t startdev) 2417 { 2418 char b[BDEVNAME_SIZE]; 2419 int err = -EINVAL, i; 2420 mdp_super_t *sb = NULL; 2421 mdk_rdev_t *start_rdev = NULL, *rdev; 2422 2423 start_rdev = md_import_device(startdev, 0, 0); 2424 if (IS_ERR(start_rdev)) 2425 return err; 2426 2427 2428 /* NOTE: this can only work for 0.90.0 superblocks */ 2429 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2430 if (sb->major_version != 0 || 2431 sb->minor_version != 90 ) { 2432 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2433 export_rdev(start_rdev); 2434 return err; 2435 } 2436 2437 if (test_bit(Faulty, &start_rdev->flags)) { 2438 printk(KERN_WARNING 2439 "md: can not autostart based on faulty %s!\n", 2440 bdevname(start_rdev->bdev,b)); 2441 export_rdev(start_rdev); 2442 return err; 2443 } 2444 list_add(&start_rdev->same_set, &pending_raid_disks); 2445 2446 for (i = 0; i < MD_SB_DISKS; i++) { 2447 mdp_disk_t *desc = sb->disks + i; 2448 dev_t dev = MKDEV(desc->major, desc->minor); 2449 2450 if (!dev) 2451 continue; 2452 if (dev == startdev) 2453 continue; 2454 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2455 continue; 2456 rdev = md_import_device(dev, 0, 0); 2457 if (IS_ERR(rdev)) 2458 continue; 2459 2460 list_add(&rdev->same_set, &pending_raid_disks); 2461 } 2462 2463 /* 2464 * possibly return codes 2465 */ 2466 autorun_devices(0); 2467 return 0; 2468 2469 } 2470 2471 2472 static int get_version(void __user * arg) 2473 { 2474 mdu_version_t ver; 2475 2476 ver.major = MD_MAJOR_VERSION; 2477 ver.minor = MD_MINOR_VERSION; 2478 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2479 2480 if (copy_to_user(arg, &ver, sizeof(ver))) 2481 return -EFAULT; 2482 2483 return 0; 2484 } 2485 2486 static int get_array_info(mddev_t * mddev, void __user * arg) 2487 { 2488 mdu_array_info_t info; 2489 int nr,working,active,failed,spare; 2490 mdk_rdev_t *rdev; 2491 struct list_head *tmp; 2492 2493 nr=working=active=failed=spare=0; 2494 ITERATE_RDEV(mddev,rdev,tmp) { 2495 nr++; 2496 if (test_bit(Faulty, &rdev->flags)) 2497 failed++; 2498 else { 2499 working++; 2500 if (test_bit(In_sync, &rdev->flags)) 2501 active++; 2502 else 2503 spare++; 2504 } 2505 } 2506 2507 info.major_version = mddev->major_version; 2508 info.minor_version = mddev->minor_version; 2509 info.patch_version = MD_PATCHLEVEL_VERSION; 2510 info.ctime = mddev->ctime; 2511 info.level = mddev->level; 2512 info.size = mddev->size; 2513 info.nr_disks = nr; 2514 info.raid_disks = mddev->raid_disks; 2515 info.md_minor = mddev->md_minor; 2516 info.not_persistent= !mddev->persistent; 2517 2518 info.utime = mddev->utime; 2519 info.state = 0; 2520 if (mddev->in_sync) 2521 info.state = (1<<MD_SB_CLEAN); 2522 if (mddev->bitmap && mddev->bitmap_offset) 2523 info.state = (1<<MD_SB_BITMAP_PRESENT); 2524 info.active_disks = active; 2525 info.working_disks = working; 2526 info.failed_disks = failed; 2527 info.spare_disks = spare; 2528 2529 info.layout = mddev->layout; 2530 info.chunk_size = mddev->chunk_size; 2531 2532 if (copy_to_user(arg, &info, sizeof(info))) 2533 return -EFAULT; 2534 2535 return 0; 2536 } 2537 2538 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2539 { 2540 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2541 char *ptr, *buf = NULL; 2542 int err = -ENOMEM; 2543 2544 file = kmalloc(sizeof(*file), GFP_KERNEL); 2545 if (!file) 2546 goto out; 2547 2548 /* bitmap disabled, zero the first byte and copy out */ 2549 if (!mddev->bitmap || !mddev->bitmap->file) { 2550 
file->pathname[0] = '\0'; 2551 goto copy_out; 2552 } 2553 2554 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2555 if (!buf) 2556 goto out; 2557 2558 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2559 if (!ptr) 2560 goto out; 2561 2562 strcpy(file->pathname, ptr); 2563 2564 copy_out: 2565 err = 0; 2566 if (copy_to_user(arg, file, sizeof(*file))) 2567 err = -EFAULT; 2568 out: 2569 kfree(buf); 2570 kfree(file); 2571 return err; 2572 } 2573 2574 static int get_disk_info(mddev_t * mddev, void __user * arg) 2575 { 2576 mdu_disk_info_t info; 2577 unsigned int nr; 2578 mdk_rdev_t *rdev; 2579 2580 if (copy_from_user(&info, arg, sizeof(info))) 2581 return -EFAULT; 2582 2583 nr = info.number; 2584 2585 rdev = find_rdev_nr(mddev, nr); 2586 if (rdev) { 2587 info.major = MAJOR(rdev->bdev->bd_dev); 2588 info.minor = MINOR(rdev->bdev->bd_dev); 2589 info.raid_disk = rdev->raid_disk; 2590 info.state = 0; 2591 if (test_bit(Faulty, &rdev->flags)) 2592 info.state |= (1<<MD_DISK_FAULTY); 2593 else if (test_bit(In_sync, &rdev->flags)) { 2594 info.state |= (1<<MD_DISK_ACTIVE); 2595 info.state |= (1<<MD_DISK_SYNC); 2596 } 2597 if (test_bit(WriteMostly, &rdev->flags)) 2598 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2599 } else { 2600 info.major = info.minor = 0; 2601 info.raid_disk = -1; 2602 info.state = (1<<MD_DISK_REMOVED); 2603 } 2604 2605 if (copy_to_user(arg, &info, sizeof(info))) 2606 return -EFAULT; 2607 2608 return 0; 2609 } 2610 2611 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2612 { 2613 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2614 mdk_rdev_t *rdev; 2615 dev_t dev = MKDEV(info->major,info->minor); 2616 2617 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2618 return -EOVERFLOW; 2619 2620 if (!mddev->raid_disks) { 2621 int err; 2622 /* expecting a device which has a superblock */ 2623 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2624 if (IS_ERR(rdev)) { 2625 printk(KERN_WARNING 2626 "md: md_import_device returned %ld\n", 2627 PTR_ERR(rdev)); 2628 return PTR_ERR(rdev); 2629 } 2630 if (!list_empty(&mddev->disks)) { 2631 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2632 mdk_rdev_t, same_set); 2633 int err = super_types[mddev->major_version] 2634 .load_super(rdev, rdev0, mddev->minor_version); 2635 if (err < 0) { 2636 printk(KERN_WARNING 2637 "md: %s has different UUID to %s\n", 2638 bdevname(rdev->bdev,b), 2639 bdevname(rdev0->bdev,b2)); 2640 export_rdev(rdev); 2641 return -EINVAL; 2642 } 2643 } 2644 err = bind_rdev_to_array(rdev, mddev); 2645 if (err) 2646 export_rdev(rdev); 2647 return err; 2648 } 2649 2650 /* 2651 * add_new_disk can be used once the array is assembled 2652 * to add "hot spares". 
They must already have a superblock 2653 * written 2654 */ 2655 if (mddev->pers) { 2656 int err; 2657 if (!mddev->pers->hot_add_disk) { 2658 printk(KERN_WARNING 2659 "%s: personality does not support diskops!\n", 2660 mdname(mddev)); 2661 return -EINVAL; 2662 } 2663 if (mddev->persistent) 2664 rdev = md_import_device(dev, mddev->major_version, 2665 mddev->minor_version); 2666 else 2667 rdev = md_import_device(dev, -1, -1); 2668 if (IS_ERR(rdev)) { 2669 printk(KERN_WARNING 2670 "md: md_import_device returned %ld\n", 2671 PTR_ERR(rdev)); 2672 return PTR_ERR(rdev); 2673 } 2674 /* set save_raid_disk if appropriate */ 2675 if (!mddev->persistent) { 2676 if (info->state & (1<<MD_DISK_SYNC) && 2677 info->raid_disk < mddev->raid_disks) 2678 rdev->raid_disk = info->raid_disk; 2679 else 2680 rdev->raid_disk = -1; 2681 } else 2682 super_types[mddev->major_version]. 2683 validate_super(mddev, rdev); 2684 rdev->saved_raid_disk = rdev->raid_disk; 2685 2686 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 2687 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2688 set_bit(WriteMostly, &rdev->flags); 2689 2690 rdev->raid_disk = -1; 2691 err = bind_rdev_to_array(rdev, mddev); 2692 if (err) 2693 export_rdev(rdev); 2694 2695 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2696 md_wakeup_thread(mddev->thread); 2697 return err; 2698 } 2699 2700 /* otherwise, add_new_disk is only allowed 2701 * for major_version==0 superblocks 2702 */ 2703 if (mddev->major_version != 0) { 2704 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2705 mdname(mddev)); 2706 return -EINVAL; 2707 } 2708 2709 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2710 int err; 2711 rdev = md_import_device (dev, -1, 0); 2712 if (IS_ERR(rdev)) { 2713 printk(KERN_WARNING 2714 "md: error, md_import_device() returned %ld\n", 2715 PTR_ERR(rdev)); 2716 return PTR_ERR(rdev); 2717 } 2718 rdev->desc_nr = info->number; 2719 if (info->raid_disk < mddev->raid_disks) 2720 rdev->raid_disk = info->raid_disk; 2721 else 2722 rdev->raid_disk = -1; 2723 2724 rdev->flags = 0; 2725 2726 if (rdev->raid_disk < mddev->raid_disks) 2727 if (info->state & (1<<MD_DISK_SYNC)) 2728 set_bit(In_sync, &rdev->flags); 2729 2730 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2731 set_bit(WriteMostly, &rdev->flags); 2732 2733 err = bind_rdev_to_array(rdev, mddev); 2734 if (err) { 2735 export_rdev(rdev); 2736 return err; 2737 } 2738 2739 if (!mddev->persistent) { 2740 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2741 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2742 } else 2743 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2744 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2745 2746 if (!mddev->size || (mddev->size > rdev->size)) 2747 mddev->size = rdev->size; 2748 } 2749 2750 return 0; 2751 } 2752 2753 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2754 { 2755 char b[BDEVNAME_SIZE]; 2756 mdk_rdev_t *rdev; 2757 2758 if (!mddev->pers) 2759 return -ENODEV; 2760 2761 rdev = find_rdev(mddev, dev); 2762 if (!rdev) 2763 return -ENXIO; 2764 2765 if (rdev->raid_disk >= 0) 2766 goto busy; 2767 2768 kick_rdev_from_array(rdev); 2769 md_update_sb(mddev); 2770 md_new_event(mddev); 2771 2772 return 0; 2773 busy: 2774 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 2775 bdevname(rdev->bdev,b), mdname(mddev)); 2776 return -EBUSY; 2777 } 2778 2779 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2780 { 2781 char b[BDEVNAME_SIZE]; 2782 int err; 2783 unsigned int size; 2784 mdk_rdev_t *rdev; 2785 2786 if (!mddev->pers) 2787 return -ENODEV; 2788 2789 if (mddev->major_version != 0) { 2790 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2791 " version-0 superblocks.\n", 2792 mdname(mddev)); 2793 return -EINVAL; 2794 } 2795 if (!mddev->pers->hot_add_disk) { 2796 printk(KERN_WARNING 2797 "%s: personality does not support diskops!\n", 2798 mdname(mddev)); 2799 return -EINVAL; 2800 } 2801 2802 rdev = md_import_device (dev, -1, 0); 2803 if (IS_ERR(rdev)) { 2804 printk(KERN_WARNING 2805 "md: error, md_import_device() returned %ld\n", 2806 PTR_ERR(rdev)); 2807 return -EINVAL; 2808 } 2809 2810 if (mddev->persistent) 2811 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2812 else 2813 rdev->sb_offset = 2814 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2815 2816 size = calc_dev_size(rdev, mddev->chunk_size); 2817 rdev->size = size; 2818 2819 if (size < mddev->size) { 2820 printk(KERN_WARNING 2821 "%s: disk size %llu blocks < array size %llu\n", 2822 mdname(mddev), (unsigned long long)size, 2823 (unsigned long long)mddev->size); 2824 err = -ENOSPC; 2825 goto abort_export; 2826 } 2827 2828 if (test_bit(Faulty, &rdev->flags)) { 2829 printk(KERN_WARNING 2830 "md: can not hot-add faulty %s disk to %s!\n", 2831 bdevname(rdev->bdev,b), mdname(mddev)); 2832 err = -EINVAL; 2833 goto abort_export; 2834 } 2835 clear_bit(In_sync, &rdev->flags); 2836 rdev->desc_nr = -1; 2837 bind_rdev_to_array(rdev, mddev); 2838 2839 /* 2840 * The rest should better be atomic, we can have disk failures 2841 * noticed in interrupt contexts ... 2842 */ 2843 2844 if (rdev->desc_nr == mddev->max_disks) { 2845 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2846 mdname(mddev)); 2847 err = -EBUSY; 2848 goto abort_unbind_export; 2849 } 2850 2851 rdev->raid_disk = -1; 2852 2853 md_update_sb(mddev); 2854 2855 /* 2856 * Kick recovery, maybe this spare has to be added to the 2857 * array immediately. 2858 */ 2859 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2860 md_wakeup_thread(mddev->thread); 2861 md_new_event(mddev); 2862 return 0; 2863 2864 abort_unbind_export: 2865 unbind_rdev_from_array(rdev); 2866 2867 abort_export: 2868 export_rdev(rdev); 2869 return err; 2870 } 2871 2872 /* similar to deny_write_access, but accounts for our holding a reference 2873 * to the file ourselves */ 2874 static int deny_bitmap_write_access(struct file * file) 2875 { 2876 struct inode *inode = file->f_mapping->host; 2877 2878 spin_lock(&inode->i_lock); 2879 if (atomic_read(&inode->i_writecount) > 1) { 2880 spin_unlock(&inode->i_lock); 2881 return -ETXTBSY; 2882 } 2883 atomic_set(&inode->i_writecount, -1); 2884 spin_unlock(&inode->i_lock); 2885 2886 return 0; 2887 } 2888 2889 static int set_bitmap_file(mddev_t *mddev, int fd) 2890 { 2891 int err; 2892 2893 if (mddev->pers) { 2894 if (!mddev->pers->quiesce) 2895 return -EBUSY; 2896 if (mddev->recovery || mddev->sync_thread) 2897 return -EBUSY; 2898 /* we should be able to change the bitmap.. 
*/ 2899 } 2900 2901 2902 if (fd >= 0) { 2903 if (mddev->bitmap) 2904 return -EEXIST; /* cannot add when bitmap is present */ 2905 mddev->bitmap_file = fget(fd); 2906 2907 if (mddev->bitmap_file == NULL) { 2908 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2909 mdname(mddev)); 2910 return -EBADF; 2911 } 2912 2913 err = deny_bitmap_write_access(mddev->bitmap_file); 2914 if (err) { 2915 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2916 mdname(mddev)); 2917 fput(mddev->bitmap_file); 2918 mddev->bitmap_file = NULL; 2919 return err; 2920 } 2921 mddev->bitmap_offset = 0; /* file overrides offset */ 2922 } else if (mddev->bitmap == NULL) 2923 return -ENOENT; /* cannot remove what isn't there */ 2924 err = 0; 2925 if (mddev->pers) { 2926 mddev->pers->quiesce(mddev, 1); 2927 if (fd >= 0) 2928 err = bitmap_create(mddev); 2929 if (fd < 0 || err) 2930 bitmap_destroy(mddev); 2931 mddev->pers->quiesce(mddev, 0); 2932 } else if (fd < 0) { 2933 if (mddev->bitmap_file) 2934 fput(mddev->bitmap_file); 2935 mddev->bitmap_file = NULL; 2936 } 2937 2938 return err; 2939 } 2940 2941 /* 2942 * set_array_info is used in two different ways 2943 * The original usage is when creating a new array. 2944 * In this usage, raid_disks is > 0 and it together with 2945 * level, size, not_persistent, layout, chunksize determine the 2946 * shape of the array. 2947 * This will always create an array with a type-0.90.0 superblock. 2948 * The newer usage is when assembling an array. 2949 * In this case raid_disks will be 0, and the major_version field is 2950 * used to determine which style super-blocks are to be found on the devices. 2951 * The minor and patch _version numbers are also kept in case the 2952 * super_block handler wishes to interpret them. 2953 */ 2954 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2955 { 2956 2957 if (info->raid_disks == 0) { 2958 /* just setting version number for superblock loading */ 2959 if (info->major_version < 0 || 2960 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2961 super_types[info->major_version].name == NULL) { 2962 /* maybe try to auto-load a module? */ 2963 printk(KERN_INFO 2964 "md: superblock version %d not known\n", 2965 info->major_version); 2966 return -EINVAL; 2967 } 2968 mddev->major_version = info->major_version; 2969 mddev->minor_version = info->minor_version; 2970 mddev->patch_version = info->patch_version; 2971 return 0; 2972 } 2973 mddev->major_version = MD_MAJOR_VERSION; 2974 mddev->minor_version = MD_MINOR_VERSION; 2975 mddev->patch_version = MD_PATCHLEVEL_VERSION; 2976 mddev->ctime = get_seconds(); 2977 2978 mddev->level = info->level; 2979 mddev->size = info->size; 2980 mddev->raid_disks = info->raid_disks; 2981 /* don't set md_minor, it is determined by which /dev/md* was 2982 * opened 2983 */ 2984 if (info->state & (1<<MD_SB_CLEAN)) 2985 mddev->recovery_cp = MaxSector; 2986 else 2987 mddev->recovery_cp = 0; 2988 mddev->persistent = ! info->not_persistent; 2989 2990 mddev->layout = info->layout; 2991 mddev->chunk_size = info->chunk_size; 2992 2993 mddev->max_disks = MD_SB_DISKS; 2994 2995 mddev->sb_dirty = 1; 2996 2997 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 2998 mddev->bitmap_offset = 0; 2999 3000 /* 3001 * Generate a 128 bit UUID 3002 */ 3003 get_random_bytes(mddev->uuid, 16); 3004 3005 return 0; 3006 } 3007 3008 /* 3009 * update_array_info is used to change the configuration of an 3010 * on-line array.
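 * A sketch of the user-space side (fd and new_kb are illustrative
 * names, not defined here): fetch the current shape, then resubmit
 * it with exactly one field changed:
 *
 *	mdu_array_info_t info;
 *	ioctl(fd, GET_ARRAY_INFO, &info);
 *	info.size = new_kb;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 * Changing two of size/raid_disks/layout/bitmap-present in one call
 * fails with -EINVAL in the checks below.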
3011 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 3012 * fields in the info are checked against the array. 3013 * Any differences that cannot be handled will cause an error. 3014 * Normally, only one change can be managed at a time. 3015 */ 3016 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 3017 { 3018 int rv = 0; 3019 int cnt = 0; 3020 int state = 0; 3021 3022 /* calculate expected state,ignoring low bits */ 3023 if (mddev->bitmap && mddev->bitmap_offset) 3024 state |= (1 << MD_SB_BITMAP_PRESENT); 3025 3026 if (mddev->major_version != info->major_version || 3027 mddev->minor_version != info->minor_version || 3028 /* mddev->patch_version != info->patch_version || */ 3029 mddev->ctime != info->ctime || 3030 mddev->level != info->level || 3031 /* mddev->layout != info->layout || */ 3032 !mddev->persistent != info->not_persistent|| 3033 mddev->chunk_size != info->chunk_size || 3034 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 3035 ((state^info->state) & 0xfffffe00) 3036 ) 3037 return -EINVAL; 3038 /* Check there is only one change */ 3039 if (mddev->size != info->size) cnt++; 3040 if (mddev->raid_disks != info->raid_disks) cnt++; 3041 if (mddev->layout != info->layout) cnt++; 3042 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 3043 if (cnt == 0) return 0; 3044 if (cnt > 1) return -EINVAL; 3045 3046 if (mddev->layout != info->layout) { 3047 /* Change layout 3048 * we don't need to do anything at the md level, the 3049 * personality will take care of it all. 3050 */ 3051 if (mddev->pers->reconfig == NULL) 3052 return -EINVAL; 3053 else 3054 return mddev->pers->reconfig(mddev, info->layout, -1); 3055 } 3056 if (mddev->size != info->size) { 3057 mdk_rdev_t * rdev; 3058 struct list_head *tmp; 3059 if (mddev->pers->resize == NULL) 3060 return -EINVAL; 3061 /* The "size" is the amount of each device that is used. 3062 * This can only make sense for arrays with redundancy. 3063 * linear and raid0 always use whatever space is available 3064 * We can only consider changing the size if no resync 3065 * or reconstruction is happening, and if the new size 3066 * is acceptable. It must fit before the sb_offset or, 3067 * if that is <data_offset, it must fit before the 3068 * size of each device. 3069 * If size is zero, we find the largest size that fits. 
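 * (Watch the units below: info->size, mddev->size and sb_offset are
 * in KB, while data_offset and avail are in 512-byte sectors -- hence
 * the sb_offset*2, avail/2 and info->size << 1 conversions.)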
3070 */ 3071 if (mddev->sync_thread) 3072 return -EBUSY; 3073 ITERATE_RDEV(mddev,rdev,tmp) { 3074 sector_t avail; 3075 int fit = (info->size == 0); 3076 if (rdev->sb_offset > rdev->data_offset) 3077 avail = (rdev->sb_offset*2) - rdev->data_offset; 3078 else 3079 avail = get_capacity(rdev->bdev->bd_disk) 3080 - rdev->data_offset; 3081 if (fit && (info->size == 0 || info->size > avail/2)) 3082 info->size = avail/2; 3083 if (avail < ((sector_t)info->size << 1)) 3084 return -ENOSPC; 3085 } 3086 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 3087 if (!rv) { 3088 struct block_device *bdev; 3089 3090 bdev = bdget_disk(mddev->gendisk, 0); 3091 if (bdev) { 3092 down(&bdev->bd_inode->i_sem); 3093 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3094 up(&bdev->bd_inode->i_sem); 3095 bdput(bdev); 3096 } 3097 } 3098 } 3099 if (mddev->raid_disks != info->raid_disks) { 3100 /* change the number of raid disks */ 3101 if (mddev->pers->reshape == NULL) 3102 return -EINVAL; 3103 if (info->raid_disks <= 0 || 3104 info->raid_disks >= mddev->max_disks) 3105 return -EINVAL; 3106 if (mddev->sync_thread) 3107 return -EBUSY; 3108 rv = mddev->pers->reshape(mddev, info->raid_disks); 3109 if (!rv) { 3110 struct block_device *bdev; 3111 3112 bdev = bdget_disk(mddev->gendisk, 0); 3113 if (bdev) { 3114 down(&bdev->bd_inode->i_sem); 3115 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3116 up(&bdev->bd_inode->i_sem); 3117 bdput(bdev); 3118 } 3119 } 3120 } 3121 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3122 if (mddev->pers->quiesce == NULL) 3123 return -EINVAL; 3124 if (mddev->recovery || mddev->sync_thread) 3125 return -EBUSY; 3126 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 3127 /* add the bitmap */ 3128 if (mddev->bitmap) 3129 return -EEXIST; 3130 if (mddev->default_bitmap_offset == 0) 3131 return -EINVAL; 3132 mddev->bitmap_offset = mddev->default_bitmap_offset; 3133 mddev->pers->quiesce(mddev, 1); 3134 rv = bitmap_create(mddev); 3135 if (rv) 3136 bitmap_destroy(mddev); 3137 mddev->pers->quiesce(mddev, 0); 3138 } else { 3139 /* remove the bitmap */ 3140 if (!mddev->bitmap) 3141 return -ENOENT; 3142 if (mddev->bitmap->file) 3143 return -EINVAL; 3144 mddev->pers->quiesce(mddev, 1); 3145 bitmap_destroy(mddev); 3146 mddev->pers->quiesce(mddev, 0); 3147 mddev->bitmap_offset = 0; 3148 } 3149 } 3150 md_update_sb(mddev); 3151 return rv; 3152 } 3153 3154 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3155 { 3156 mdk_rdev_t *rdev; 3157 3158 if (mddev->pers == NULL) 3159 return -ENODEV; 3160 3161 rdev = find_rdev(mddev, dev); 3162 if (!rdev) 3163 return -ENODEV; 3164 3165 md_error(mddev, rdev); 3166 return 0; 3167 } 3168 3169 static int md_ioctl(struct inode *inode, struct file *file, 3170 unsigned int cmd, unsigned long arg) 3171 { 3172 int err = 0; 3173 void __user *argp = (void __user *)arg; 3174 struct hd_geometry __user *loc = argp; 3175 mddev_t *mddev = NULL; 3176 3177 if (!capable(CAP_SYS_ADMIN)) 3178 return -EACCES; 3179 3180 /* 3181 * Commands dealing with the RAID driver but not any 3182 * particular array: 3183 */ 3184 switch (cmd) 3185 { 3186 case RAID_VERSION: 3187 err = get_version(argp); 3188 goto done; 3189 3190 case PRINT_RAID_DEBUG: 3191 err = 0; 3192 md_print_devices(); 3193 goto done; 3194 3195 #ifndef MODULE 3196 case RAID_AUTORUN: 3197 err = 0; 3198 autostart_arrays(arg); 3199 goto done; 3200 #endif 3201 default:; 3202 } 3203 3204 /* 3205 * Commands creating/starting a new array: 3206 */ 3207 3208 mddev = inode->i_bdev->bd_disk->private_data; 3209 3210 if 
(!mddev) { 3211 BUG(); 3212 goto abort; 3213 } 3214 3215 3216 if (cmd == START_ARRAY) { 3217 /* START_ARRAY doesn't need to lock the array as autostart_array 3218 * does the locking, and it could even be a different array 3219 */ 3220 static int cnt = 3; 3221 if (cnt > 0 ) { 3222 printk(KERN_WARNING 3223 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3224 "This will not be supported beyond July 2006\n", 3225 current->comm, current->pid); 3226 cnt--; 3227 } 3228 err = autostart_array(new_decode_dev(arg)); 3229 if (err) { 3230 printk(KERN_WARNING "md: autostart failed!\n"); 3231 goto abort; 3232 } 3233 goto done; 3234 } 3235 3236 err = mddev_lock(mddev); 3237 if (err) { 3238 printk(KERN_INFO 3239 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3240 err, cmd); 3241 goto abort; 3242 } 3243 3244 switch (cmd) 3245 { 3246 case SET_ARRAY_INFO: 3247 { 3248 mdu_array_info_t info; 3249 if (!arg) 3250 memset(&info, 0, sizeof(info)); 3251 else if (copy_from_user(&info, argp, sizeof(info))) { 3252 err = -EFAULT; 3253 goto abort_unlock; 3254 } 3255 if (mddev->pers) { 3256 err = update_array_info(mddev, &info); 3257 if (err) { 3258 printk(KERN_WARNING "md: couldn't update" 3259 " array info. %d\n", err); 3260 goto abort_unlock; 3261 } 3262 goto done_unlock; 3263 } 3264 if (!list_empty(&mddev->disks)) { 3265 printk(KERN_WARNING 3266 "md: array %s already has disks!\n", 3267 mdname(mddev)); 3268 err = -EBUSY; 3269 goto abort_unlock; 3270 } 3271 if (mddev->raid_disks) { 3272 printk(KERN_WARNING 3273 "md: array %s already initialised!\n", 3274 mdname(mddev)); 3275 err = -EBUSY; 3276 goto abort_unlock; 3277 } 3278 err = set_array_info(mddev, &info); 3279 if (err) { 3280 printk(KERN_WARNING "md: couldn't set" 3281 " array info. %d\n", err); 3282 goto abort_unlock; 3283 } 3284 } 3285 goto done_unlock; 3286 3287 default:; 3288 } 3289 3290 /* 3291 * Commands querying/configuring an existing array: 3292 */ 3293 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3294 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3295 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3296 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3297 err = -ENODEV; 3298 goto abort_unlock; 3299 } 3300 3301 /* 3302 * Commands even a read-only array can execute: 3303 */ 3304 switch (cmd) 3305 { 3306 case GET_ARRAY_INFO: 3307 err = get_array_info(mddev, argp); 3308 goto done_unlock; 3309 3310 case GET_BITMAP_FILE: 3311 err = get_bitmap_file(mddev, argp); 3312 goto done_unlock; 3313 3314 case GET_DISK_INFO: 3315 err = get_disk_info(mddev, argp); 3316 goto done_unlock; 3317 3318 case RESTART_ARRAY_RW: 3319 err = restart_array(mddev); 3320 goto done_unlock; 3321 3322 case STOP_ARRAY: 3323 err = do_md_stop (mddev, 0); 3324 goto done_unlock; 3325 3326 case STOP_ARRAY_RO: 3327 err = do_md_stop (mddev, 1); 3328 goto done_unlock; 3329 3330 /* 3331 * We have a problem here : there is no easy way to give a CHS 3332 * virtual geometry. We currently pretend that we have a 2 heads 3333 * 4 sectors (with a BIG number of cylinders...). This drives 3334 * dosfs just mad... 
;-) */ 3336 case HDIO_GETGEO: 3337 if (!loc) { 3338 err = -EINVAL; 3339 goto abort_unlock; 3340 } 3341 err = put_user (2, (char __user *) &loc->heads); 3342 if (err) 3343 goto abort_unlock; 3344 err = put_user (4, (char __user *) &loc->sectors); 3345 if (err) 3346 goto abort_unlock; 3347 err = put_user(get_capacity(mddev->gendisk)/8, 3348 (short __user *) &loc->cylinders); 3349 if (err) 3350 goto abort_unlock; 3351 err = put_user (get_start_sect(inode->i_bdev), 3352 (long __user *) &loc->start); 3353 goto done_unlock; 3354 } 3355 3356 /* 3357 * The remaining ioctls are changing the state of the 3358 * superblock, so we do not allow them on read-only arrays. 3359 * However non-MD ioctls (e.g. get-size) will still come through 3360 * here and hit the 'default' below, so only disallow 3361 * 'md' ioctls, and switch to rw mode if started auto-readonly. 3362 */ 3363 if (_IOC_TYPE(cmd) == MD_MAJOR && 3364 mddev->ro && mddev->pers) { 3365 if (mddev->ro == 2) { 3366 mddev->ro = 0; 3367 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3368 md_wakeup_thread(mddev->thread); 3369 3370 } else { 3371 err = -EROFS; 3372 goto abort_unlock; 3373 } 3374 } 3375 3376 switch (cmd) 3377 { 3378 case ADD_NEW_DISK: 3379 { 3380 mdu_disk_info_t info; 3381 if (copy_from_user(&info, argp, sizeof(info))) 3382 err = -EFAULT; 3383 else 3384 err = add_new_disk(mddev, &info); 3385 goto done_unlock; 3386 } 3387 3388 case HOT_REMOVE_DISK: 3389 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3390 goto done_unlock; 3391 3392 case HOT_ADD_DISK: 3393 err = hot_add_disk(mddev, new_decode_dev(arg)); 3394 goto done_unlock; 3395 3396 case SET_DISK_FAULTY: 3397 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3398 goto done_unlock; 3399 3400 case RUN_ARRAY: 3401 err = do_md_run (mddev); 3402 goto done_unlock; 3403 3404 case SET_BITMAP_FILE: 3405 err = set_bitmap_file(mddev, (int)arg); 3406 goto done_unlock; 3407 3408 default: 3409 if (_IOC_TYPE(cmd) == MD_MAJOR) 3410 printk(KERN_WARNING "md: %s(pid %d) used" 3411 " obsolete MD ioctl, upgrade your" 3412 " software to use new ioctls.\n", 3413 current->comm, current->pid); 3414 err = -EINVAL; 3415 goto abort_unlock; 3416 } 3417 3418 done_unlock: 3419 abort_unlock: 3420 mddev_unlock(mddev); 3421 3422 return err; 3423 done: 3424 if (err) 3425 MD_BUG(); 3426 abort: 3427 return err; 3428 } 3429 3430 static int md_open(struct inode *inode, struct file *file) 3431 { 3432 /* 3433 * Succeed if we can lock the mddev, which confirms that 3434 * it isn't being stopped right now.
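 * (We hold the lock only long enough to take a reference via
 * mddev_get(); that raised ->active count is what makes do_md_stop()
 * return -EBUSY while the device is still open somewhere.)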
3435 */ 3436 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3437 int err; 3438 3439 if ((err = mddev_lock(mddev))) 3440 goto out; 3441 3442 err = 0; 3443 mddev_get(mddev); 3444 mddev_unlock(mddev); 3445 3446 check_disk_change(inode->i_bdev); 3447 out: 3448 return err; 3449 } 3450 3451 static int md_release(struct inode *inode, struct file * file) 3452 { 3453 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3454 3455 if (!mddev) 3456 BUG(); 3457 mddev_put(mddev); 3458 3459 return 0; 3460 } 3461 3462 static int md_media_changed(struct gendisk *disk) 3463 { 3464 mddev_t *mddev = disk->private_data; 3465 3466 return mddev->changed; 3467 } 3468 3469 static int md_revalidate(struct gendisk *disk) 3470 { 3471 mddev_t *mddev = disk->private_data; 3472 3473 mddev->changed = 0; 3474 return 0; 3475 } 3476 static struct block_device_operations md_fops = 3477 { 3478 .owner = THIS_MODULE, 3479 .open = md_open, 3480 .release = md_release, 3481 .ioctl = md_ioctl, 3482 .media_changed = md_media_changed, 3483 .revalidate_disk= md_revalidate, 3484 }; 3485 3486 static int md_thread(void * arg) 3487 { 3488 mdk_thread_t *thread = arg; 3489 3490 /* 3491 * md_thread is a 'system-thread'; its priority should be very 3492 * high. We avoid resource deadlocks individually in each 3493 * raid personality. (RAID5 does preallocation) We also use RR and 3494 * the very same RT priority as kswapd, thus we will never get 3495 * into a priority inversion deadlock. 3496 * 3497 * we definitely have to have equal or higher priority than 3498 * bdflush, otherwise bdflush will deadlock if there are too 3499 * many dirty RAID5 blocks. 3500 */ 3501 3502 allow_signal(SIGKILL); 3503 while (!kthread_should_stop()) { 3504 3505 /* We need to wait INTERRUPTIBLE so that 3506 * we don't add to the load-average.
3507 * That means we need to be sure no signals are 3508 * pending 3509 */ 3510 if (signal_pending(current)) 3511 flush_signals(current); 3512 3513 wait_event_interruptible_timeout 3514 (thread->wqueue, 3515 test_bit(THREAD_WAKEUP, &thread->flags) 3516 || kthread_should_stop(), 3517 thread->timeout); 3518 try_to_freeze(); 3519 3520 clear_bit(THREAD_WAKEUP, &thread->flags); 3521 3522 thread->run(thread->mddev); 3523 } 3524 3525 return 0; 3526 } 3527 3528 void md_wakeup_thread(mdk_thread_t *thread) 3529 { 3530 if (thread) { 3531 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3532 set_bit(THREAD_WAKEUP, &thread->flags); 3533 wake_up(&thread->wqueue); 3534 } 3535 } 3536 3537 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3538 const char *name) 3539 { 3540 mdk_thread_t *thread; 3541 3542 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3543 if (!thread) 3544 return NULL; 3545 3546 init_waitqueue_head(&thread->wqueue); 3547 3548 thread->run = run; 3549 thread->mddev = mddev; 3550 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3551 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3552 if (IS_ERR(thread->tsk)) { 3553 kfree(thread); 3554 return NULL; 3555 } 3556 return thread; 3557 } 3558 3559 void md_unregister_thread(mdk_thread_t *thread) 3560 { 3561 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3562 3563 kthread_stop(thread->tsk); 3564 kfree(thread); 3565 } 3566 3567 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3568 { 3569 if (!mddev) { 3570 MD_BUG(); 3571 return; 3572 } 3573 3574 if (!rdev || test_bit(Faulty, &rdev->flags)) 3575 return; 3576 /* 3577 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3578 mdname(mddev), 3579 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3580 __builtin_return_address(0),__builtin_return_address(1), 3581 __builtin_return_address(2),__builtin_return_address(3)); 3582 */ 3583 if (!mddev->pers->error_handler) 3584 return; 3585 mddev->pers->error_handler(mddev,rdev); 3586 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3587 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3588 md_wakeup_thread(mddev->thread); 3589 md_new_event(mddev); 3590 } 3591 3592 /* seq_file implementation /proc/mdstat */ 3593 3594 static void status_unused(struct seq_file *seq) 3595 { 3596 int i = 0; 3597 mdk_rdev_t *rdev; 3598 struct list_head *tmp; 3599 3600 seq_printf(seq, "unused devices: "); 3601 3602 ITERATE_RDEV_PENDING(rdev,tmp) { 3603 char b[BDEVNAME_SIZE]; 3604 i++; 3605 seq_printf(seq, "%s ", 3606 bdevname(rdev->bdev,b)); 3607 } 3608 if (!i) 3609 seq_printf(seq, "<none>"); 3610 3611 seq_printf(seq, "\n"); 3612 } 3613 3614 3615 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3616 { 3617 unsigned long max_blocks, resync, res, dt, db, rt; 3618 3619 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3620 3621 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3622 max_blocks = mddev->resync_max_sectors >> 1; 3623 else 3624 max_blocks = mddev->size; 3625 3626 /* 3627 * Should not happen. 3628 */ 3629 if (!max_blocks) { 3630 MD_BUG(); 3631 return; 3632 } 3633 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3634 { 3635 int i, x = res/50, y = 20-x; 3636 seq_printf(seq, "["); 3637 for (i = 0; i < x; i++) 3638 seq_printf(seq, "="); 3639 seq_printf(seq, ">"); 3640 for (i = 0; i < y; i++) 3641 seq_printf(seq, "."); 3642 seq_printf(seq, "] "); 3643 } 3644 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3645 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
3646 "resync" : "recovery"), 3647 res/10, res % 10, resync, max_blocks); 3648 3649 /* 3650 * We do not want to overflow, so the order of operands and 3651 * the * 100 / 100 trick are important. We do a +1 to be 3652 * safe against division by zero. We only estimate anyway. 3653 * 3654 * dt: time from mark until now 3655 * db: blocks written from mark until now 3656 * rt: remaining time 3657 */ 3658 dt = ((jiffies - mddev->resync_mark) / HZ); 3659 if (!dt) dt++; 3660 db = resync - (mddev->resync_mark_cnt/2); 3661 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3662 3663 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3664 3665 seq_printf(seq, " speed=%ldK/sec", db/dt); 3666 } 3667 3668 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3669 { 3670 struct list_head *tmp; 3671 loff_t l = *pos; 3672 mddev_t *mddev; 3673 3674 if (l >= 0x10000) 3675 return NULL; 3676 if (!l--) 3677 /* header */ 3678 return (void*)1; 3679 3680 spin_lock(&all_mddevs_lock); 3681 list_for_each(tmp,&all_mddevs) 3682 if (!l--) { 3683 mddev = list_entry(tmp, mddev_t, all_mddevs); 3684 mddev_get(mddev); 3685 spin_unlock(&all_mddevs_lock); 3686 return mddev; 3687 } 3688 spin_unlock(&all_mddevs_lock); 3689 if (!l--) 3690 return (void*)2;/* tail */ 3691 return NULL; 3692 } 3693 3694 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3695 { 3696 struct list_head *tmp; 3697 mddev_t *next_mddev, *mddev = v; 3698 3699 ++*pos; 3700 if (v == (void*)2) 3701 return NULL; 3702 3703 spin_lock(&all_mddevs_lock); 3704 if (v == (void*)1) 3705 tmp = all_mddevs.next; 3706 else 3707 tmp = mddev->all_mddevs.next; 3708 if (tmp != &all_mddevs) 3709 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3710 else { 3711 next_mddev = (void*)2; 3712 *pos = 0x10000; 3713 } 3714 spin_unlock(&all_mddevs_lock); 3715 3716 if (v != (void*)1) 3717 mddev_put(mddev); 3718 return next_mddev; 3719 3720 } 3721 3722 static void md_seq_stop(struct seq_file *seq, void *v) 3723 { 3724 mddev_t *mddev = v; 3725 3726 if (mddev && v != (void*)1 && v != (void*)2) 3727 mddev_put(mddev); 3728 } 3729 3730 struct mdstat_info { 3731 int event; 3732 }; 3733 3734 static int md_seq_show(struct seq_file *seq, void *v) 3735 { 3736 mddev_t *mddev = v; 3737 sector_t size; 3738 struct list_head *tmp2; 3739 mdk_rdev_t *rdev; 3740 struct mdstat_info *mi = seq->private; 3741 struct bitmap *bitmap; 3742 3743 if (v == (void*)1) { 3744 struct mdk_personality *pers; 3745 seq_printf(seq, "Personalities : "); 3746 spin_lock(&pers_lock); 3747 list_for_each_entry(pers, &pers_list, list) 3748 seq_printf(seq, "[%s] ", pers->name); 3749 3750 spin_unlock(&pers_lock); 3751 seq_printf(seq, "\n"); 3752 mi->event = atomic_read(&md_event_count); 3753 return 0; 3754 } 3755 if (v == (void*)2) { 3756 status_unused(seq); 3757 return 0; 3758 } 3759 3760 if (mddev_lock(mddev)!=0) 3761 return -EINTR; 3762 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3763 seq_printf(seq, "%s : %sactive", mdname(mddev), 3764 mddev->pers ? 
"" : "in"); 3765 if (mddev->pers) { 3766 if (mddev->ro==1) 3767 seq_printf(seq, " (read-only)"); 3768 if (mddev->ro==2) 3769 seq_printf(seq, "(auto-read-only)"); 3770 seq_printf(seq, " %s", mddev->pers->name); 3771 } 3772 3773 size = 0; 3774 ITERATE_RDEV(mddev,rdev,tmp2) { 3775 char b[BDEVNAME_SIZE]; 3776 seq_printf(seq, " %s[%d]", 3777 bdevname(rdev->bdev,b), rdev->desc_nr); 3778 if (test_bit(WriteMostly, &rdev->flags)) 3779 seq_printf(seq, "(W)"); 3780 if (test_bit(Faulty, &rdev->flags)) { 3781 seq_printf(seq, "(F)"); 3782 continue; 3783 } else if (rdev->raid_disk < 0) 3784 seq_printf(seq, "(S)"); /* spare */ 3785 size += rdev->size; 3786 } 3787 3788 if (!list_empty(&mddev->disks)) { 3789 if (mddev->pers) 3790 seq_printf(seq, "\n %llu blocks", 3791 (unsigned long long)mddev->array_size); 3792 else 3793 seq_printf(seq, "\n %llu blocks", 3794 (unsigned long long)size); 3795 } 3796 if (mddev->persistent) { 3797 if (mddev->major_version != 0 || 3798 mddev->minor_version != 90) { 3799 seq_printf(seq," super %d.%d", 3800 mddev->major_version, 3801 mddev->minor_version); 3802 } 3803 } else 3804 seq_printf(seq, " super non-persistent"); 3805 3806 if (mddev->pers) { 3807 mddev->pers->status (seq, mddev); 3808 seq_printf(seq, "\n "); 3809 if (mddev->pers->sync_request) { 3810 if (mddev->curr_resync > 2) { 3811 status_resync (seq, mddev); 3812 seq_printf(seq, "\n "); 3813 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3814 seq_printf(seq, "\tresync=DELAYED\n "); 3815 else if (mddev->recovery_cp < MaxSector) 3816 seq_printf(seq, "\tresync=PENDING\n "); 3817 } 3818 } else 3819 seq_printf(seq, "\n "); 3820 3821 if ((bitmap = mddev->bitmap)) { 3822 unsigned long chunk_kb; 3823 unsigned long flags; 3824 spin_lock_irqsave(&bitmap->lock, flags); 3825 chunk_kb = bitmap->chunksize >> 10; 3826 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3827 "%lu%s chunk", 3828 bitmap->pages - bitmap->missing_pages, 3829 bitmap->pages, 3830 (bitmap->pages - bitmap->missing_pages) 3831 << (PAGE_SHIFT - 10), 3832 chunk_kb ? chunk_kb : bitmap->chunksize, 3833 chunk_kb ? 
"KB" : "B"); 3834 if (bitmap->file) { 3835 seq_printf(seq, ", file: "); 3836 seq_path(seq, bitmap->file->f_vfsmnt, 3837 bitmap->file->f_dentry," \t\n"); 3838 } 3839 3840 seq_printf(seq, "\n"); 3841 spin_unlock_irqrestore(&bitmap->lock, flags); 3842 } 3843 3844 seq_printf(seq, "\n"); 3845 } 3846 mddev_unlock(mddev); 3847 3848 return 0; 3849 } 3850 3851 static struct seq_operations md_seq_ops = { 3852 .start = md_seq_start, 3853 .next = md_seq_next, 3854 .stop = md_seq_stop, 3855 .show = md_seq_show, 3856 }; 3857 3858 static int md_seq_open(struct inode *inode, struct file *file) 3859 { 3860 int error; 3861 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 3862 if (mi == NULL) 3863 return -ENOMEM; 3864 3865 error = seq_open(file, &md_seq_ops); 3866 if (error) 3867 kfree(mi); 3868 else { 3869 struct seq_file *p = file->private_data; 3870 p->private = mi; 3871 mi->event = atomic_read(&md_event_count); 3872 } 3873 return error; 3874 } 3875 3876 static int md_seq_release(struct inode *inode, struct file *file) 3877 { 3878 struct seq_file *m = file->private_data; 3879 struct mdstat_info *mi = m->private; 3880 m->private = NULL; 3881 kfree(mi); 3882 return seq_release(inode, file); 3883 } 3884 3885 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 3886 { 3887 struct seq_file *m = filp->private_data; 3888 struct mdstat_info *mi = m->private; 3889 int mask; 3890 3891 poll_wait(filp, &md_event_waiters, wait); 3892 3893 /* always allow read */ 3894 mask = POLLIN | POLLRDNORM; 3895 3896 if (mi->event != atomic_read(&md_event_count)) 3897 mask |= POLLERR | POLLPRI; 3898 return mask; 3899 } 3900 3901 static struct file_operations md_seq_fops = { 3902 .open = md_seq_open, 3903 .read = seq_read, 3904 .llseek = seq_lseek, 3905 .release = md_seq_release, 3906 .poll = mdstat_poll, 3907 }; 3908 3909 int register_md_personality(struct mdk_personality *p) 3910 { 3911 spin_lock(&pers_lock); 3912 list_add_tail(&p->list, &pers_list); 3913 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 3914 spin_unlock(&pers_lock); 3915 return 0; 3916 } 3917 3918 int unregister_md_personality(struct mdk_personality *p) 3919 { 3920 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 3921 spin_lock(&pers_lock); 3922 list_del_init(&p->list); 3923 spin_unlock(&pers_lock); 3924 return 0; 3925 } 3926 3927 static int is_mddev_idle(mddev_t *mddev) 3928 { 3929 mdk_rdev_t * rdev; 3930 struct list_head *tmp; 3931 int idle; 3932 unsigned long curr_events; 3933 3934 idle = 1; 3935 ITERATE_RDEV(mddev,rdev,tmp) { 3936 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3937 curr_events = disk_stat_read(disk, sectors[0]) + 3938 disk_stat_read(disk, sectors[1]) - 3939 atomic_read(&disk->sync_io); 3940 /* The difference between curr_events and last_events 3941 * will be affected by any new non-sync IO (making 3942 * curr_events bigger) and any difference in the amount of 3943 * in-flight syncio (making current_events bigger or smaller) 3944 * The amount in-flight is currently limited to 3945 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 3946 * which is at most 4096 sectors. 3947 * These numbers are fairly fragile and should be made 3948 * more robust, probably by enforcing the 3949 * 'window size' that md_do_sync sort-of uses. 3950 * 3951 * Note: the following is an unsigned comparison. 
3952 */ 3953 if ((curr_events - rdev->last_events + 4096) > 8192) { 3954 rdev->last_events = curr_events; 3955 idle = 0; 3956 } 3957 } 3958 return idle; 3959 } 3960 3961 void md_done_sync(mddev_t *mddev, int blocks, int ok) 3962 { 3963 /* another "blocks" (512-byte) blocks have been synced */ 3964 atomic_sub(blocks, &mddev->recovery_active); 3965 wake_up(&mddev->recovery_wait); 3966 if (!ok) { 3967 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3968 md_wakeup_thread(mddev->thread); 3969 // stop recovery, signal do_sync .... 3970 } 3971 } 3972 3973 3974 /* md_write_start(mddev, bi) 3975 * If we need to update some array metadata (e.g. 'active' flag 3976 * in superblock) before writing, schedule a superblock update 3977 * and wait for it to complete. 3978 */ 3979 void md_write_start(mddev_t *mddev, struct bio *bi) 3980 { 3981 if (bio_data_dir(bi) != WRITE) 3982 return; 3983 3984 BUG_ON(mddev->ro == 1); 3985 if (mddev->ro == 2) { 3986 /* need to switch to read/write */ 3987 mddev->ro = 0; 3988 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3989 md_wakeup_thread(mddev->thread); 3990 } 3991 atomic_inc(&mddev->writes_pending); 3992 if (mddev->in_sync) { 3993 spin_lock_irq(&mddev->write_lock); 3994 if (mddev->in_sync) { 3995 mddev->in_sync = 0; 3996 mddev->sb_dirty = 1; 3997 md_wakeup_thread(mddev->thread); 3998 } 3999 spin_unlock_irq(&mddev->write_lock); 4000 } 4001 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 4002 } 4003 4004 void md_write_end(mddev_t *mddev) 4005 { 4006 if (atomic_dec_and_test(&mddev->writes_pending)) { 4007 if (mddev->safemode == 2) 4008 md_wakeup_thread(mddev->thread); 4009 else 4010 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 4011 } 4012 } 4013 4014 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 4015 4016 #define SYNC_MARKS 10 4017 #define SYNC_MARK_STEP (3*HZ) 4018 static void md_do_sync(mddev_t *mddev) 4019 { 4020 mddev_t *mddev2; 4021 unsigned int currspeed = 0, 4022 window; 4023 sector_t max_sectors,j, io_sectors; 4024 unsigned long mark[SYNC_MARKS]; 4025 sector_t mark_cnt[SYNC_MARKS]; 4026 int last_mark,m; 4027 struct list_head *tmp; 4028 sector_t last_check; 4029 int skipped = 0; 4030 4031 /* just in case the thread restarts... */ 4032 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 4033 return; 4034 4035 /* we overload curr_resync somewhat here. 4036 * 0 == not engaged in resync at all 4037 * 2 == checking that there is no conflict with another sync 4038 * 1 == like 2, but have yielded to allow conflicting resync to 4039 * commence 4040 * other == active in resync - this many blocks 4041 * 4042 * Before starting a resync we must have set curr_resync to 4043 * 2, and then checked that every "conflicting" array has curr_resync 4044 * less than ours. When we find one that is the same or higher 4045 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 4046 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 4047 * This will mean we have to start checking from the beginning again.
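 * For example: arrays A and B share a physical unit and A has the
 * lower address. Both enter with curr_resync == 2; A sees the
 * conflict, drops to 1 and sleeps on resync_wait, while B finds no
 * conflicting array at a level >= its own and proceeds with the
 * resync. When B finishes it clears curr_resync and wakes
 * resync_wait, so A restarts the do-loop from curr_resync = 2.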
4048 * 4049 */ 4050 4051 do { 4052 mddev->curr_resync = 2; 4053 4054 try_again: 4055 if (kthread_should_stop()) { 4056 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4057 goto skip; 4058 } 4059 ITERATE_MDDEV(mddev2,tmp) { 4060 if (mddev2 == mddev) 4061 continue; 4062 if (mddev2->curr_resync && 4063 match_mddev_units(mddev,mddev2)) { 4064 DEFINE_WAIT(wq); 4065 if (mddev < mddev2 && mddev->curr_resync == 2) { 4066 /* arbitrarily yield */ 4067 mddev->curr_resync = 1; 4068 wake_up(&resync_wait); 4069 } 4070 if (mddev > mddev2 && mddev->curr_resync == 1) 4071 /* no need to wait here, we can wait the next 4072 * time 'round when curr_resync == 2 4073 */ 4074 continue; 4075 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 4076 if (!kthread_should_stop() && 4077 mddev2->curr_resync >= mddev->curr_resync) { 4078 printk(KERN_INFO "md: delaying resync of %s" 4079 " until %s has finished resync (they" 4080 " share one or more physical units)\n", 4081 mdname(mddev), mdname(mddev2)); 4082 mddev_put(mddev2); 4083 schedule(); 4084 finish_wait(&resync_wait, &wq); 4085 goto try_again; 4086 } 4087 finish_wait(&resync_wait, &wq); 4088 } 4089 } 4090 } while (mddev->curr_resync < 2); 4091 4092 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4093 /* resync follows the size requested by the personality, 4094 * which defaults to physical size, but can be virtual size 4095 */ 4096 max_sectors = mddev->resync_max_sectors; 4097 mddev->resync_mismatches = 0; 4098 } else 4099 /* recovery follows the physical size of devices */ 4100 max_sectors = mddev->size << 1; 4101 4102 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 4103 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 4104 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 4105 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 4106 "(but not more than %d KB/sec) for reconstruction.\n", 4107 sysctl_speed_limit_max); 4108 4109 is_mddev_idle(mddev); /* this also initializes IO event counters */ 4110 /* we don't use the checkpoint if there's a bitmap */ 4111 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap 4112 && ! 
test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 4113 j = mddev->recovery_cp; 4114 else 4115 j = 0; 4116 io_sectors = 0; 4117 for (m = 0; m < SYNC_MARKS; m++) { 4118 mark[m] = jiffies; 4119 mark_cnt[m] = io_sectors; 4120 } 4121 last_mark = 0; 4122 mddev->resync_mark = mark[last_mark]; 4123 mddev->resync_mark_cnt = mark_cnt[last_mark]; 4124 4125 /* 4126 * Tune reconstruction: 4127 */ 4128 window = 32*(PAGE_SIZE/512); 4129 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 4130 window/2,(unsigned long long) max_sectors/2); 4131 4132 atomic_set(&mddev->recovery_active, 0); 4133 init_waitqueue_head(&mddev->recovery_wait); 4134 last_check = 0; 4135 4136 if (j>2) { 4137 printk(KERN_INFO 4138 "md: resuming recovery of %s from checkpoint.\n", 4139 mdname(mddev)); 4140 mddev->curr_resync = j; 4141 } 4142 4143 while (j < max_sectors) { 4144 sector_t sectors; 4145 4146 skipped = 0; 4147 sectors = mddev->pers->sync_request(mddev, j, &skipped, 4148 currspeed < sysctl_speed_limit_min); 4149 if (sectors == 0) { 4150 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4151 goto out; 4152 } 4153 4154 if (!skipped) { /* actual IO requested */ 4155 io_sectors += sectors; 4156 atomic_add(sectors, &mddev->recovery_active); 4157 } 4158 4159 j += sectors; 4160 if (j>1) mddev->curr_resync = j; 4161 if (last_check == 0) 4162 /* this is the earliest that the rebuild will be 4163 * visible in /proc/mdstat 4164 */ 4165 md_new_event(mddev); 4166 4167 if (last_check + window > io_sectors || j == max_sectors) 4168 continue; 4169 4170 last_check = io_sectors; 4171 4172 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 4173 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 4174 break; 4175 4176 repeat: 4177 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 4178 /* step marks */ 4179 int next = (last_mark+1) % SYNC_MARKS; 4180 4181 mddev->resync_mark = mark[next]; 4182 mddev->resync_mark_cnt = mark_cnt[next]; 4183 mark[next] = jiffies; 4184 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 4185 last_mark = next; 4186 } 4187 4188 4189 if (kthread_should_stop()) { 4190 /* 4191 * got a signal, exit. 4192 */ 4193 printk(KERN_INFO 4194 "md: md_do_sync() got signal ... exiting\n"); 4195 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4196 goto out; 4197 } 4198 4199 /* 4200 * this loop exits only when we are slower than the 'hard' 4201 * speed limit, or the system was IO-idle for 4202 * a jiffy. 4203 * the system might be non-idle CPU-wise, but we only care 4204 * about not overloading the IO subsystem.

		if (kthread_should_stop()) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
			       "md: md_do_sync() got signal ... exiting\n");
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only when we are slower than
		 * the 'hard' speed limit, or when the system was IO-idle
		 * for a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		/* KB/sec over the sliding window; the two +1s avoid a zero
		 * divisor and a zero speed */
		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
			    !is_mddev_idle(mddev)) {
				msleep(500);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: sync done.\n", mdname(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	mddev->queue->unplug_fn(mddev->queue);

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync >= mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO
			       "md: checkpointing recovery of %s.\n",
			       mdname(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

 skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}


/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	if (mddev->bitmap)
		bitmap_daemon_work(mddev->bitmap);

	if (mddev->ro)
		return;

	if (signal_pending(current)) {
		if (mddev->pers->sync_request) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if ( ! (
		mddev->sb_dirty ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->safemode == 1) ||
		(mddev->safemode == 2 && !atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;
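
	/*
	 * The gate above, restated as a standalone predicate purely for
	 * illustration (hypothetical helper, not compiled): attention is
	 * needed iff the superblock is dirty, a recovery-state bit wants
	 * service, or safemode asks for an in_sync transition.
	 */
#if 0
	static int md_needs_attention(mddev_t *mddev)
	{
		return mddev->sb_dirty ||
			test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
			test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
			mddev->safemode == 1 ||
			(mddev->safemode == 2 &&
			 !atomic_read(&mddev->writes_pending) &&
			 !mddev->in_sync &&
			 mddev->recovery_cp == MaxSector);
	}
#endif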

	if (mddev_trylock(mddev)==0) {
		int spares = 0;

		spin_lock_irq(&mddev->write_lock);
		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
			mddev->in_sync = 1;
			mddev->sb_dirty = 1;
		}
		if (mddev->safemode == 1)
			mddev->safemode = 0;
		spin_unlock_irq(&mddev->write_lock);

		if (mddev->sb_dirty)
			md_update_sb(mddev);


		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev);

			/* if array is no longer degraded, then any saved_raid_disk
			 * information must be scrapped
			 */
			if (!mddev->degraded)
				ITERATE_RDEV(mddev,rdev,rtmp)
					rdev->saved_raid_disk = -1;

			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_new_event(mddev);
			goto unlock;
		}
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
		ITERATE_RDEV(mddev,rdev,rtmp)
			if (rdev->raid_disk >= 0 &&
			    (test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) &&
			    atomic_read(&rdev->nr_pending)==0) {
				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
					char nm[20];
					sprintf(nm,"rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}

		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !test_bit(Faulty, &rdev->flags)) {
					if (mddev->pers->hot_add_disk(mddev,rdev)) {
						char nm[20];
						sprintf(nm, "rd%d", rdev->raid_disk);
						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
						spares++;
						md_new_event(mddev);
					} else
						break;
				}
		}

		if (spares) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto unlock;
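
		/*
		 * The three-way decision just made, restated as a tiny
		 * helper for clarity (hypothetical enum and function,
		 * illustrative only, not compiled; the case where
		 * MD_RECOVERY_SYNC was already set is omitted):
		 */
#if 0
		enum md_sync_action { MD_ACT_NONE, MD_ACT_RECOVER, MD_ACT_RESYNC };

		static enum md_sync_action pick_action(int spares, sector_t recovery_cp)
		{
			if (spares)
				return MD_ACT_RECOVER;	/* rebuild onto the new spares */
			if (recovery_cp < MaxSector)
				return MD_ACT_RESYNC;	/* finish an interrupted resync */
			return MD_ACT_NONE;		/* nothing to do */
		}
#endif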

		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (spares && mddev->bitmap && !mddev->bitmap->file) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written.
				 */
				bitmap_write_all(mddev->bitmap);
			}
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"%s_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
				       " thread...\n",
				       mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else
				md_wakeup_thread(mddev->sync_thread);
			md_new_event(mddev);
		}
	unlock:
		mddev_unlock(mddev);
	}
}

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
		mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	struct proc_dir_entry *p;

	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	p = create_proc_entry("mdstat", S_IRUGO, NULL);
	if (p)
		p->proc_fops = &md_seq_fops;
}

static int __init md_init(void)
{
	int minor;

	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
	       " MD_SB_DISKS=%d\n",
	       MD_MAJOR_VERSION, MD_MINOR_VERSION,
	       MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
	       BITMAP_MINOR);

	if (register_blkdev(MAJOR_NR, "md"))
		return -1;
	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
		unregister_blkdev(MAJOR_NR, "md");
		return -1;
	}
	devfs_mk_dir("md");
	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
			    md_probe, NULL, NULL);

	for (minor=0; minor < MAX_MD_DEVS; ++minor)
		devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
			      S_IFBLK|S_IRUSR|S_IWUSR,
			      "md/%d", minor);

	for (minor=0; minor < MAX_MD_DEVS; ++minor)
		devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
			      S_IFBLK|S_IRUSR|S_IWUSR,
			      "md/mdp%d", minor);


	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return 0;
}
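
/*
 * Minor-number layout registered above, for reference: the MD_MAJOR
 * region uses one minor per array, while the mdp region reserves
 * 1<<MdpMinorShift (i.e. 64) minors per array so each partitionable
 * array has room for its partitions.  Illustratively:
 *
 *   MKDEV(MAJOR_NR, 3)                        -> devfs node "md/3"
 *   MKDEV(mdp_major, 2<<MdpMinorShift)        -> devfs node "md/mdp2"
 *   MKDEV(mdp_major, (2<<MdpMinorShift) + 1)  -> first partition of that
 *                                                array (no devfs node is
 *                                                created for it here)
 */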

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
static dev_t detected_devices[128];
static int dev_cnt;

void md_autodetect_dev(dev_t dev)
{
	if (dev_cnt >= 0 && dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}


static void autostart_arrays(int part)
{
	mdk_rdev_t *rdev;
	int i;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	for (i = 0; i < dev_cnt; i++) {
		dev_t dev = detected_devices[i];

		rdev = md_import_device(dev,0, 0);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags)) {
			MD_BUG();
			continue;
		}
		list_add(&rdev->same_set, &pending_raid_disks);
	}
	dev_cnt = 0;

	autorun_devices(part);
}

#endif

static __exit void md_exit(void)
{
	mddev_t *mddev;
	struct list_head *tmp;
	int i;

	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
	blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
	for (i=0; i < MAX_MD_DEVS; i++)
		devfs_remove("md/%d", i);
	for (i=0; i < MAX_MD_DEVS; i++)
		devfs_remove("md/mdp%d", i);

	devfs_remove("md");

	unregister_blkdev(MAJOR_NR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
	remove_proc_entry("mdstat", NULL);
	ITERATE_MDDEV(mddev,tmp) {
		struct gendisk *disk = mddev->gendisk;
		if (!disk)
			continue;
		export_array(mddev);
		del_gendisk(disk);
		put_disk(disk);
		mddev->gendisk = NULL;
		mddev_put(mddev);
	}
}

module_init(md_init)
module_exit(md_exit)

static int get_ro(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
	char *e;
	int num = simple_strtoul(val, &e, 10);
	if (*val && (*e == '\0' || *e == '\n')) {
		start_readonly = num;
		return 0;
	}
	return -EINVAL;
}

module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
module_param(start_dirty_degraded, int, 0644);


EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
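
/*
 * Usage note for the "start_ro" parameter registered above (the module
 * name and sysfs path shown are the usual ones; given for illustration):
 *
 *   modprobe md-mod start_ro=1
 *   echo 1 > /sys/module/md_mod/parameters/start_ro
 *
 * With start_ro=1, newly started arrays are treated as read-only until
 * the first write arrives, which gives early-boot code a chance to
 * inspect an array before any resync touches the disks.
 */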