1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/devfs_fs_kernel.h> 43 #include <linux/buffer_head.h> /* for invalidate_bdev */ 44 #include <linux/suspend.h> 45 46 #include <linux/init.h> 47 48 #include <linux/file.h> 49 50 #ifdef CONFIG_KMOD 51 #include <linux/kmod.h> 52 #endif 53 54 #include <asm/unaligned.h> 55 56 #define MAJOR_NR MD_MAJOR 57 #define MD_DRIVER 58 59 /* 63 partitions with the alternate major number (mdp) */ 60 #define MdpMinorShift 6 61 62 #define DEBUG 0 63 #define dprintk(x...) ((void)(DEBUG && printk(x))) 64 65 66 #ifndef MODULE 67 static void autostart_arrays (int part); 68 #endif 69 70 static mdk_personality_t *pers[MAX_PERSONALITY]; 71 static DEFINE_SPINLOCK(pers_lock); 72 73 /* 74 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 75 * is 1000 KB/sec, so the extra system load does not show up that much. 76 * Increase it if you want to have more _guaranteed_ speed. Note that 77 * the RAID driver will use the maximum available bandwidth if the IO 78 * subsystem is idle. There is also an 'absolute maximum' reconstruction 79 * speed limit - in case reconstruction slows down your system despite 80 * idle IO detection. 81 * 82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 83 */ 84 85 static int sysctl_speed_limit_min = 1000; 86 static int sysctl_speed_limit_max = 200000; 87 88 static struct ctl_table_header *raid_table_header; 89 90 static ctl_table raid_table[] = { 91 { 92 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 93 .procname = "speed_limit_min", 94 .data = &sysctl_speed_limit_min, 95 .maxlen = sizeof(int), 96 .mode = 0644, 97 .proc_handler = &proc_dointvec, 98 }, 99 { 100 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 101 .procname = "speed_limit_max", 102 .data = &sysctl_speed_limit_max, 103 .maxlen = sizeof(int), 104 .mode = 0644, 105 .proc_handler = &proc_dointvec, 106 }, 107 { .ctl_name = 0 } 108 }; 109 110 static ctl_table raid_dir_table[] = { 111 { 112 .ctl_name = DEV_RAID, 113 .procname = "raid", 114 .maxlen = 0, 115 .mode = 0555, 116 .child = raid_table, 117 }, 118 { .ctl_name = 0 } 119 }; 120 121 static ctl_table raid_root_table[] = { 122 { 123 .ctl_name = CTL_DEV, 124 .procname = "dev", 125 .maxlen = 0, 126 .mode = 0555, 127 .child = raid_dir_table, 128 }, 129 { .ctl_name = 0 } 130 }; 131 132 static struct block_device_operations md_fops; 133 134 /* 135 * Enables to iterate over all existing md arrays 136 * all_mddevs_lock protects this list. 137 */ 138 static LIST_HEAD(all_mddevs); 139 static DEFINE_SPINLOCK(all_mddevs_lock); 140 141 142 /* 143 * iterates through all used mddevs in the system. 144 * We take care to grab the all_mddevs_lock whenever navigating 145 * the list, and to always hold a refcount when unlocked. 146 * Any code which breaks out of this loop while own 147 * a reference to the current mddev and must mddev_put it. 148 */ 149 #define ITERATE_MDDEV(mddev,tmp) \ 150 \ 151 for (({ spin_lock(&all_mddevs_lock); \ 152 tmp = all_mddevs.next; \ 153 mddev = NULL;}); \ 154 ({ if (tmp != &all_mddevs) \ 155 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 156 spin_unlock(&all_mddevs_lock); \ 157 if (mddev) mddev_put(mddev); \ 158 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 159 tmp != &all_mddevs;}); \ 160 ({ spin_lock(&all_mddevs_lock); \ 161 tmp = tmp->next;}) \ 162 ) 163 164 165 static int md_fail_request (request_queue_t *q, struct bio *bio) 166 { 167 bio_io_error(bio, bio->bi_size); 168 return 0; 169 } 170 171 static inline mddev_t *mddev_get(mddev_t *mddev) 172 { 173 atomic_inc(&mddev->active); 174 return mddev; 175 } 176 177 static void mddev_put(mddev_t *mddev) 178 { 179 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 180 return; 181 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 182 list_del(&mddev->all_mddevs); 183 blk_put_queue(mddev->queue); 184 kobject_unregister(&mddev->kobj); 185 } 186 spin_unlock(&all_mddevs_lock); 187 } 188 189 static mddev_t * mddev_find(dev_t unit) 190 { 191 mddev_t *mddev, *new = NULL; 192 193 retry: 194 spin_lock(&all_mddevs_lock); 195 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 196 if (mddev->unit == unit) { 197 mddev_get(mddev); 198 spin_unlock(&all_mddevs_lock); 199 kfree(new); 200 return mddev; 201 } 202 203 if (new) { 204 list_add(&new->all_mddevs, &all_mddevs); 205 spin_unlock(&all_mddevs_lock); 206 return new; 207 } 208 spin_unlock(&all_mddevs_lock); 209 210 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 211 if (!new) 212 return NULL; 213 214 memset(new, 0, sizeof(*new)); 215 216 new->unit = unit; 217 if (MAJOR(unit) == MD_MAJOR) 218 new->md_minor = MINOR(unit); 219 else 220 new->md_minor = MINOR(unit) >> MdpMinorShift; 221 222 init_MUTEX(&new->reconfig_sem); 223 INIT_LIST_HEAD(&new->disks); 224 INIT_LIST_HEAD(&new->all_mddevs); 225 init_timer(&new->safemode_timer); 226 atomic_set(&new->active, 1); 227 spin_lock_init(&new->write_lock); 228 init_waitqueue_head(&new->sb_wait); 229 230 new->queue = blk_alloc_queue(GFP_KERNEL); 231 if (!new->queue) { 232 kfree(new); 233 return NULL; 234 } 235 236 blk_queue_make_request(new->queue, md_fail_request); 237 238 goto retry; 239 } 240 241 static inline int mddev_lock(mddev_t * mddev) 242 { 243 return down_interruptible(&mddev->reconfig_sem); 244 } 245 246 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 247 { 248 down(&mddev->reconfig_sem); 249 } 250 251 static inline int mddev_trylock(mddev_t * mddev) 252 { 253 return down_trylock(&mddev->reconfig_sem); 254 } 255 256 static inline void mddev_unlock(mddev_t * mddev) 257 { 258 up(&mddev->reconfig_sem); 259 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 mddev_t *mddev = rdev->mddev; 334 if (bio->bi_size) 335 return 1; 336 337 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 338 md_error(mddev, rdev); 339 340 if (atomic_dec_and_test(&mddev->pending_writes)) 341 wake_up(&mddev->sb_wait); 342 bio_put(bio); 343 return 0; 344 } 345 346 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 347 { 348 struct bio *bio2 = bio->bi_private; 349 mdk_rdev_t *rdev = bio2->bi_private; 350 mddev_t *mddev = rdev->mddev; 351 if (bio->bi_size) 352 return 1; 353 354 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 355 error == -EOPNOTSUPP) { 356 unsigned long flags; 357 /* barriers don't appear to be supported :-( */ 358 set_bit(BarriersNotsupp, &rdev->flags); 359 mddev->barriers_work = 0; 360 spin_lock_irqsave(&mddev->write_lock, flags); 361 bio2->bi_next = mddev->biolist; 362 mddev->biolist = bio2; 363 spin_unlock_irqrestore(&mddev->write_lock, flags); 364 wake_up(&mddev->sb_wait); 365 bio_put(bio); 366 return 0; 367 } 368 bio_put(bio2); 369 bio->bi_private = rdev; 370 return super_written(bio, bytes_done, error); 371 } 372 373 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 374 sector_t sector, int size, struct page *page) 375 { 376 /* write first size bytes of page to sector of rdev 377 * Increment mddev->pending_writes before returning 378 * and decrement it on completion, waking up sb_wait 379 * if zero is reached. 380 * If an error occurred, call md_error 381 * 382 * As we might need to resubmit the request if BIO_RW_BARRIER 383 * causes ENOTSUPP, we allocate a spare bio... 384 */ 385 struct bio *bio = bio_alloc(GFP_NOIO, 1); 386 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 387 388 bio->bi_bdev = rdev->bdev; 389 bio->bi_sector = sector; 390 bio_add_page(bio, page, size, 0); 391 bio->bi_private = rdev; 392 bio->bi_end_io = super_written; 393 bio->bi_rw = rw; 394 395 atomic_inc(&mddev->pending_writes); 396 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 397 struct bio *rbio; 398 rw |= (1<<BIO_RW_BARRIER); 399 rbio = bio_clone(bio, GFP_NOIO); 400 rbio->bi_private = bio; 401 rbio->bi_end_io = super_written_barrier; 402 submit_bio(rw, rbio); 403 } else 404 submit_bio(rw, bio); 405 } 406 407 void md_super_wait(mddev_t *mddev) 408 { 409 /* wait for all superblock writes that were scheduled to complete. 410 * if any had to be retried (due to BARRIER problems), retry them 411 */ 412 DEFINE_WAIT(wq); 413 for(;;) { 414 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 415 if (atomic_read(&mddev->pending_writes)==0) 416 break; 417 while (mddev->biolist) { 418 struct bio *bio; 419 spin_lock_irq(&mddev->write_lock); 420 bio = mddev->biolist; 421 mddev->biolist = bio->bi_next ; 422 bio->bi_next = NULL; 423 spin_unlock_irq(&mddev->write_lock); 424 submit_bio(bio->bi_rw, bio); 425 } 426 schedule(); 427 } 428 finish_wait(&mddev->sb_wait, &wq); 429 } 430 431 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 432 { 433 if (bio->bi_size) 434 return 1; 435 436 complete((struct completion*)bio->bi_private); 437 return 0; 438 } 439 440 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 441 struct page *page, int rw) 442 { 443 struct bio *bio = bio_alloc(GFP_NOIO, 1); 444 struct completion event; 445 int ret; 446 447 rw |= (1 << BIO_RW_SYNC); 448 449 bio->bi_bdev = bdev; 450 bio->bi_sector = sector; 451 bio_add_page(bio, page, size, 0); 452 init_completion(&event); 453 bio->bi_private = &event; 454 bio->bi_end_io = bi_complete; 455 submit_bio(rw, bio); 456 wait_for_completion(&event); 457 458 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 459 bio_put(bio); 460 return ret; 461 } 462 463 static int read_disk_sb(mdk_rdev_t * rdev, int size) 464 { 465 char b[BDEVNAME_SIZE]; 466 if (!rdev->sb_page) { 467 MD_BUG(); 468 return -EINVAL; 469 } 470 if (rdev->sb_loaded) 471 return 0; 472 473 474 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 475 goto fail; 476 rdev->sb_loaded = 1; 477 return 0; 478 479 fail: 480 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 481 bdevname(rdev->bdev,b)); 482 return -EINVAL; 483 } 484 485 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 486 { 487 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 488 (sb1->set_uuid1 == sb2->set_uuid1) && 489 (sb1->set_uuid2 == sb2->set_uuid2) && 490 (sb1->set_uuid3 == sb2->set_uuid3)) 491 492 return 1; 493 494 return 0; 495 } 496 497 498 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 499 { 500 int ret; 501 mdp_super_t *tmp1, *tmp2; 502 503 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 504 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 505 506 if (!tmp1 || !tmp2) { 507 ret = 0; 508 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 509 goto abort; 510 } 511 512 *tmp1 = *sb1; 513 *tmp2 = *sb2; 514 515 /* 516 * nr_disks is not constant 517 */ 518 tmp1->nr_disks = 0; 519 tmp2->nr_disks = 0; 520 521 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 522 ret = 0; 523 else 524 ret = 1; 525 526 abort: 527 kfree(tmp1); 528 kfree(tmp2); 529 return ret; 530 } 531 532 static unsigned int calc_sb_csum(mdp_super_t * sb) 533 { 534 unsigned int disk_csum, csum; 535 536 disk_csum = sb->sb_csum; 537 sb->sb_csum = 0; 538 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 539 sb->sb_csum = disk_csum; 540 return csum; 541 } 542 543 544 /* 545 * Handle superblock details. 546 * We want to be able to handle multiple superblock formats 547 * so we have a common interface to them all, and an array of 548 * different handlers. 549 * We rely on user-space to write the initial superblock, and support 550 * reading and updating of superblocks. 551 * Interface methods are: 552 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 553 * loads and validates a superblock on dev. 554 * if refdev != NULL, compare superblocks on both devices 555 * Return: 556 * 0 - dev has a superblock that is compatible with refdev 557 * 1 - dev has a superblock that is compatible and newer than refdev 558 * so dev should be used as the refdev in future 559 * -EINVAL superblock incompatible or invalid 560 * -othererror e.g. -EIO 561 * 562 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 563 * Verify that dev is acceptable into mddev. 564 * The first time, mddev->raid_disks will be 0, and data from 565 * dev should be merged in. Subsequent calls check that dev 566 * is new enough. Return 0 or -EINVAL 567 * 568 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 569 * Update the superblock for rdev with data in mddev 570 * This does not write to disc. 571 * 572 */ 573 574 struct super_type { 575 char *name; 576 struct module *owner; 577 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 578 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 579 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 580 }; 581 582 /* 583 * load_super for 0.90.0 584 */ 585 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 586 { 587 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 588 mdp_super_t *sb; 589 int ret; 590 sector_t sb_offset; 591 592 /* 593 * Calculate the position of the superblock, 594 * it's at the end of the disk. 595 * 596 * It also happens to be a multiple of 4Kb. 597 */ 598 sb_offset = calc_dev_sboffset(rdev->bdev); 599 rdev->sb_offset = sb_offset; 600 601 ret = read_disk_sb(rdev, MD_SB_BYTES); 602 if (ret) return ret; 603 604 ret = -EINVAL; 605 606 bdevname(rdev->bdev, b); 607 sb = (mdp_super_t*)page_address(rdev->sb_page); 608 609 if (sb->md_magic != MD_SB_MAGIC) { 610 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 611 b); 612 goto abort; 613 } 614 615 if (sb->major_version != 0 || 616 sb->minor_version != 90) { 617 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 618 sb->major_version, sb->minor_version, 619 b); 620 goto abort; 621 } 622 623 if (sb->raid_disks <= 0) 624 goto abort; 625 626 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 627 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 628 b); 629 goto abort; 630 } 631 632 rdev->preferred_minor = sb->md_minor; 633 rdev->data_offset = 0; 634 rdev->sb_size = MD_SB_BYTES; 635 636 if (sb->level == LEVEL_MULTIPATH) 637 rdev->desc_nr = -1; 638 else 639 rdev->desc_nr = sb->this_disk.number; 640 641 if (refdev == 0) 642 ret = 1; 643 else { 644 __u64 ev1, ev2; 645 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 646 if (!uuid_equal(refsb, sb)) { 647 printk(KERN_WARNING "md: %s has different UUID to %s\n", 648 b, bdevname(refdev->bdev,b2)); 649 goto abort; 650 } 651 if (!sb_equal(refsb, sb)) { 652 printk(KERN_WARNING "md: %s has same UUID" 653 " but different superblock to %s\n", 654 b, bdevname(refdev->bdev, b2)); 655 goto abort; 656 } 657 ev1 = md_event(sb); 658 ev2 = md_event(refsb); 659 if (ev1 > ev2) 660 ret = 1; 661 else 662 ret = 0; 663 } 664 rdev->size = calc_dev_size(rdev, sb->chunk_size); 665 666 abort: 667 return ret; 668 } 669 670 /* 671 * validate_super for 0.90.0 672 */ 673 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 674 { 675 mdp_disk_t *desc; 676 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 677 678 rdev->raid_disk = -1; 679 rdev->flags = 0; 680 if (mddev->raid_disks == 0) { 681 mddev->major_version = 0; 682 mddev->minor_version = sb->minor_version; 683 mddev->patch_version = sb->patch_version; 684 mddev->persistent = ! sb->not_persistent; 685 mddev->chunk_size = sb->chunk_size; 686 mddev->ctime = sb->ctime; 687 mddev->utime = sb->utime; 688 mddev->level = sb->level; 689 mddev->layout = sb->layout; 690 mddev->raid_disks = sb->raid_disks; 691 mddev->size = sb->size; 692 mddev->events = md_event(sb); 693 mddev->bitmap_offset = 0; 694 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 695 696 if (sb->state & (1<<MD_SB_CLEAN)) 697 mddev->recovery_cp = MaxSector; 698 else { 699 if (sb->events_hi == sb->cp_events_hi && 700 sb->events_lo == sb->cp_events_lo) { 701 mddev->recovery_cp = sb->recovery_cp; 702 } else 703 mddev->recovery_cp = 0; 704 } 705 706 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 707 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 708 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 709 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 710 711 mddev->max_disks = MD_SB_DISKS; 712 713 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 714 mddev->bitmap_file == NULL) { 715 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { 716 /* FIXME use a better test */ 717 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 718 return -EINVAL; 719 } 720 mddev->bitmap_offset = mddev->default_bitmap_offset; 721 } 722 723 } else if (mddev->pers == NULL) { 724 /* Insist on good event counter while assembling */ 725 __u64 ev1 = md_event(sb); 726 ++ev1; 727 if (ev1 < mddev->events) 728 return -EINVAL; 729 } else if (mddev->bitmap) { 730 /* if adding to array with a bitmap, then we can accept an 731 * older device ... but not too old. 732 */ 733 __u64 ev1 = md_event(sb); 734 if (ev1 < mddev->bitmap->events_cleared) 735 return 0; 736 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 737 return 0; 738 739 if (mddev->level != LEVEL_MULTIPATH) { 740 desc = sb->disks + rdev->desc_nr; 741 742 if (desc->state & (1<<MD_DISK_FAULTY)) 743 set_bit(Faulty, &rdev->flags); 744 else if (desc->state & (1<<MD_DISK_SYNC) && 745 desc->raid_disk < mddev->raid_disks) { 746 set_bit(In_sync, &rdev->flags); 747 rdev->raid_disk = desc->raid_disk; 748 } 749 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 750 set_bit(WriteMostly, &rdev->flags); 751 } else /* MULTIPATH are always insync */ 752 set_bit(In_sync, &rdev->flags); 753 return 0; 754 } 755 756 /* 757 * sync_super for 0.90.0 758 */ 759 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 760 { 761 mdp_super_t *sb; 762 struct list_head *tmp; 763 mdk_rdev_t *rdev2; 764 int next_spare = mddev->raid_disks; 765 char nm[20]; 766 767 /* make rdev->sb match mddev data.. 768 * 769 * 1/ zero out disks 770 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 771 * 3/ any empty disks < next_spare become removed 772 * 773 * disks[0] gets initialised to REMOVED because 774 * we cannot be sure from other fields if it has 775 * been initialised or not. 776 */ 777 int i; 778 int active=0, working=0,failed=0,spare=0,nr_disks=0; 779 unsigned int fixdesc=0; 780 781 rdev->sb_size = MD_SB_BYTES; 782 783 sb = (mdp_super_t*)page_address(rdev->sb_page); 784 785 memset(sb, 0, sizeof(*sb)); 786 787 sb->md_magic = MD_SB_MAGIC; 788 sb->major_version = mddev->major_version; 789 sb->minor_version = mddev->minor_version; 790 sb->patch_version = mddev->patch_version; 791 sb->gvalid_words = 0; /* ignored */ 792 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 793 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 794 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 795 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 796 797 sb->ctime = mddev->ctime; 798 sb->level = mddev->level; 799 sb->size = mddev->size; 800 sb->raid_disks = mddev->raid_disks; 801 sb->md_minor = mddev->md_minor; 802 sb->not_persistent = !mddev->persistent; 803 sb->utime = mddev->utime; 804 sb->state = 0; 805 sb->events_hi = (mddev->events>>32); 806 sb->events_lo = (u32)mddev->events; 807 808 if (mddev->in_sync) 809 { 810 sb->recovery_cp = mddev->recovery_cp; 811 sb->cp_events_hi = (mddev->events>>32); 812 sb->cp_events_lo = (u32)mddev->events; 813 if (mddev->recovery_cp == MaxSector) 814 sb->state = (1<< MD_SB_CLEAN); 815 } else 816 sb->recovery_cp = 0; 817 818 sb->layout = mddev->layout; 819 sb->chunk_size = mddev->chunk_size; 820 821 if (mddev->bitmap && mddev->bitmap_file == NULL) 822 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 823 824 sb->disks[0].state = (1<<MD_DISK_REMOVED); 825 ITERATE_RDEV(mddev,rdev2,tmp) { 826 mdp_disk_t *d; 827 int desc_nr; 828 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 829 && !test_bit(Faulty, &rdev2->flags)) 830 desc_nr = rdev2->raid_disk; 831 else 832 desc_nr = next_spare++; 833 if (desc_nr != rdev2->desc_nr) { 834 fixdesc |= (1 << desc_nr); 835 rdev2->desc_nr = desc_nr; 836 if (rdev2->raid_disk >= 0) { 837 sprintf(nm, "rd%d", rdev2->raid_disk); 838 sysfs_remove_link(&mddev->kobj, nm); 839 } 840 sysfs_remove_link(&rdev2->kobj, "block"); 841 kobject_del(&rdev2->kobj); 842 } 843 d = &sb->disks[rdev2->desc_nr]; 844 nr_disks++; 845 d->number = rdev2->desc_nr; 846 d->major = MAJOR(rdev2->bdev->bd_dev); 847 d->minor = MINOR(rdev2->bdev->bd_dev); 848 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 849 && !test_bit(Faulty, &rdev2->flags)) 850 d->raid_disk = rdev2->raid_disk; 851 else 852 d->raid_disk = rdev2->desc_nr; /* compatibility */ 853 if (test_bit(Faulty, &rdev2->flags)) { 854 d->state = (1<<MD_DISK_FAULTY); 855 failed++; 856 } else if (test_bit(In_sync, &rdev2->flags)) { 857 d->state = (1<<MD_DISK_ACTIVE); 858 d->state |= (1<<MD_DISK_SYNC); 859 active++; 860 working++; 861 } else { 862 d->state = 0; 863 spare++; 864 working++; 865 } 866 if (test_bit(WriteMostly, &rdev2->flags)) 867 d->state |= (1<<MD_DISK_WRITEMOSTLY); 868 } 869 if (fixdesc) 870 ITERATE_RDEV(mddev,rdev2,tmp) 871 if (fixdesc & (1<<rdev2->desc_nr)) { 872 snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", 873 rdev2->desc_nr); 874 /* kobject_add gets a ref on the parent, so 875 * we have to drop the one we already have 876 */ 877 kobject_add(&rdev2->kobj); 878 kobject_put(rdev->kobj.parent); 879 sysfs_create_link(&rdev2->kobj, 880 &rdev2->bdev->bd_disk->kobj, 881 "block"); 882 if (rdev2->raid_disk >= 0) { 883 sprintf(nm, "rd%d", rdev2->raid_disk); 884 sysfs_create_link(&mddev->kobj, 885 &rdev2->kobj, nm); 886 } 887 } 888 /* now set the "removed" and "faulty" bits on any missing devices */ 889 for (i=0 ; i < mddev->raid_disks ; i++) { 890 mdp_disk_t *d = &sb->disks[i]; 891 if (d->state == 0 && d->number == 0) { 892 d->number = i; 893 d->raid_disk = i; 894 d->state = (1<<MD_DISK_REMOVED); 895 d->state |= (1<<MD_DISK_FAULTY); 896 failed++; 897 } 898 } 899 sb->nr_disks = nr_disks; 900 sb->active_disks = active; 901 sb->working_disks = working; 902 sb->failed_disks = failed; 903 sb->spare_disks = spare; 904 905 sb->this_disk = sb->disks[rdev->desc_nr]; 906 sb->sb_csum = calc_sb_csum(sb); 907 } 908 909 /* 910 * version 1 superblock 911 */ 912 913 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 914 { 915 unsigned int disk_csum, csum; 916 unsigned long long newcsum; 917 int size = 256 + le32_to_cpu(sb->max_dev)*2; 918 unsigned int *isuper = (unsigned int*)sb; 919 int i; 920 921 disk_csum = sb->sb_csum; 922 sb->sb_csum = 0; 923 newcsum = 0; 924 for (i=0; size>=4; size -= 4 ) 925 newcsum += le32_to_cpu(*isuper++); 926 927 if (size == 2) 928 newcsum += le16_to_cpu(*(unsigned short*) isuper); 929 930 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 931 sb->sb_csum = disk_csum; 932 return cpu_to_le32(csum); 933 } 934 935 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 936 { 937 struct mdp_superblock_1 *sb; 938 int ret; 939 sector_t sb_offset; 940 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 941 int bmask; 942 943 /* 944 * Calculate the position of the superblock. 945 * It is always aligned to a 4K boundary and 946 * depeding on minor_version, it can be: 947 * 0: At least 8K, but less than 12K, from end of device 948 * 1: At start of device 949 * 2: 4K from start of device. 950 */ 951 switch(minor_version) { 952 case 0: 953 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 954 sb_offset -= 8*2; 955 sb_offset &= ~(sector_t)(4*2-1); 956 /* convert from sectors to K */ 957 sb_offset /= 2; 958 break; 959 case 1: 960 sb_offset = 0; 961 break; 962 case 2: 963 sb_offset = 4; 964 break; 965 default: 966 return -EINVAL; 967 } 968 rdev->sb_offset = sb_offset; 969 970 /* superblock is rarely larger than 1K, but it can be larger, 971 * and it is safe to read 4k, so we do that 972 */ 973 ret = read_disk_sb(rdev, 4096); 974 if (ret) return ret; 975 976 977 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 978 979 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 980 sb->major_version != cpu_to_le32(1) || 981 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 982 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 983 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 984 return -EINVAL; 985 986 if (calc_sb_1_csum(sb) != sb->sb_csum) { 987 printk("md: invalid superblock checksum on %s\n", 988 bdevname(rdev->bdev,b)); 989 return -EINVAL; 990 } 991 if (le64_to_cpu(sb->data_size) < 10) { 992 printk("md: data_size too small on %s\n", 993 bdevname(rdev->bdev,b)); 994 return -EINVAL; 995 } 996 rdev->preferred_minor = 0xffff; 997 rdev->data_offset = le64_to_cpu(sb->data_offset); 998 999 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1000 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1001 if (rdev->sb_size & bmask) 1002 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1003 1004 if (refdev == 0) 1005 return 1; 1006 else { 1007 __u64 ev1, ev2; 1008 struct mdp_superblock_1 *refsb = 1009 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1010 1011 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1012 sb->level != refsb->level || 1013 sb->layout != refsb->layout || 1014 sb->chunksize != refsb->chunksize) { 1015 printk(KERN_WARNING "md: %s has strangely different" 1016 " superblock to %s\n", 1017 bdevname(rdev->bdev,b), 1018 bdevname(refdev->bdev,b2)); 1019 return -EINVAL; 1020 } 1021 ev1 = le64_to_cpu(sb->events); 1022 ev2 = le64_to_cpu(refsb->events); 1023 1024 if (ev1 > ev2) 1025 return 1; 1026 } 1027 if (minor_version) 1028 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1029 else 1030 rdev->size = rdev->sb_offset; 1031 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1032 return -EINVAL; 1033 rdev->size = le64_to_cpu(sb->data_size)/2; 1034 if (le32_to_cpu(sb->chunksize)) 1035 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1036 return 0; 1037 } 1038 1039 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1040 { 1041 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1042 1043 rdev->raid_disk = -1; 1044 rdev->flags = 0; 1045 if (mddev->raid_disks == 0) { 1046 mddev->major_version = 1; 1047 mddev->patch_version = 0; 1048 mddev->persistent = 1; 1049 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1050 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1051 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1052 mddev->level = le32_to_cpu(sb->level); 1053 mddev->layout = le32_to_cpu(sb->layout); 1054 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1055 mddev->size = le64_to_cpu(sb->size)/2; 1056 mddev->events = le64_to_cpu(sb->events); 1057 mddev->bitmap_offset = 0; 1058 mddev->default_bitmap_offset = 0; 1059 mddev->default_bitmap_offset = 1024; 1060 1061 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1062 memcpy(mddev->uuid, sb->set_uuid, 16); 1063 1064 mddev->max_disks = (4096-256)/2; 1065 1066 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1067 mddev->bitmap_file == NULL ) { 1068 if (mddev->level != 1) { 1069 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 1070 return -EINVAL; 1071 } 1072 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1073 } 1074 } else if (mddev->pers == NULL) { 1075 /* Insist of good event counter while assembling */ 1076 __u64 ev1 = le64_to_cpu(sb->events); 1077 ++ev1; 1078 if (ev1 < mddev->events) 1079 return -EINVAL; 1080 } else if (mddev->bitmap) { 1081 /* If adding to array with a bitmap, then we can accept an 1082 * older device, but not too old. 1083 */ 1084 __u64 ev1 = le64_to_cpu(sb->events); 1085 if (ev1 < mddev->bitmap->events_cleared) 1086 return 0; 1087 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 1088 return 0; 1089 1090 if (mddev->level != LEVEL_MULTIPATH) { 1091 int role; 1092 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1093 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1094 switch(role) { 1095 case 0xffff: /* spare */ 1096 break; 1097 case 0xfffe: /* faulty */ 1098 set_bit(Faulty, &rdev->flags); 1099 break; 1100 default: 1101 set_bit(In_sync, &rdev->flags); 1102 rdev->raid_disk = role; 1103 break; 1104 } 1105 if (sb->devflags & WriteMostly1) 1106 set_bit(WriteMostly, &rdev->flags); 1107 } else /* MULTIPATH are always insync */ 1108 set_bit(In_sync, &rdev->flags); 1109 1110 return 0; 1111 } 1112 1113 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1114 { 1115 struct mdp_superblock_1 *sb; 1116 struct list_head *tmp; 1117 mdk_rdev_t *rdev2; 1118 int max_dev, i; 1119 /* make rdev->sb match mddev and rdev data. */ 1120 1121 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1122 1123 sb->feature_map = 0; 1124 sb->pad0 = 0; 1125 memset(sb->pad1, 0, sizeof(sb->pad1)); 1126 memset(sb->pad2, 0, sizeof(sb->pad2)); 1127 memset(sb->pad3, 0, sizeof(sb->pad3)); 1128 1129 sb->utime = cpu_to_le64((__u64)mddev->utime); 1130 sb->events = cpu_to_le64(mddev->events); 1131 if (mddev->in_sync) 1132 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1133 else 1134 sb->resync_offset = cpu_to_le64(0); 1135 1136 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1137 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1138 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1139 } 1140 1141 max_dev = 0; 1142 ITERATE_RDEV(mddev,rdev2,tmp) 1143 if (rdev2->desc_nr+1 > max_dev) 1144 max_dev = rdev2->desc_nr+1; 1145 1146 sb->max_dev = cpu_to_le32(max_dev); 1147 for (i=0; i<max_dev;i++) 1148 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1149 1150 ITERATE_RDEV(mddev,rdev2,tmp) { 1151 i = rdev2->desc_nr; 1152 if (test_bit(Faulty, &rdev2->flags)) 1153 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1154 else if (test_bit(In_sync, &rdev2->flags)) 1155 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1156 else 1157 sb->dev_roles[i] = cpu_to_le16(0xffff); 1158 } 1159 1160 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1161 sb->sb_csum = calc_sb_1_csum(sb); 1162 } 1163 1164 1165 static struct super_type super_types[] = { 1166 [0] = { 1167 .name = "0.90.0", 1168 .owner = THIS_MODULE, 1169 .load_super = super_90_load, 1170 .validate_super = super_90_validate, 1171 .sync_super = super_90_sync, 1172 }, 1173 [1] = { 1174 .name = "md-1", 1175 .owner = THIS_MODULE, 1176 .load_super = super_1_load, 1177 .validate_super = super_1_validate, 1178 .sync_super = super_1_sync, 1179 }, 1180 }; 1181 1182 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1183 { 1184 struct list_head *tmp; 1185 mdk_rdev_t *rdev; 1186 1187 ITERATE_RDEV(mddev,rdev,tmp) 1188 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1189 return rdev; 1190 1191 return NULL; 1192 } 1193 1194 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1195 { 1196 struct list_head *tmp; 1197 mdk_rdev_t *rdev; 1198 1199 ITERATE_RDEV(mddev1,rdev,tmp) 1200 if (match_dev_unit(mddev2, rdev)) 1201 return 1; 1202 1203 return 0; 1204 } 1205 1206 static LIST_HEAD(pending_raid_disks); 1207 1208 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1209 { 1210 mdk_rdev_t *same_pdev; 1211 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1212 1213 if (rdev->mddev) { 1214 MD_BUG(); 1215 return -EINVAL; 1216 } 1217 same_pdev = match_dev_unit(mddev, rdev); 1218 if (same_pdev) 1219 printk(KERN_WARNING 1220 "%s: WARNING: %s appears to be on the same physical" 1221 " disk as %s. True\n protection against single-disk" 1222 " failure might be compromised.\n", 1223 mdname(mddev), bdevname(rdev->bdev,b), 1224 bdevname(same_pdev->bdev,b2)); 1225 1226 /* Verify rdev->desc_nr is unique. 1227 * If it is -1, assign a free number, else 1228 * check number is not in use 1229 */ 1230 if (rdev->desc_nr < 0) { 1231 int choice = 0; 1232 if (mddev->pers) choice = mddev->raid_disks; 1233 while (find_rdev_nr(mddev, choice)) 1234 choice++; 1235 rdev->desc_nr = choice; 1236 } else { 1237 if (find_rdev_nr(mddev, rdev->desc_nr)) 1238 return -EBUSY; 1239 } 1240 1241 list_add(&rdev->same_set, &mddev->disks); 1242 rdev->mddev = mddev; 1243 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1244 1245 rdev->kobj.k_name = NULL; 1246 snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); 1247 rdev->kobj.parent = &mddev->kobj; 1248 kobject_add(&rdev->kobj); 1249 1250 sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); 1251 return 0; 1252 } 1253 1254 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1255 { 1256 char b[BDEVNAME_SIZE]; 1257 if (!rdev->mddev) { 1258 MD_BUG(); 1259 return; 1260 } 1261 list_del_init(&rdev->same_set); 1262 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1263 rdev->mddev = NULL; 1264 sysfs_remove_link(&rdev->kobj, "block"); 1265 kobject_del(&rdev->kobj); 1266 } 1267 1268 /* 1269 * prevent the device from being mounted, repartitioned or 1270 * otherwise reused by a RAID array (or any other kernel 1271 * subsystem), by bd_claiming the device. 1272 */ 1273 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1274 { 1275 int err = 0; 1276 struct block_device *bdev; 1277 char b[BDEVNAME_SIZE]; 1278 1279 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1280 if (IS_ERR(bdev)) { 1281 printk(KERN_ERR "md: could not open %s.\n", 1282 __bdevname(dev, b)); 1283 return PTR_ERR(bdev); 1284 } 1285 err = bd_claim(bdev, rdev); 1286 if (err) { 1287 printk(KERN_ERR "md: could not bd_claim %s.\n", 1288 bdevname(bdev, b)); 1289 blkdev_put(bdev); 1290 return err; 1291 } 1292 rdev->bdev = bdev; 1293 return err; 1294 } 1295 1296 static void unlock_rdev(mdk_rdev_t *rdev) 1297 { 1298 struct block_device *bdev = rdev->bdev; 1299 rdev->bdev = NULL; 1300 if (!bdev) 1301 MD_BUG(); 1302 bd_release(bdev); 1303 blkdev_put(bdev); 1304 } 1305 1306 void md_autodetect_dev(dev_t dev); 1307 1308 static void export_rdev(mdk_rdev_t * rdev) 1309 { 1310 char b[BDEVNAME_SIZE]; 1311 printk(KERN_INFO "md: export_rdev(%s)\n", 1312 bdevname(rdev->bdev,b)); 1313 if (rdev->mddev) 1314 MD_BUG(); 1315 free_disk_sb(rdev); 1316 list_del_init(&rdev->same_set); 1317 #ifndef MODULE 1318 md_autodetect_dev(rdev->bdev->bd_dev); 1319 #endif 1320 unlock_rdev(rdev); 1321 kobject_put(&rdev->kobj); 1322 } 1323 1324 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1325 { 1326 unbind_rdev_from_array(rdev); 1327 export_rdev(rdev); 1328 } 1329 1330 static void export_array(mddev_t *mddev) 1331 { 1332 struct list_head *tmp; 1333 mdk_rdev_t *rdev; 1334 1335 ITERATE_RDEV(mddev,rdev,tmp) { 1336 if (!rdev->mddev) { 1337 MD_BUG(); 1338 continue; 1339 } 1340 kick_rdev_from_array(rdev); 1341 } 1342 if (!list_empty(&mddev->disks)) 1343 MD_BUG(); 1344 mddev->raid_disks = 0; 1345 mddev->major_version = 0; 1346 } 1347 1348 static void print_desc(mdp_disk_t *desc) 1349 { 1350 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1351 desc->major,desc->minor,desc->raid_disk,desc->state); 1352 } 1353 1354 static void print_sb(mdp_super_t *sb) 1355 { 1356 int i; 1357 1358 printk(KERN_INFO 1359 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1360 sb->major_version, sb->minor_version, sb->patch_version, 1361 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1362 sb->ctime); 1363 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1364 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1365 sb->md_minor, sb->layout, sb->chunk_size); 1366 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1367 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1368 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1369 sb->failed_disks, sb->spare_disks, 1370 sb->sb_csum, (unsigned long)sb->events_lo); 1371 1372 printk(KERN_INFO); 1373 for (i = 0; i < MD_SB_DISKS; i++) { 1374 mdp_disk_t *desc; 1375 1376 desc = sb->disks + i; 1377 if (desc->number || desc->major || desc->minor || 1378 desc->raid_disk || (desc->state && (desc->state != 4))) { 1379 printk(" D %2d: ", i); 1380 print_desc(desc); 1381 } 1382 } 1383 printk(KERN_INFO "md: THIS: "); 1384 print_desc(&sb->this_disk); 1385 1386 } 1387 1388 static void print_rdev(mdk_rdev_t *rdev) 1389 { 1390 char b[BDEVNAME_SIZE]; 1391 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1392 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1393 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1394 rdev->desc_nr); 1395 if (rdev->sb_loaded) { 1396 printk(KERN_INFO "md: rdev superblock:\n"); 1397 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1398 } else 1399 printk(KERN_INFO "md: no rdev superblock!\n"); 1400 } 1401 1402 void md_print_devices(void) 1403 { 1404 struct list_head *tmp, *tmp2; 1405 mdk_rdev_t *rdev; 1406 mddev_t *mddev; 1407 char b[BDEVNAME_SIZE]; 1408 1409 printk("\n"); 1410 printk("md: **********************************\n"); 1411 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1412 printk("md: **********************************\n"); 1413 ITERATE_MDDEV(mddev,tmp) { 1414 1415 if (mddev->bitmap) 1416 bitmap_print_sb(mddev->bitmap); 1417 else 1418 printk("%s: ", mdname(mddev)); 1419 ITERATE_RDEV(mddev,rdev,tmp2) 1420 printk("<%s>", bdevname(rdev->bdev,b)); 1421 printk("\n"); 1422 1423 ITERATE_RDEV(mddev,rdev,tmp2) 1424 print_rdev(rdev); 1425 } 1426 printk("md: **********************************\n"); 1427 printk("\n"); 1428 } 1429 1430 1431 static void sync_sbs(mddev_t * mddev) 1432 { 1433 mdk_rdev_t *rdev; 1434 struct list_head *tmp; 1435 1436 ITERATE_RDEV(mddev,rdev,tmp) { 1437 super_types[mddev->major_version]. 1438 sync_super(mddev, rdev); 1439 rdev->sb_loaded = 1; 1440 } 1441 } 1442 1443 static void md_update_sb(mddev_t * mddev) 1444 { 1445 int err; 1446 struct list_head *tmp; 1447 mdk_rdev_t *rdev; 1448 int sync_req; 1449 1450 repeat: 1451 spin_lock_irq(&mddev->write_lock); 1452 sync_req = mddev->in_sync; 1453 mddev->utime = get_seconds(); 1454 mddev->events ++; 1455 1456 if (!mddev->events) { 1457 /* 1458 * oops, this 64-bit counter should never wrap. 1459 * Either we are in around ~1 trillion A.C., assuming 1460 * 1 reboot per second, or we have a bug: 1461 */ 1462 MD_BUG(); 1463 mddev->events --; 1464 } 1465 mddev->sb_dirty = 2; 1466 sync_sbs(mddev); 1467 1468 /* 1469 * do not write anything to disk if using 1470 * nonpersistent superblocks 1471 */ 1472 if (!mddev->persistent) { 1473 mddev->sb_dirty = 0; 1474 spin_unlock_irq(&mddev->write_lock); 1475 wake_up(&mddev->sb_wait); 1476 return; 1477 } 1478 spin_unlock_irq(&mddev->write_lock); 1479 1480 dprintk(KERN_INFO 1481 "md: updating %s RAID superblock on device (in sync %d)\n", 1482 mdname(mddev),mddev->in_sync); 1483 1484 err = bitmap_update_sb(mddev->bitmap); 1485 ITERATE_RDEV(mddev,rdev,tmp) { 1486 char b[BDEVNAME_SIZE]; 1487 dprintk(KERN_INFO "md: "); 1488 if (test_bit(Faulty, &rdev->flags)) 1489 dprintk("(skipping faulty "); 1490 1491 dprintk("%s ", bdevname(rdev->bdev,b)); 1492 if (!test_bit(Faulty, &rdev->flags)) { 1493 md_super_write(mddev,rdev, 1494 rdev->sb_offset<<1, rdev->sb_size, 1495 rdev->sb_page); 1496 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1497 bdevname(rdev->bdev,b), 1498 (unsigned long long)rdev->sb_offset); 1499 1500 } else 1501 dprintk(")\n"); 1502 if (mddev->level == LEVEL_MULTIPATH) 1503 /* only need to write one superblock... */ 1504 break; 1505 } 1506 md_super_wait(mddev); 1507 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1508 1509 spin_lock_irq(&mddev->write_lock); 1510 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1511 /* have to write it out again */ 1512 spin_unlock_irq(&mddev->write_lock); 1513 goto repeat; 1514 } 1515 mddev->sb_dirty = 0; 1516 spin_unlock_irq(&mddev->write_lock); 1517 wake_up(&mddev->sb_wait); 1518 1519 } 1520 1521 struct rdev_sysfs_entry { 1522 struct attribute attr; 1523 ssize_t (*show)(mdk_rdev_t *, char *); 1524 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1525 }; 1526 1527 static ssize_t 1528 rdev_show_state(mdk_rdev_t *rdev, char *page) 1529 { 1530 char *sep = ""; 1531 int len=0; 1532 1533 if (test_bit(Faulty, &rdev->flags)) { 1534 len+= sprintf(page+len, "%sfaulty",sep); 1535 sep = ","; 1536 } 1537 if (test_bit(In_sync, &rdev->flags)) { 1538 len += sprintf(page+len, "%sin_sync",sep); 1539 sep = ","; 1540 } 1541 if (!test_bit(Faulty, &rdev->flags) && 1542 !test_bit(In_sync, &rdev->flags)) { 1543 len += sprintf(page+len, "%sspare", sep); 1544 sep = ","; 1545 } 1546 return len+sprintf(page+len, "\n"); 1547 } 1548 1549 static struct rdev_sysfs_entry rdev_state = { 1550 .attr = {.name = "state", .mode = S_IRUGO }, 1551 .show = rdev_show_state, 1552 }; 1553 1554 static ssize_t 1555 rdev_show_super(mdk_rdev_t *rdev, char *page) 1556 { 1557 if (rdev->sb_loaded && rdev->sb_size) { 1558 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1559 return rdev->sb_size; 1560 } else 1561 return 0; 1562 } 1563 static struct rdev_sysfs_entry rdev_super = { 1564 .attr = {.name = "super", .mode = S_IRUGO }, 1565 .show = rdev_show_super, 1566 }; 1567 static struct attribute *rdev_default_attrs[] = { 1568 &rdev_state.attr, 1569 &rdev_super.attr, 1570 NULL, 1571 }; 1572 static ssize_t 1573 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1574 { 1575 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1576 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1577 1578 if (!entry->show) 1579 return -EIO; 1580 return entry->show(rdev, page); 1581 } 1582 1583 static ssize_t 1584 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1585 const char *page, size_t length) 1586 { 1587 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1588 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1589 1590 if (!entry->store) 1591 return -EIO; 1592 return entry->store(rdev, page, length); 1593 } 1594 1595 static void rdev_free(struct kobject *ko) 1596 { 1597 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1598 kfree(rdev); 1599 } 1600 static struct sysfs_ops rdev_sysfs_ops = { 1601 .show = rdev_attr_show, 1602 .store = rdev_attr_store, 1603 }; 1604 static struct kobj_type rdev_ktype = { 1605 .release = rdev_free, 1606 .sysfs_ops = &rdev_sysfs_ops, 1607 .default_attrs = rdev_default_attrs, 1608 }; 1609 1610 /* 1611 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1612 * 1613 * mark the device faulty if: 1614 * 1615 * - the device is nonexistent (zero size) 1616 * - the device has no valid superblock 1617 * 1618 * a faulty rdev _never_ has rdev->sb set. 1619 */ 1620 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1621 { 1622 char b[BDEVNAME_SIZE]; 1623 int err; 1624 mdk_rdev_t *rdev; 1625 sector_t size; 1626 1627 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1628 if (!rdev) { 1629 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1630 return ERR_PTR(-ENOMEM); 1631 } 1632 memset(rdev, 0, sizeof(*rdev)); 1633 1634 if ((err = alloc_disk_sb(rdev))) 1635 goto abort_free; 1636 1637 err = lock_rdev(rdev, newdev); 1638 if (err) 1639 goto abort_free; 1640 1641 rdev->kobj.parent = NULL; 1642 rdev->kobj.ktype = &rdev_ktype; 1643 kobject_init(&rdev->kobj); 1644 1645 rdev->desc_nr = -1; 1646 rdev->flags = 0; 1647 rdev->data_offset = 0; 1648 atomic_set(&rdev->nr_pending, 0); 1649 atomic_set(&rdev->read_errors, 0); 1650 1651 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1652 if (!size) { 1653 printk(KERN_WARNING 1654 "md: %s has zero or unknown size, marking faulty!\n", 1655 bdevname(rdev->bdev,b)); 1656 err = -EINVAL; 1657 goto abort_free; 1658 } 1659 1660 if (super_format >= 0) { 1661 err = super_types[super_format]. 1662 load_super(rdev, NULL, super_minor); 1663 if (err == -EINVAL) { 1664 printk(KERN_WARNING 1665 "md: %s has invalid sb, not importing!\n", 1666 bdevname(rdev->bdev,b)); 1667 goto abort_free; 1668 } 1669 if (err < 0) { 1670 printk(KERN_WARNING 1671 "md: could not read %s's sb, not importing!\n", 1672 bdevname(rdev->bdev,b)); 1673 goto abort_free; 1674 } 1675 } 1676 INIT_LIST_HEAD(&rdev->same_set); 1677 1678 return rdev; 1679 1680 abort_free: 1681 if (rdev->sb_page) { 1682 if (rdev->bdev) 1683 unlock_rdev(rdev); 1684 free_disk_sb(rdev); 1685 } 1686 kfree(rdev); 1687 return ERR_PTR(err); 1688 } 1689 1690 /* 1691 * Check a full RAID array for plausibility 1692 */ 1693 1694 1695 static void analyze_sbs(mddev_t * mddev) 1696 { 1697 int i; 1698 struct list_head *tmp; 1699 mdk_rdev_t *rdev, *freshest; 1700 char b[BDEVNAME_SIZE]; 1701 1702 freshest = NULL; 1703 ITERATE_RDEV(mddev,rdev,tmp) 1704 switch (super_types[mddev->major_version]. 1705 load_super(rdev, freshest, mddev->minor_version)) { 1706 case 1: 1707 freshest = rdev; 1708 break; 1709 case 0: 1710 break; 1711 default: 1712 printk( KERN_ERR \ 1713 "md: fatal superblock inconsistency in %s" 1714 " -- removing from array\n", 1715 bdevname(rdev->bdev,b)); 1716 kick_rdev_from_array(rdev); 1717 } 1718 1719 1720 super_types[mddev->major_version]. 1721 validate_super(mddev, freshest); 1722 1723 i = 0; 1724 ITERATE_RDEV(mddev,rdev,tmp) { 1725 if (rdev != freshest) 1726 if (super_types[mddev->major_version]. 1727 validate_super(mddev, rdev)) { 1728 printk(KERN_WARNING "md: kicking non-fresh %s" 1729 " from array!\n", 1730 bdevname(rdev->bdev,b)); 1731 kick_rdev_from_array(rdev); 1732 continue; 1733 } 1734 if (mddev->level == LEVEL_MULTIPATH) { 1735 rdev->desc_nr = i++; 1736 rdev->raid_disk = rdev->desc_nr; 1737 set_bit(In_sync, &rdev->flags); 1738 } 1739 } 1740 1741 1742 1743 if (mddev->recovery_cp != MaxSector && 1744 mddev->level >= 1) 1745 printk(KERN_ERR "md: %s: raid array is not clean" 1746 " -- starting background reconstruction\n", 1747 mdname(mddev)); 1748 1749 } 1750 1751 static ssize_t 1752 md_show_level(mddev_t *mddev, char *page) 1753 { 1754 mdk_personality_t *p = mddev->pers; 1755 if (p == NULL) 1756 return 0; 1757 if (mddev->level >= 0) 1758 return sprintf(page, "RAID-%d\n", mddev->level); 1759 else 1760 return sprintf(page, "%s\n", p->name); 1761 } 1762 1763 static struct md_sysfs_entry md_level = { 1764 .attr = {.name = "level", .mode = S_IRUGO }, 1765 .show = md_show_level, 1766 }; 1767 1768 static ssize_t 1769 md_show_rdisks(mddev_t *mddev, char *page) 1770 { 1771 return sprintf(page, "%d\n", mddev->raid_disks); 1772 } 1773 1774 static struct md_sysfs_entry md_raid_disks = { 1775 .attr = {.name = "raid_disks", .mode = S_IRUGO }, 1776 .show = md_show_rdisks, 1777 }; 1778 1779 static ssize_t 1780 md_show_scan(mddev_t *mddev, char *page) 1781 { 1782 char *type = "none"; 1783 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1784 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 1785 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1786 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1787 type = "resync"; 1788 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1789 type = "check"; 1790 else 1791 type = "repair"; 1792 } else 1793 type = "recover"; 1794 } 1795 return sprintf(page, "%s\n", type); 1796 } 1797 1798 static ssize_t 1799 md_store_scan(mddev_t *mddev, const char *page, size_t len) 1800 { 1801 int canscan=0; 1802 1803 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 1804 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 1805 return -EBUSY; 1806 down(&mddev->reconfig_sem); 1807 if (mddev->pers && mddev->pers->sync_request) 1808 canscan=1; 1809 up(&mddev->reconfig_sem); 1810 if (!canscan) 1811 return -EINVAL; 1812 1813 if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) 1814 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 1815 else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) 1816 return -EINVAL; 1817 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 1818 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 1819 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1820 md_wakeup_thread(mddev->thread); 1821 return len; 1822 } 1823 1824 static ssize_t 1825 md_show_mismatch(mddev_t *mddev, char *page) 1826 { 1827 return sprintf(page, "%llu\n", 1828 (unsigned long long) mddev->resync_mismatches); 1829 } 1830 1831 static struct md_sysfs_entry md_scan_mode = { 1832 .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, 1833 .show = md_show_scan, 1834 .store = md_store_scan, 1835 }; 1836 1837 static struct md_sysfs_entry md_mismatches = { 1838 .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, 1839 .show = md_show_mismatch, 1840 }; 1841 1842 static struct attribute *md_default_attrs[] = { 1843 &md_level.attr, 1844 &md_raid_disks.attr, 1845 &md_scan_mode.attr, 1846 &md_mismatches.attr, 1847 NULL, 1848 }; 1849 1850 static ssize_t 1851 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1852 { 1853 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1854 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 1855 1856 if (!entry->show) 1857 return -EIO; 1858 return entry->show(mddev, page); 1859 } 1860 1861 static ssize_t 1862 md_attr_store(struct kobject *kobj, struct attribute *attr, 1863 const char *page, size_t length) 1864 { 1865 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 1866 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 1867 1868 if (!entry->store) 1869 return -EIO; 1870 return entry->store(mddev, page, length); 1871 } 1872 1873 static void md_free(struct kobject *ko) 1874 { 1875 mddev_t *mddev = container_of(ko, mddev_t, kobj); 1876 kfree(mddev); 1877 } 1878 1879 static struct sysfs_ops md_sysfs_ops = { 1880 .show = md_attr_show, 1881 .store = md_attr_store, 1882 }; 1883 static struct kobj_type md_ktype = { 1884 .release = md_free, 1885 .sysfs_ops = &md_sysfs_ops, 1886 .default_attrs = md_default_attrs, 1887 }; 1888 1889 int mdp_major = 0; 1890 1891 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1892 { 1893 static DECLARE_MUTEX(disks_sem); 1894 mddev_t *mddev = mddev_find(dev); 1895 struct gendisk *disk; 1896 int partitioned = (MAJOR(dev) != MD_MAJOR); 1897 int shift = partitioned ? MdpMinorShift : 0; 1898 int unit = MINOR(dev) >> shift; 1899 1900 if (!mddev) 1901 return NULL; 1902 1903 down(&disks_sem); 1904 if (mddev->gendisk) { 1905 up(&disks_sem); 1906 mddev_put(mddev); 1907 return NULL; 1908 } 1909 disk = alloc_disk(1 << shift); 1910 if (!disk) { 1911 up(&disks_sem); 1912 mddev_put(mddev); 1913 return NULL; 1914 } 1915 disk->major = MAJOR(dev); 1916 disk->first_minor = unit << shift; 1917 if (partitioned) { 1918 sprintf(disk->disk_name, "md_d%d", unit); 1919 sprintf(disk->devfs_name, "md/d%d", unit); 1920 } else { 1921 sprintf(disk->disk_name, "md%d", unit); 1922 sprintf(disk->devfs_name, "md/%d", unit); 1923 } 1924 disk->fops = &md_fops; 1925 disk->private_data = mddev; 1926 disk->queue = mddev->queue; 1927 add_disk(disk); 1928 mddev->gendisk = disk; 1929 up(&disks_sem); 1930 mddev->kobj.parent = &disk->kobj; 1931 mddev->kobj.k_name = NULL; 1932 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 1933 mddev->kobj.ktype = &md_ktype; 1934 kobject_register(&mddev->kobj); 1935 return NULL; 1936 } 1937 1938 void md_wakeup_thread(mdk_thread_t *thread); 1939 1940 static void md_safemode_timeout(unsigned long data) 1941 { 1942 mddev_t *mddev = (mddev_t *) data; 1943 1944 mddev->safemode = 1; 1945 md_wakeup_thread(mddev->thread); 1946 } 1947 1948 1949 static int do_md_run(mddev_t * mddev) 1950 { 1951 int pnum, err; 1952 int chunk_size; 1953 struct list_head *tmp; 1954 mdk_rdev_t *rdev; 1955 struct gendisk *disk; 1956 char b[BDEVNAME_SIZE]; 1957 1958 if (list_empty(&mddev->disks)) 1959 /* cannot run an array with no devices.. */ 1960 return -EINVAL; 1961 1962 if (mddev->pers) 1963 return -EBUSY; 1964 1965 /* 1966 * Analyze all RAID superblock(s) 1967 */ 1968 if (!mddev->raid_disks) 1969 analyze_sbs(mddev); 1970 1971 chunk_size = mddev->chunk_size; 1972 pnum = level_to_pers(mddev->level); 1973 1974 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1975 if (!chunk_size) { 1976 /* 1977 * 'default chunksize' in the old md code used to 1978 * be PAGE_SIZE, baaad. 1979 * we abort here to be on the safe side. We don't 1980 * want to continue the bad practice. 1981 */ 1982 printk(KERN_ERR 1983 "no chunksize specified, see 'man raidtab'\n"); 1984 return -EINVAL; 1985 } 1986 if (chunk_size > MAX_CHUNK_SIZE) { 1987 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1988 chunk_size, MAX_CHUNK_SIZE); 1989 return -EINVAL; 1990 } 1991 /* 1992 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1993 */ 1994 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1995 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1996 return -EINVAL; 1997 } 1998 if (chunk_size < PAGE_SIZE) { 1999 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 2000 chunk_size, PAGE_SIZE); 2001 return -EINVAL; 2002 } 2003 2004 /* devices must have minimum size of one chunk */ 2005 ITERATE_RDEV(mddev,rdev,tmp) { 2006 if (test_bit(Faulty, &rdev->flags)) 2007 continue; 2008 if (rdev->size < chunk_size / 1024) { 2009 printk(KERN_WARNING 2010 "md: Dev %s smaller than chunk_size:" 2011 " %lluk < %dk\n", 2012 bdevname(rdev->bdev,b), 2013 (unsigned long long)rdev->size, 2014 chunk_size / 1024); 2015 return -EINVAL; 2016 } 2017 } 2018 } 2019 2020 #ifdef CONFIG_KMOD 2021 if (!pers[pnum]) 2022 { 2023 request_module("md-personality-%d", pnum); 2024 } 2025 #endif 2026 2027 /* 2028 * Drop all container device buffers, from now on 2029 * the only valid external interface is through the md 2030 * device. 2031 * Also find largest hardsector size 2032 */ 2033 ITERATE_RDEV(mddev,rdev,tmp) { 2034 if (test_bit(Faulty, &rdev->flags)) 2035 continue; 2036 sync_blockdev(rdev->bdev); 2037 invalidate_bdev(rdev->bdev, 0); 2038 } 2039 2040 md_probe(mddev->unit, NULL, NULL); 2041 disk = mddev->gendisk; 2042 if (!disk) 2043 return -ENOMEM; 2044 2045 spin_lock(&pers_lock); 2046 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 2047 spin_unlock(&pers_lock); 2048 printk(KERN_WARNING "md: personality %d is not loaded!\n", 2049 pnum); 2050 return -EINVAL; 2051 } 2052 2053 mddev->pers = pers[pnum]; 2054 spin_unlock(&pers_lock); 2055 2056 mddev->recovery = 0; 2057 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2058 mddev->barriers_work = 1; 2059 2060 /* before we start the array running, initialise the bitmap */ 2061 err = bitmap_create(mddev); 2062 if (err) 2063 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 2064 mdname(mddev), err); 2065 else 2066 err = mddev->pers->run(mddev); 2067 if (err) { 2068 printk(KERN_ERR "md: pers->run() failed ...\n"); 2069 module_put(mddev->pers->owner); 2070 mddev->pers = NULL; 2071 bitmap_destroy(mddev); 2072 return err; 2073 } 2074 atomic_set(&mddev->writes_pending,0); 2075 mddev->safemode = 0; 2076 mddev->safemode_timer.function = md_safemode_timeout; 2077 mddev->safemode_timer.data = (unsigned long) mddev; 2078 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 2079 mddev->in_sync = 1; 2080 2081 ITERATE_RDEV(mddev,rdev,tmp) 2082 if (rdev->raid_disk >= 0) { 2083 char nm[20]; 2084 sprintf(nm, "rd%d", rdev->raid_disk); 2085 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2086 } 2087 2088 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2089 md_wakeup_thread(mddev->thread); 2090 2091 if (mddev->sb_dirty) 2092 md_update_sb(mddev); 2093 2094 set_capacity(disk, mddev->array_size<<1); 2095 2096 /* If we call blk_queue_make_request here, it will 2097 * re-initialise max_sectors etc which may have been 2098 * refined inside -> run. So just set the bits we need to set. 2099 * Most initialisation happended when we called 2100 * blk_queue_make_request(..., md_fail_request) 2101 * earlier. 2102 */ 2103 mddev->queue->queuedata = mddev; 2104 mddev->queue->make_request_fn = mddev->pers->make_request; 2105 2106 mddev->changed = 1; 2107 return 0; 2108 } 2109 2110 static int restart_array(mddev_t *mddev) 2111 { 2112 struct gendisk *disk = mddev->gendisk; 2113 int err; 2114 2115 /* 2116 * Complain if it has no devices 2117 */ 2118 err = -ENXIO; 2119 if (list_empty(&mddev->disks)) 2120 goto out; 2121 2122 if (mddev->pers) { 2123 err = -EBUSY; 2124 if (!mddev->ro) 2125 goto out; 2126 2127 mddev->safemode = 0; 2128 mddev->ro = 0; 2129 set_disk_ro(disk, 0); 2130 2131 printk(KERN_INFO "md: %s switched to read-write mode.\n", 2132 mdname(mddev)); 2133 /* 2134 * Kick recovery or resync if necessary 2135 */ 2136 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2137 md_wakeup_thread(mddev->thread); 2138 err = 0; 2139 } else { 2140 printk(KERN_ERR "md: %s has no personality assigned.\n", 2141 mdname(mddev)); 2142 err = -EINVAL; 2143 } 2144 2145 out: 2146 return err; 2147 } 2148 2149 static int do_md_stop(mddev_t * mddev, int ro) 2150 { 2151 int err = 0; 2152 struct gendisk *disk = mddev->gendisk; 2153 2154 if (mddev->pers) { 2155 if (atomic_read(&mddev->active)>2) { 2156 printk("md: %s still in use.\n",mdname(mddev)); 2157 return -EBUSY; 2158 } 2159 2160 if (mddev->sync_thread) { 2161 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2162 md_unregister_thread(mddev->sync_thread); 2163 mddev->sync_thread = NULL; 2164 } 2165 2166 del_timer_sync(&mddev->safemode_timer); 2167 2168 invalidate_partition(disk, 0); 2169 2170 if (ro) { 2171 err = -ENXIO; 2172 if (mddev->ro) 2173 goto out; 2174 mddev->ro = 1; 2175 } else { 2176 bitmap_flush(mddev); 2177 md_super_wait(mddev); 2178 if (mddev->ro) 2179 set_disk_ro(disk, 0); 2180 blk_queue_make_request(mddev->queue, md_fail_request); 2181 mddev->pers->stop(mddev); 2182 module_put(mddev->pers->owner); 2183 mddev->pers = NULL; 2184 if (mddev->ro) 2185 mddev->ro = 0; 2186 } 2187 if (!mddev->in_sync) { 2188 /* mark array as shutdown cleanly */ 2189 mddev->in_sync = 1; 2190 md_update_sb(mddev); 2191 } 2192 if (ro) 2193 set_disk_ro(disk, 1); 2194 } 2195 2196 bitmap_destroy(mddev); 2197 if (mddev->bitmap_file) { 2198 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 2199 fput(mddev->bitmap_file); 2200 mddev->bitmap_file = NULL; 2201 } 2202 mddev->bitmap_offset = 0; 2203 2204 /* 2205 * Free resources if final stop 2206 */ 2207 if (!ro) { 2208 mdk_rdev_t *rdev; 2209 struct list_head *tmp; 2210 struct gendisk *disk; 2211 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 2212 2213 ITERATE_RDEV(mddev,rdev,tmp) 2214 if (rdev->raid_disk >= 0) { 2215 char nm[20]; 2216 sprintf(nm, "rd%d", rdev->raid_disk); 2217 sysfs_remove_link(&mddev->kobj, nm); 2218 } 2219 2220 export_array(mddev); 2221 2222 mddev->array_size = 0; 2223 disk = mddev->gendisk; 2224 if (disk) 2225 set_capacity(disk, 0); 2226 mddev->changed = 1; 2227 } else 2228 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2229 mdname(mddev)); 2230 err = 0; 2231 out: 2232 return err; 2233 } 2234 2235 static void autorun_array(mddev_t *mddev) 2236 { 2237 mdk_rdev_t *rdev; 2238 struct list_head *tmp; 2239 int err; 2240 2241 if (list_empty(&mddev->disks)) 2242 return; 2243 2244 printk(KERN_INFO "md: running: "); 2245 2246 ITERATE_RDEV(mddev,rdev,tmp) { 2247 char b[BDEVNAME_SIZE]; 2248 printk("<%s>", bdevname(rdev->bdev,b)); 2249 } 2250 printk("\n"); 2251 2252 err = do_md_run (mddev); 2253 if (err) { 2254 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 2255 do_md_stop (mddev, 0); 2256 } 2257 } 2258 2259 /* 2260 * lets try to run arrays based on all disks that have arrived 2261 * until now. (those are in pending_raid_disks) 2262 * 2263 * the method: pick the first pending disk, collect all disks with 2264 * the same UUID, remove all from the pending list and put them into 2265 * the 'same_array' list. Then order this list based on superblock 2266 * update time (freshest comes first), kick out 'old' disks and 2267 * compare superblocks. If everything's fine then run it. 2268 * 2269 * If "unit" is allocated, then bump its reference count 2270 */ 2271 static void autorun_devices(int part) 2272 { 2273 struct list_head candidates; 2274 struct list_head *tmp; 2275 mdk_rdev_t *rdev0, *rdev; 2276 mddev_t *mddev; 2277 char b[BDEVNAME_SIZE]; 2278 2279 printk(KERN_INFO "md: autorun ...\n"); 2280 while (!list_empty(&pending_raid_disks)) { 2281 dev_t dev; 2282 rdev0 = list_entry(pending_raid_disks.next, 2283 mdk_rdev_t, same_set); 2284 2285 printk(KERN_INFO "md: considering %s ...\n", 2286 bdevname(rdev0->bdev,b)); 2287 INIT_LIST_HEAD(&candidates); 2288 ITERATE_RDEV_PENDING(rdev,tmp) 2289 if (super_90_load(rdev, rdev0, 0) >= 0) { 2290 printk(KERN_INFO "md: adding %s ...\n", 2291 bdevname(rdev->bdev,b)); 2292 list_move(&rdev->same_set, &candidates); 2293 } 2294 /* 2295 * now we have a set of devices, with all of them having 2296 * mostly sane superblocks. It's time to allocate the 2297 * mddev. 2298 */ 2299 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 2300 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 2301 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 2302 break; 2303 } 2304 if (part) 2305 dev = MKDEV(mdp_major, 2306 rdev0->preferred_minor << MdpMinorShift); 2307 else 2308 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 2309 2310 md_probe(dev, NULL, NULL); 2311 mddev = mddev_find(dev); 2312 if (!mddev) { 2313 printk(KERN_ERR 2314 "md: cannot allocate memory for md drive.\n"); 2315 break; 2316 } 2317 if (mddev_lock(mddev)) 2318 printk(KERN_WARNING "md: %s locked, cannot run\n", 2319 mdname(mddev)); 2320 else if (mddev->raid_disks || mddev->major_version 2321 || !list_empty(&mddev->disks)) { 2322 printk(KERN_WARNING 2323 "md: %s already running, cannot run %s\n", 2324 mdname(mddev), bdevname(rdev0->bdev,b)); 2325 mddev_unlock(mddev); 2326 } else { 2327 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 2328 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 2329 list_del_init(&rdev->same_set); 2330 if (bind_rdev_to_array(rdev, mddev)) 2331 export_rdev(rdev); 2332 } 2333 autorun_array(mddev); 2334 mddev_unlock(mddev); 2335 } 2336 /* on success, candidates will be empty, on error 2337 * it won't... 2338 */ 2339 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 2340 export_rdev(rdev); 2341 mddev_put(mddev); 2342 } 2343 printk(KERN_INFO "md: ... autorun DONE.\n"); 2344 } 2345 2346 /* 2347 * import RAID devices based on one partition 2348 * if possible, the array gets run as well. 2349 */ 2350 2351 static int autostart_array(dev_t startdev) 2352 { 2353 char b[BDEVNAME_SIZE]; 2354 int err = -EINVAL, i; 2355 mdp_super_t *sb = NULL; 2356 mdk_rdev_t *start_rdev = NULL, *rdev; 2357 2358 start_rdev = md_import_device(startdev, 0, 0); 2359 if (IS_ERR(start_rdev)) 2360 return err; 2361 2362 2363 /* NOTE: this can only work for 0.90.0 superblocks */ 2364 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2365 if (sb->major_version != 0 || 2366 sb->minor_version != 90 ) { 2367 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2368 export_rdev(start_rdev); 2369 return err; 2370 } 2371 2372 if (test_bit(Faulty, &start_rdev->flags)) { 2373 printk(KERN_WARNING 2374 "md: can not autostart based on faulty %s!\n", 2375 bdevname(start_rdev->bdev,b)); 2376 export_rdev(start_rdev); 2377 return err; 2378 } 2379 list_add(&start_rdev->same_set, &pending_raid_disks); 2380 2381 for (i = 0; i < MD_SB_DISKS; i++) { 2382 mdp_disk_t *desc = sb->disks + i; 2383 dev_t dev = MKDEV(desc->major, desc->minor); 2384 2385 if (!dev) 2386 continue; 2387 if (dev == startdev) 2388 continue; 2389 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2390 continue; 2391 rdev = md_import_device(dev, 0, 0); 2392 if (IS_ERR(rdev)) 2393 continue; 2394 2395 list_add(&rdev->same_set, &pending_raid_disks); 2396 } 2397 2398 /* 2399 * possibly return codes 2400 */ 2401 autorun_devices(0); 2402 return 0; 2403 2404 } 2405 2406 2407 static int get_version(void __user * arg) 2408 { 2409 mdu_version_t ver; 2410 2411 ver.major = MD_MAJOR_VERSION; 2412 ver.minor = MD_MINOR_VERSION; 2413 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2414 2415 if (copy_to_user(arg, &ver, sizeof(ver))) 2416 return -EFAULT; 2417 2418 return 0; 2419 } 2420 2421 static int get_array_info(mddev_t * mddev, void __user * arg) 2422 { 2423 mdu_array_info_t info; 2424 int nr,working,active,failed,spare; 2425 mdk_rdev_t *rdev; 2426 struct list_head *tmp; 2427 2428 nr=working=active=failed=spare=0; 2429 ITERATE_RDEV(mddev,rdev,tmp) { 2430 nr++; 2431 if (test_bit(Faulty, &rdev->flags)) 2432 failed++; 2433 else { 2434 working++; 2435 if (test_bit(In_sync, &rdev->flags)) 2436 active++; 2437 else 2438 spare++; 2439 } 2440 } 2441 2442 info.major_version = mddev->major_version; 2443 info.minor_version = mddev->minor_version; 2444 info.patch_version = MD_PATCHLEVEL_VERSION; 2445 info.ctime = mddev->ctime; 2446 info.level = mddev->level; 2447 info.size = mddev->size; 2448 info.nr_disks = nr; 2449 info.raid_disks = mddev->raid_disks; 2450 info.md_minor = mddev->md_minor; 2451 info.not_persistent= !mddev->persistent; 2452 2453 info.utime = mddev->utime; 2454 info.state = 0; 2455 if (mddev->in_sync) 2456 info.state = (1<<MD_SB_CLEAN); 2457 if (mddev->bitmap && mddev->bitmap_offset) 2458 info.state = (1<<MD_SB_BITMAP_PRESENT); 2459 info.active_disks = active; 2460 info.working_disks = working; 2461 info.failed_disks = failed; 2462 info.spare_disks = spare; 2463 2464 info.layout = mddev->layout; 2465 info.chunk_size = mddev->chunk_size; 2466 2467 if (copy_to_user(arg, &info, sizeof(info))) 2468 return -EFAULT; 2469 2470 return 0; 2471 } 2472 2473 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2474 { 2475 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2476 char *ptr, *buf = NULL; 2477 int err = -ENOMEM; 2478 2479 file = kmalloc(sizeof(*file), GFP_KERNEL); 2480 if (!file) 2481 goto out; 2482 2483 /* bitmap disabled, zero the first byte and copy out */ 2484 if (!mddev->bitmap || !mddev->bitmap->file) { 2485 file->pathname[0] = '\0'; 2486 goto copy_out; 2487 } 2488 2489 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2490 if (!buf) 2491 goto out; 2492 2493 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2494 if (!ptr) 2495 goto out; 2496 2497 strcpy(file->pathname, ptr); 2498 2499 copy_out: 2500 err = 0; 2501 if (copy_to_user(arg, file, sizeof(*file))) 2502 err = -EFAULT; 2503 out: 2504 kfree(buf); 2505 kfree(file); 2506 return err; 2507 } 2508 2509 static int get_disk_info(mddev_t * mddev, void __user * arg) 2510 { 2511 mdu_disk_info_t info; 2512 unsigned int nr; 2513 mdk_rdev_t *rdev; 2514 2515 if (copy_from_user(&info, arg, sizeof(info))) 2516 return -EFAULT; 2517 2518 nr = info.number; 2519 2520 rdev = find_rdev_nr(mddev, nr); 2521 if (rdev) { 2522 info.major = MAJOR(rdev->bdev->bd_dev); 2523 info.minor = MINOR(rdev->bdev->bd_dev); 2524 info.raid_disk = rdev->raid_disk; 2525 info.state = 0; 2526 if (test_bit(Faulty, &rdev->flags)) 2527 info.state |= (1<<MD_DISK_FAULTY); 2528 else if (test_bit(In_sync, &rdev->flags)) { 2529 info.state |= (1<<MD_DISK_ACTIVE); 2530 info.state |= (1<<MD_DISK_SYNC); 2531 } 2532 if (test_bit(WriteMostly, &rdev->flags)) 2533 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2534 } else { 2535 info.major = info.minor = 0; 2536 info.raid_disk = -1; 2537 info.state = (1<<MD_DISK_REMOVED); 2538 } 2539 2540 if (copy_to_user(arg, &info, sizeof(info))) 2541 return -EFAULT; 2542 2543 return 0; 2544 } 2545 2546 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2547 { 2548 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2549 mdk_rdev_t *rdev; 2550 dev_t dev = MKDEV(info->major,info->minor); 2551 2552 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2553 return -EOVERFLOW; 2554 2555 if (!mddev->raid_disks) { 2556 int err; 2557 /* expecting a device which has a superblock */ 2558 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2559 if (IS_ERR(rdev)) { 2560 printk(KERN_WARNING 2561 "md: md_import_device returned %ld\n", 2562 PTR_ERR(rdev)); 2563 return PTR_ERR(rdev); 2564 } 2565 if (!list_empty(&mddev->disks)) { 2566 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2567 mdk_rdev_t, same_set); 2568 int err = super_types[mddev->major_version] 2569 .load_super(rdev, rdev0, mddev->minor_version); 2570 if (err < 0) { 2571 printk(KERN_WARNING 2572 "md: %s has different UUID to %s\n", 2573 bdevname(rdev->bdev,b), 2574 bdevname(rdev0->bdev,b2)); 2575 export_rdev(rdev); 2576 return -EINVAL; 2577 } 2578 } 2579 err = bind_rdev_to_array(rdev, mddev); 2580 if (err) 2581 export_rdev(rdev); 2582 return err; 2583 } 2584 2585 /* 2586 * add_new_disk can be used once the array is assembled 2587 * to add "hot spares". They must already have a superblock 2588 * written 2589 */ 2590 if (mddev->pers) { 2591 int err; 2592 if (!mddev->pers->hot_add_disk) { 2593 printk(KERN_WARNING 2594 "%s: personality does not support diskops!\n", 2595 mdname(mddev)); 2596 return -EINVAL; 2597 } 2598 if (mddev->persistent) 2599 rdev = md_import_device(dev, mddev->major_version, 2600 mddev->minor_version); 2601 else 2602 rdev = md_import_device(dev, -1, -1); 2603 if (IS_ERR(rdev)) { 2604 printk(KERN_WARNING 2605 "md: md_import_device returned %ld\n", 2606 PTR_ERR(rdev)); 2607 return PTR_ERR(rdev); 2608 } 2609 /* set save_raid_disk if appropriate */ 2610 if (!mddev->persistent) { 2611 if (info->state & (1<<MD_DISK_SYNC) && 2612 info->raid_disk < mddev->raid_disks) 2613 rdev->raid_disk = info->raid_disk; 2614 else 2615 rdev->raid_disk = -1; 2616 } else 2617 super_types[mddev->major_version]. 2618 validate_super(mddev, rdev); 2619 rdev->saved_raid_disk = rdev->raid_disk; 2620 2621 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 2622 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2623 set_bit(WriteMostly, &rdev->flags); 2624 2625 rdev->raid_disk = -1; 2626 err = bind_rdev_to_array(rdev, mddev); 2627 if (err) 2628 export_rdev(rdev); 2629 2630 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2631 md_wakeup_thread(mddev->thread); 2632 return err; 2633 } 2634 2635 /* otherwise, add_new_disk is only allowed 2636 * for major_version==0 superblocks 2637 */ 2638 if (mddev->major_version != 0) { 2639 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2640 mdname(mddev)); 2641 return -EINVAL; 2642 } 2643 2644 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2645 int err; 2646 rdev = md_import_device (dev, -1, 0); 2647 if (IS_ERR(rdev)) { 2648 printk(KERN_WARNING 2649 "md: error, md_import_device() returned %ld\n", 2650 PTR_ERR(rdev)); 2651 return PTR_ERR(rdev); 2652 } 2653 rdev->desc_nr = info->number; 2654 if (info->raid_disk < mddev->raid_disks) 2655 rdev->raid_disk = info->raid_disk; 2656 else 2657 rdev->raid_disk = -1; 2658 2659 rdev->flags = 0; 2660 2661 if (rdev->raid_disk < mddev->raid_disks) 2662 if (info->state & (1<<MD_DISK_SYNC)) 2663 set_bit(In_sync, &rdev->flags); 2664 2665 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2666 set_bit(WriteMostly, &rdev->flags); 2667 2668 err = bind_rdev_to_array(rdev, mddev); 2669 if (err) { 2670 export_rdev(rdev); 2671 return err; 2672 } 2673 2674 if (!mddev->persistent) { 2675 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2676 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2677 } else 2678 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2679 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2680 2681 if (!mddev->size || (mddev->size > rdev->size)) 2682 mddev->size = rdev->size; 2683 } 2684 2685 return 0; 2686 } 2687 2688 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2689 { 2690 char b[BDEVNAME_SIZE]; 2691 mdk_rdev_t *rdev; 2692 2693 if (!mddev->pers) 2694 return -ENODEV; 2695 2696 rdev = find_rdev(mddev, dev); 2697 if (!rdev) 2698 return -ENXIO; 2699 2700 if (rdev->raid_disk >= 0) 2701 goto busy; 2702 2703 kick_rdev_from_array(rdev); 2704 md_update_sb(mddev); 2705 2706 return 0; 2707 busy: 2708 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 2709 bdevname(rdev->bdev,b), mdname(mddev)); 2710 return -EBUSY; 2711 } 2712 2713 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2714 { 2715 char b[BDEVNAME_SIZE]; 2716 int err; 2717 unsigned int size; 2718 mdk_rdev_t *rdev; 2719 2720 if (!mddev->pers) 2721 return -ENODEV; 2722 2723 if (mddev->major_version != 0) { 2724 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2725 " version-0 superblocks.\n", 2726 mdname(mddev)); 2727 return -EINVAL; 2728 } 2729 if (!mddev->pers->hot_add_disk) { 2730 printk(KERN_WARNING 2731 "%s: personality does not support diskops!\n", 2732 mdname(mddev)); 2733 return -EINVAL; 2734 } 2735 2736 rdev = md_import_device (dev, -1, 0); 2737 if (IS_ERR(rdev)) { 2738 printk(KERN_WARNING 2739 "md: error, md_import_device() returned %ld\n", 2740 PTR_ERR(rdev)); 2741 return -EINVAL; 2742 } 2743 2744 if (mddev->persistent) 2745 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2746 else 2747 rdev->sb_offset = 2748 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2749 2750 size = calc_dev_size(rdev, mddev->chunk_size); 2751 rdev->size = size; 2752 2753 if (size < mddev->size) { 2754 printk(KERN_WARNING 2755 "%s: disk size %llu blocks < array size %llu\n", 2756 mdname(mddev), (unsigned long long)size, 2757 (unsigned long long)mddev->size); 2758 err = -ENOSPC; 2759 goto abort_export; 2760 } 2761 2762 if (test_bit(Faulty, &rdev->flags)) { 2763 printk(KERN_WARNING 2764 "md: can not hot-add faulty %s disk to %s!\n", 2765 bdevname(rdev->bdev,b), mdname(mddev)); 2766 err = -EINVAL; 2767 goto abort_export; 2768 } 2769 clear_bit(In_sync, &rdev->flags); 2770 rdev->desc_nr = -1; 2771 bind_rdev_to_array(rdev, mddev); 2772 2773 /* 2774 * The rest should better be atomic, we can have disk failures 2775 * noticed in interrupt contexts ... 2776 */ 2777 2778 if (rdev->desc_nr == mddev->max_disks) { 2779 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2780 mdname(mddev)); 2781 err = -EBUSY; 2782 goto abort_unbind_export; 2783 } 2784 2785 rdev->raid_disk = -1; 2786 2787 md_update_sb(mddev); 2788 2789 /* 2790 * Kick recovery, maybe this spare has to be added to the 2791 * array immediately. 2792 */ 2793 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2794 md_wakeup_thread(mddev->thread); 2795 2796 return 0; 2797 2798 abort_unbind_export: 2799 unbind_rdev_from_array(rdev); 2800 2801 abort_export: 2802 export_rdev(rdev); 2803 return err; 2804 } 2805 2806 /* similar to deny_write_access, but accounts for our holding a reference 2807 * to the file ourselves */ 2808 static int deny_bitmap_write_access(struct file * file) 2809 { 2810 struct inode *inode = file->f_mapping->host; 2811 2812 spin_lock(&inode->i_lock); 2813 if (atomic_read(&inode->i_writecount) > 1) { 2814 spin_unlock(&inode->i_lock); 2815 return -ETXTBSY; 2816 } 2817 atomic_set(&inode->i_writecount, -1); 2818 spin_unlock(&inode->i_lock); 2819 2820 return 0; 2821 } 2822 2823 static int set_bitmap_file(mddev_t *mddev, int fd) 2824 { 2825 int err; 2826 2827 if (mddev->pers) { 2828 if (!mddev->pers->quiesce) 2829 return -EBUSY; 2830 if (mddev->recovery || mddev->sync_thread) 2831 return -EBUSY; 2832 /* we should be able to change the bitmap.. */ 2833 } 2834 2835 2836 if (fd >= 0) { 2837 if (mddev->bitmap) 2838 return -EEXIST; /* cannot add when bitmap is present */ 2839 mddev->bitmap_file = fget(fd); 2840 2841 if (mddev->bitmap_file == NULL) { 2842 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2843 mdname(mddev)); 2844 return -EBADF; 2845 } 2846 2847 err = deny_bitmap_write_access(mddev->bitmap_file); 2848 if (err) { 2849 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2850 mdname(mddev)); 2851 fput(mddev->bitmap_file); 2852 mddev->bitmap_file = NULL; 2853 return err; 2854 } 2855 mddev->bitmap_offset = 0; /* file overrides offset */ 2856 } else if (mddev->bitmap == NULL) 2857 return -ENOENT; /* cannot remove what isn't there */ 2858 err = 0; 2859 if (mddev->pers) { 2860 mddev->pers->quiesce(mddev, 1); 2861 if (fd >= 0) 2862 err = bitmap_create(mddev); 2863 if (fd < 0 || err) 2864 bitmap_destroy(mddev); 2865 mddev->pers->quiesce(mddev, 0); 2866 } else if (fd < 0) { 2867 if (mddev->bitmap_file) 2868 fput(mddev->bitmap_file); 2869 mddev->bitmap_file = NULL; 2870 } 2871 2872 return err; 2873 } 2874 2875 /* 2876 * set_array_info is used two different ways 2877 * The original usage is when creating a new array. 2878 * In this usage, raid_disks is > 0 and it together with 2879 * level, size, not_persistent,layout,chunksize determine the 2880 * shape of the array. 2881 * This will always create an array with a type-0.90.0 superblock. 2882 * The newer usage is when assembling an array. 2883 * In this case raid_disks will be 0, and the major_version field is 2884 * use to determine which style super-blocks are to be found on the devices. 2885 * The minor and patch _version numbers are also kept incase the 2886 * super_block handler wishes to interpret them. 2887 */ 2888 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2889 { 2890 2891 if (info->raid_disks == 0) { 2892 /* just setting version number for superblock loading */ 2893 if (info->major_version < 0 || 2894 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2895 super_types[info->major_version].name == NULL) { 2896 /* maybe try to auto-load a module? */ 2897 printk(KERN_INFO 2898 "md: superblock version %d not known\n", 2899 info->major_version); 2900 return -EINVAL; 2901 } 2902 mddev->major_version = info->major_version; 2903 mddev->minor_version = info->minor_version; 2904 mddev->patch_version = info->patch_version; 2905 return 0; 2906 } 2907 mddev->major_version = MD_MAJOR_VERSION; 2908 mddev->minor_version = MD_MINOR_VERSION; 2909 mddev->patch_version = MD_PATCHLEVEL_VERSION; 2910 mddev->ctime = get_seconds(); 2911 2912 mddev->level = info->level; 2913 mddev->size = info->size; 2914 mddev->raid_disks = info->raid_disks; 2915 /* don't set md_minor, it is determined by which /dev/md* was 2916 * openned 2917 */ 2918 if (info->state & (1<<MD_SB_CLEAN)) 2919 mddev->recovery_cp = MaxSector; 2920 else 2921 mddev->recovery_cp = 0; 2922 mddev->persistent = ! info->not_persistent; 2923 2924 mddev->layout = info->layout; 2925 mddev->chunk_size = info->chunk_size; 2926 2927 mddev->max_disks = MD_SB_DISKS; 2928 2929 mddev->sb_dirty = 1; 2930 2931 /* 2932 * Generate a 128 bit UUID 2933 */ 2934 get_random_bytes(mddev->uuid, 16); 2935 2936 return 0; 2937 } 2938 2939 /* 2940 * update_array_info is used to change the configuration of an 2941 * on-line array. 2942 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 2943 * fields in the info are checked against the array. 2944 * Any differences that cannot be handled will cause an error. 2945 * Normally, only one change can be managed at a time. 2946 */ 2947 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2948 { 2949 int rv = 0; 2950 int cnt = 0; 2951 int state = 0; 2952 2953 /* calculate expected state,ignoring low bits */ 2954 if (mddev->bitmap && mddev->bitmap_offset) 2955 state |= (1 << MD_SB_BITMAP_PRESENT); 2956 2957 if (mddev->major_version != info->major_version || 2958 mddev->minor_version != info->minor_version || 2959 /* mddev->patch_version != info->patch_version || */ 2960 mddev->ctime != info->ctime || 2961 mddev->level != info->level || 2962 /* mddev->layout != info->layout || */ 2963 !mddev->persistent != info->not_persistent|| 2964 mddev->chunk_size != info->chunk_size || 2965 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 2966 ((state^info->state) & 0xfffffe00) 2967 ) 2968 return -EINVAL; 2969 /* Check there is only one change */ 2970 if (mddev->size != info->size) cnt++; 2971 if (mddev->raid_disks != info->raid_disks) cnt++; 2972 if (mddev->layout != info->layout) cnt++; 2973 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 2974 if (cnt == 0) return 0; 2975 if (cnt > 1) return -EINVAL; 2976 2977 if (mddev->layout != info->layout) { 2978 /* Change layout 2979 * we don't need to do anything at the md level, the 2980 * personality will take care of it all. 2981 */ 2982 if (mddev->pers->reconfig == NULL) 2983 return -EINVAL; 2984 else 2985 return mddev->pers->reconfig(mddev, info->layout, -1); 2986 } 2987 if (mddev->size != info->size) { 2988 mdk_rdev_t * rdev; 2989 struct list_head *tmp; 2990 if (mddev->pers->resize == NULL) 2991 return -EINVAL; 2992 /* The "size" is the amount of each device that is used. 2993 * This can only make sense for arrays with redundancy. 2994 * linear and raid0 always use whatever space is available 2995 * We can only consider changing the size if no resync 2996 * or reconstruction is happening, and if the new size 2997 * is acceptable. It must fit before the sb_offset or, 2998 * if that is <data_offset, it must fit before the 2999 * size of each device. 3000 * If size is zero, we find the largest size that fits. 3001 */ 3002 if (mddev->sync_thread) 3003 return -EBUSY; 3004 ITERATE_RDEV(mddev,rdev,tmp) { 3005 sector_t avail; 3006 int fit = (info->size == 0); 3007 if (rdev->sb_offset > rdev->data_offset) 3008 avail = (rdev->sb_offset*2) - rdev->data_offset; 3009 else 3010 avail = get_capacity(rdev->bdev->bd_disk) 3011 - rdev->data_offset; 3012 if (fit && (info->size == 0 || info->size > avail/2)) 3013 info->size = avail/2; 3014 if (avail < ((sector_t)info->size << 1)) 3015 return -ENOSPC; 3016 } 3017 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 3018 if (!rv) { 3019 struct block_device *bdev; 3020 3021 bdev = bdget_disk(mddev->gendisk, 0); 3022 if (bdev) { 3023 down(&bdev->bd_inode->i_sem); 3024 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3025 up(&bdev->bd_inode->i_sem); 3026 bdput(bdev); 3027 } 3028 } 3029 } 3030 if (mddev->raid_disks != info->raid_disks) { 3031 /* change the number of raid disks */ 3032 if (mddev->pers->reshape == NULL) 3033 return -EINVAL; 3034 if (info->raid_disks <= 0 || 3035 info->raid_disks >= mddev->max_disks) 3036 return -EINVAL; 3037 if (mddev->sync_thread) 3038 return -EBUSY; 3039 rv = mddev->pers->reshape(mddev, info->raid_disks); 3040 if (!rv) { 3041 struct block_device *bdev; 3042 3043 bdev = bdget_disk(mddev->gendisk, 0); 3044 if (bdev) { 3045 down(&bdev->bd_inode->i_sem); 3046 i_size_write(bdev->bd_inode, mddev->array_size << 10); 3047 up(&bdev->bd_inode->i_sem); 3048 bdput(bdev); 3049 } 3050 } 3051 } 3052 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3053 if (mddev->pers->quiesce == NULL) 3054 return -EINVAL; 3055 if (mddev->recovery || mddev->sync_thread) 3056 return -EBUSY; 3057 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 3058 /* add the bitmap */ 3059 if (mddev->bitmap) 3060 return -EEXIST; 3061 if (mddev->default_bitmap_offset == 0) 3062 return -EINVAL; 3063 mddev->bitmap_offset = mddev->default_bitmap_offset; 3064 mddev->pers->quiesce(mddev, 1); 3065 rv = bitmap_create(mddev); 3066 if (rv) 3067 bitmap_destroy(mddev); 3068 mddev->pers->quiesce(mddev, 0); 3069 } else { 3070 /* remove the bitmap */ 3071 if (!mddev->bitmap) 3072 return -ENOENT; 3073 if (mddev->bitmap->file) 3074 return -EINVAL; 3075 mddev->pers->quiesce(mddev, 1); 3076 bitmap_destroy(mddev); 3077 mddev->pers->quiesce(mddev, 0); 3078 mddev->bitmap_offset = 0; 3079 } 3080 } 3081 md_update_sb(mddev); 3082 return rv; 3083 } 3084 3085 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 3086 { 3087 mdk_rdev_t *rdev; 3088 3089 if (mddev->pers == NULL) 3090 return -ENODEV; 3091 3092 rdev = find_rdev(mddev, dev); 3093 if (!rdev) 3094 return -ENODEV; 3095 3096 md_error(mddev, rdev); 3097 return 0; 3098 } 3099 3100 static int md_ioctl(struct inode *inode, struct file *file, 3101 unsigned int cmd, unsigned long arg) 3102 { 3103 int err = 0; 3104 void __user *argp = (void __user *)arg; 3105 struct hd_geometry __user *loc = argp; 3106 mddev_t *mddev = NULL; 3107 3108 if (!capable(CAP_SYS_ADMIN)) 3109 return -EACCES; 3110 3111 /* 3112 * Commands dealing with the RAID driver but not any 3113 * particular array: 3114 */ 3115 switch (cmd) 3116 { 3117 case RAID_VERSION: 3118 err = get_version(argp); 3119 goto done; 3120 3121 case PRINT_RAID_DEBUG: 3122 err = 0; 3123 md_print_devices(); 3124 goto done; 3125 3126 #ifndef MODULE 3127 case RAID_AUTORUN: 3128 err = 0; 3129 autostart_arrays(arg); 3130 goto done; 3131 #endif 3132 default:; 3133 } 3134 3135 /* 3136 * Commands creating/starting a new array: 3137 */ 3138 3139 mddev = inode->i_bdev->bd_disk->private_data; 3140 3141 if (!mddev) { 3142 BUG(); 3143 goto abort; 3144 } 3145 3146 3147 if (cmd == START_ARRAY) { 3148 /* START_ARRAY doesn't need to lock the array as autostart_array 3149 * does the locking, and it could even be a different array 3150 */ 3151 static int cnt = 3; 3152 if (cnt > 0 ) { 3153 printk(KERN_WARNING 3154 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 3155 "This will not be supported beyond 2.6\n", 3156 current->comm, current->pid); 3157 cnt--; 3158 } 3159 err = autostart_array(new_decode_dev(arg)); 3160 if (err) { 3161 printk(KERN_WARNING "md: autostart failed!\n"); 3162 goto abort; 3163 } 3164 goto done; 3165 } 3166 3167 err = mddev_lock(mddev); 3168 if (err) { 3169 printk(KERN_INFO 3170 "md: ioctl lock interrupted, reason %d, cmd %d\n", 3171 err, cmd); 3172 goto abort; 3173 } 3174 3175 switch (cmd) 3176 { 3177 case SET_ARRAY_INFO: 3178 { 3179 mdu_array_info_t info; 3180 if (!arg) 3181 memset(&info, 0, sizeof(info)); 3182 else if (copy_from_user(&info, argp, sizeof(info))) { 3183 err = -EFAULT; 3184 goto abort_unlock; 3185 } 3186 if (mddev->pers) { 3187 err = update_array_info(mddev, &info); 3188 if (err) { 3189 printk(KERN_WARNING "md: couldn't update" 3190 " array info. %d\n", err); 3191 goto abort_unlock; 3192 } 3193 goto done_unlock; 3194 } 3195 if (!list_empty(&mddev->disks)) { 3196 printk(KERN_WARNING 3197 "md: array %s already has disks!\n", 3198 mdname(mddev)); 3199 err = -EBUSY; 3200 goto abort_unlock; 3201 } 3202 if (mddev->raid_disks) { 3203 printk(KERN_WARNING 3204 "md: array %s already initialised!\n", 3205 mdname(mddev)); 3206 err = -EBUSY; 3207 goto abort_unlock; 3208 } 3209 err = set_array_info(mddev, &info); 3210 if (err) { 3211 printk(KERN_WARNING "md: couldn't set" 3212 " array info. %d\n", err); 3213 goto abort_unlock; 3214 } 3215 } 3216 goto done_unlock; 3217 3218 default:; 3219 } 3220 3221 /* 3222 * Commands querying/configuring an existing array: 3223 */ 3224 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 3225 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 3226 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 3227 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 3228 err = -ENODEV; 3229 goto abort_unlock; 3230 } 3231 3232 /* 3233 * Commands even a read-only array can execute: 3234 */ 3235 switch (cmd) 3236 { 3237 case GET_ARRAY_INFO: 3238 err = get_array_info(mddev, argp); 3239 goto done_unlock; 3240 3241 case GET_BITMAP_FILE: 3242 err = get_bitmap_file(mddev, argp); 3243 goto done_unlock; 3244 3245 case GET_DISK_INFO: 3246 err = get_disk_info(mddev, argp); 3247 goto done_unlock; 3248 3249 case RESTART_ARRAY_RW: 3250 err = restart_array(mddev); 3251 goto done_unlock; 3252 3253 case STOP_ARRAY: 3254 err = do_md_stop (mddev, 0); 3255 goto done_unlock; 3256 3257 case STOP_ARRAY_RO: 3258 err = do_md_stop (mddev, 1); 3259 goto done_unlock; 3260 3261 /* 3262 * We have a problem here : there is no easy way to give a CHS 3263 * virtual geometry. We currently pretend that we have a 2 heads 3264 * 4 sectors (with a BIG number of cylinders...). This drives 3265 * dosfs just mad... ;-) 3266 */ 3267 case HDIO_GETGEO: 3268 if (!loc) { 3269 err = -EINVAL; 3270 goto abort_unlock; 3271 } 3272 err = put_user (2, (char __user *) &loc->heads); 3273 if (err) 3274 goto abort_unlock; 3275 err = put_user (4, (char __user *) &loc->sectors); 3276 if (err) 3277 goto abort_unlock; 3278 err = put_user(get_capacity(mddev->gendisk)/8, 3279 (short __user *) &loc->cylinders); 3280 if (err) 3281 goto abort_unlock; 3282 err = put_user (get_start_sect(inode->i_bdev), 3283 (long __user *) &loc->start); 3284 goto done_unlock; 3285 } 3286 3287 /* 3288 * The remaining ioctls are changing the state of the 3289 * superblock, so we do not allow read-only arrays 3290 * here: 3291 */ 3292 if (mddev->ro) { 3293 err = -EROFS; 3294 goto abort_unlock; 3295 } 3296 3297 switch (cmd) 3298 { 3299 case ADD_NEW_DISK: 3300 { 3301 mdu_disk_info_t info; 3302 if (copy_from_user(&info, argp, sizeof(info))) 3303 err = -EFAULT; 3304 else 3305 err = add_new_disk(mddev, &info); 3306 goto done_unlock; 3307 } 3308 3309 case HOT_REMOVE_DISK: 3310 err = hot_remove_disk(mddev, new_decode_dev(arg)); 3311 goto done_unlock; 3312 3313 case HOT_ADD_DISK: 3314 err = hot_add_disk(mddev, new_decode_dev(arg)); 3315 goto done_unlock; 3316 3317 case SET_DISK_FAULTY: 3318 err = set_disk_faulty(mddev, new_decode_dev(arg)); 3319 goto done_unlock; 3320 3321 case RUN_ARRAY: 3322 err = do_md_run (mddev); 3323 goto done_unlock; 3324 3325 case SET_BITMAP_FILE: 3326 err = set_bitmap_file(mddev, (int)arg); 3327 goto done_unlock; 3328 3329 default: 3330 if (_IOC_TYPE(cmd) == MD_MAJOR) 3331 printk(KERN_WARNING "md: %s(pid %d) used" 3332 " obsolete MD ioctl, upgrade your" 3333 " software to use new ictls.\n", 3334 current->comm, current->pid); 3335 err = -EINVAL; 3336 goto abort_unlock; 3337 } 3338 3339 done_unlock: 3340 abort_unlock: 3341 mddev_unlock(mddev); 3342 3343 return err; 3344 done: 3345 if (err) 3346 MD_BUG(); 3347 abort: 3348 return err; 3349 } 3350 3351 static int md_open(struct inode *inode, struct file *file) 3352 { 3353 /* 3354 * Succeed if we can lock the mddev, which confirms that 3355 * it isn't being stopped right now. 3356 */ 3357 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3358 int err; 3359 3360 if ((err = mddev_lock(mddev))) 3361 goto out; 3362 3363 err = 0; 3364 mddev_get(mddev); 3365 mddev_unlock(mddev); 3366 3367 check_disk_change(inode->i_bdev); 3368 out: 3369 return err; 3370 } 3371 3372 static int md_release(struct inode *inode, struct file * file) 3373 { 3374 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3375 3376 if (!mddev) 3377 BUG(); 3378 mddev_put(mddev); 3379 3380 return 0; 3381 } 3382 3383 static int md_media_changed(struct gendisk *disk) 3384 { 3385 mddev_t *mddev = disk->private_data; 3386 3387 return mddev->changed; 3388 } 3389 3390 static int md_revalidate(struct gendisk *disk) 3391 { 3392 mddev_t *mddev = disk->private_data; 3393 3394 mddev->changed = 0; 3395 return 0; 3396 } 3397 static struct block_device_operations md_fops = 3398 { 3399 .owner = THIS_MODULE, 3400 .open = md_open, 3401 .release = md_release, 3402 .ioctl = md_ioctl, 3403 .media_changed = md_media_changed, 3404 .revalidate_disk= md_revalidate, 3405 }; 3406 3407 static int md_thread(void * arg) 3408 { 3409 mdk_thread_t *thread = arg; 3410 3411 /* 3412 * md_thread is a 'system-thread', it's priority should be very 3413 * high. We avoid resource deadlocks individually in each 3414 * raid personality. (RAID5 does preallocation) We also use RR and 3415 * the very same RT priority as kswapd, thus we will never get 3416 * into a priority inversion deadlock. 3417 * 3418 * we definitely have to have equal or higher priority than 3419 * bdflush, otherwise bdflush will deadlock if there are too 3420 * many dirty RAID5 blocks. 3421 */ 3422 3423 allow_signal(SIGKILL); 3424 complete(thread->event); 3425 while (!kthread_should_stop()) { 3426 void (*run)(mddev_t *); 3427 3428 wait_event_interruptible_timeout(thread->wqueue, 3429 test_bit(THREAD_WAKEUP, &thread->flags) 3430 || kthread_should_stop(), 3431 thread->timeout); 3432 try_to_freeze(); 3433 3434 clear_bit(THREAD_WAKEUP, &thread->flags); 3435 3436 run = thread->run; 3437 if (run) 3438 run(thread->mddev); 3439 } 3440 3441 return 0; 3442 } 3443 3444 void md_wakeup_thread(mdk_thread_t *thread) 3445 { 3446 if (thread) { 3447 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3448 set_bit(THREAD_WAKEUP, &thread->flags); 3449 wake_up(&thread->wqueue); 3450 } 3451 } 3452 3453 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3454 const char *name) 3455 { 3456 mdk_thread_t *thread; 3457 struct completion event; 3458 3459 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3460 if (!thread) 3461 return NULL; 3462 3463 memset(thread, 0, sizeof(mdk_thread_t)); 3464 init_waitqueue_head(&thread->wqueue); 3465 3466 init_completion(&event); 3467 thread->event = &event; 3468 thread->run = run; 3469 thread->mddev = mddev; 3470 thread->name = name; 3471 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3472 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3473 if (IS_ERR(thread->tsk)) { 3474 kfree(thread); 3475 return NULL; 3476 } 3477 wait_for_completion(&event); 3478 return thread; 3479 } 3480 3481 void md_unregister_thread(mdk_thread_t *thread) 3482 { 3483 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3484 3485 kthread_stop(thread->tsk); 3486 kfree(thread); 3487 } 3488 3489 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3490 { 3491 if (!mddev) { 3492 MD_BUG(); 3493 return; 3494 } 3495 3496 if (!rdev || test_bit(Faulty, &rdev->flags)) 3497 return; 3498 /* 3499 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3500 mdname(mddev), 3501 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3502 __builtin_return_address(0),__builtin_return_address(1), 3503 __builtin_return_address(2),__builtin_return_address(3)); 3504 */ 3505 if (!mddev->pers->error_handler) 3506 return; 3507 mddev->pers->error_handler(mddev,rdev); 3508 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3509 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3510 md_wakeup_thread(mddev->thread); 3511 } 3512 3513 /* seq_file implementation /proc/mdstat */ 3514 3515 static void status_unused(struct seq_file *seq) 3516 { 3517 int i = 0; 3518 mdk_rdev_t *rdev; 3519 struct list_head *tmp; 3520 3521 seq_printf(seq, "unused devices: "); 3522 3523 ITERATE_RDEV_PENDING(rdev,tmp) { 3524 char b[BDEVNAME_SIZE]; 3525 i++; 3526 seq_printf(seq, "%s ", 3527 bdevname(rdev->bdev,b)); 3528 } 3529 if (!i) 3530 seq_printf(seq, "<none>"); 3531 3532 seq_printf(seq, "\n"); 3533 } 3534 3535 3536 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3537 { 3538 unsigned long max_blocks, resync, res, dt, db, rt; 3539 3540 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3541 3542 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3543 max_blocks = mddev->resync_max_sectors >> 1; 3544 else 3545 max_blocks = mddev->size; 3546 3547 /* 3548 * Should not happen. 3549 */ 3550 if (!max_blocks) { 3551 MD_BUG(); 3552 return; 3553 } 3554 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3555 { 3556 int i, x = res/50, y = 20-x; 3557 seq_printf(seq, "["); 3558 for (i = 0; i < x; i++) 3559 seq_printf(seq, "="); 3560 seq_printf(seq, ">"); 3561 for (i = 0; i < y; i++) 3562 seq_printf(seq, "."); 3563 seq_printf(seq, "] "); 3564 } 3565 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3566 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3567 "resync" : "recovery"), 3568 res/10, res % 10, resync, max_blocks); 3569 3570 /* 3571 * We do not want to overflow, so the order of operands and 3572 * the * 100 / 100 trick are important. We do a +1 to be 3573 * safe against division by zero. We only estimate anyway. 3574 * 3575 * dt: time from mark until now 3576 * db: blocks written from mark until now 3577 * rt: remaining time 3578 */ 3579 dt = ((jiffies - mddev->resync_mark) / HZ); 3580 if (!dt) dt++; 3581 db = resync - (mddev->resync_mark_cnt/2); 3582 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3583 3584 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3585 3586 seq_printf(seq, " speed=%ldK/sec", db/dt); 3587 } 3588 3589 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3590 { 3591 struct list_head *tmp; 3592 loff_t l = *pos; 3593 mddev_t *mddev; 3594 3595 if (l >= 0x10000) 3596 return NULL; 3597 if (!l--) 3598 /* header */ 3599 return (void*)1; 3600 3601 spin_lock(&all_mddevs_lock); 3602 list_for_each(tmp,&all_mddevs) 3603 if (!l--) { 3604 mddev = list_entry(tmp, mddev_t, all_mddevs); 3605 mddev_get(mddev); 3606 spin_unlock(&all_mddevs_lock); 3607 return mddev; 3608 } 3609 spin_unlock(&all_mddevs_lock); 3610 if (!l--) 3611 return (void*)2;/* tail */ 3612 return NULL; 3613 } 3614 3615 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3616 { 3617 struct list_head *tmp; 3618 mddev_t *next_mddev, *mddev = v; 3619 3620 ++*pos; 3621 if (v == (void*)2) 3622 return NULL; 3623 3624 spin_lock(&all_mddevs_lock); 3625 if (v == (void*)1) 3626 tmp = all_mddevs.next; 3627 else 3628 tmp = mddev->all_mddevs.next; 3629 if (tmp != &all_mddevs) 3630 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3631 else { 3632 next_mddev = (void*)2; 3633 *pos = 0x10000; 3634 } 3635 spin_unlock(&all_mddevs_lock); 3636 3637 if (v != (void*)1) 3638 mddev_put(mddev); 3639 return next_mddev; 3640 3641 } 3642 3643 static void md_seq_stop(struct seq_file *seq, void *v) 3644 { 3645 mddev_t *mddev = v; 3646 3647 if (mddev && v != (void*)1 && v != (void*)2) 3648 mddev_put(mddev); 3649 } 3650 3651 static int md_seq_show(struct seq_file *seq, void *v) 3652 { 3653 mddev_t *mddev = v; 3654 sector_t size; 3655 struct list_head *tmp2; 3656 mdk_rdev_t *rdev; 3657 int i; 3658 struct bitmap *bitmap; 3659 3660 if (v == (void*)1) { 3661 seq_printf(seq, "Personalities : "); 3662 spin_lock(&pers_lock); 3663 for (i = 0; i < MAX_PERSONALITY; i++) 3664 if (pers[i]) 3665 seq_printf(seq, "[%s] ", pers[i]->name); 3666 3667 spin_unlock(&pers_lock); 3668 seq_printf(seq, "\n"); 3669 return 0; 3670 } 3671 if (v == (void*)2) { 3672 status_unused(seq); 3673 return 0; 3674 } 3675 3676 if (mddev_lock(mddev)!=0) 3677 return -EINTR; 3678 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3679 seq_printf(seq, "%s : %sactive", mdname(mddev), 3680 mddev->pers ? "" : "in"); 3681 if (mddev->pers) { 3682 if (mddev->ro) 3683 seq_printf(seq, " (read-only)"); 3684 seq_printf(seq, " %s", mddev->pers->name); 3685 } 3686 3687 size = 0; 3688 ITERATE_RDEV(mddev,rdev,tmp2) { 3689 char b[BDEVNAME_SIZE]; 3690 seq_printf(seq, " %s[%d]", 3691 bdevname(rdev->bdev,b), rdev->desc_nr); 3692 if (test_bit(WriteMostly, &rdev->flags)) 3693 seq_printf(seq, "(W)"); 3694 if (test_bit(Faulty, &rdev->flags)) { 3695 seq_printf(seq, "(F)"); 3696 continue; 3697 } else if (rdev->raid_disk < 0) 3698 seq_printf(seq, "(S)"); /* spare */ 3699 size += rdev->size; 3700 } 3701 3702 if (!list_empty(&mddev->disks)) { 3703 if (mddev->pers) 3704 seq_printf(seq, "\n %llu blocks", 3705 (unsigned long long)mddev->array_size); 3706 else 3707 seq_printf(seq, "\n %llu blocks", 3708 (unsigned long long)size); 3709 } 3710 if (mddev->persistent) { 3711 if (mddev->major_version != 0 || 3712 mddev->minor_version != 90) { 3713 seq_printf(seq," super %d.%d", 3714 mddev->major_version, 3715 mddev->minor_version); 3716 } 3717 } else 3718 seq_printf(seq, " super non-persistent"); 3719 3720 if (mddev->pers) { 3721 mddev->pers->status (seq, mddev); 3722 seq_printf(seq, "\n "); 3723 if (mddev->curr_resync > 2) { 3724 status_resync (seq, mddev); 3725 seq_printf(seq, "\n "); 3726 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3727 seq_printf(seq, " resync=DELAYED\n "); 3728 } else 3729 seq_printf(seq, "\n "); 3730 3731 if ((bitmap = mddev->bitmap)) { 3732 unsigned long chunk_kb; 3733 unsigned long flags; 3734 spin_lock_irqsave(&bitmap->lock, flags); 3735 chunk_kb = bitmap->chunksize >> 10; 3736 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3737 "%lu%s chunk", 3738 bitmap->pages - bitmap->missing_pages, 3739 bitmap->pages, 3740 (bitmap->pages - bitmap->missing_pages) 3741 << (PAGE_SHIFT - 10), 3742 chunk_kb ? chunk_kb : bitmap->chunksize, 3743 chunk_kb ? "KB" : "B"); 3744 if (bitmap->file) { 3745 seq_printf(seq, ", file: "); 3746 seq_path(seq, bitmap->file->f_vfsmnt, 3747 bitmap->file->f_dentry," \t\n"); 3748 } 3749 3750 seq_printf(seq, "\n"); 3751 spin_unlock_irqrestore(&bitmap->lock, flags); 3752 } 3753 3754 seq_printf(seq, "\n"); 3755 } 3756 mddev_unlock(mddev); 3757 3758 return 0; 3759 } 3760 3761 static struct seq_operations md_seq_ops = { 3762 .start = md_seq_start, 3763 .next = md_seq_next, 3764 .stop = md_seq_stop, 3765 .show = md_seq_show, 3766 }; 3767 3768 static int md_seq_open(struct inode *inode, struct file *file) 3769 { 3770 int error; 3771 3772 error = seq_open(file, &md_seq_ops); 3773 return error; 3774 } 3775 3776 static struct file_operations md_seq_fops = { 3777 .open = md_seq_open, 3778 .read = seq_read, 3779 .llseek = seq_lseek, 3780 .release = seq_release, 3781 }; 3782 3783 int register_md_personality(int pnum, mdk_personality_t *p) 3784 { 3785 if (pnum >= MAX_PERSONALITY) { 3786 printk(KERN_ERR 3787 "md: tried to install personality %s as nr %d, but max is %lu\n", 3788 p->name, pnum, MAX_PERSONALITY-1); 3789 return -EINVAL; 3790 } 3791 3792 spin_lock(&pers_lock); 3793 if (pers[pnum]) { 3794 spin_unlock(&pers_lock); 3795 return -EBUSY; 3796 } 3797 3798 pers[pnum] = p; 3799 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3800 spin_unlock(&pers_lock); 3801 return 0; 3802 } 3803 3804 int unregister_md_personality(int pnum) 3805 { 3806 if (pnum >= MAX_PERSONALITY) 3807 return -EINVAL; 3808 3809 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3810 spin_lock(&pers_lock); 3811 pers[pnum] = NULL; 3812 spin_unlock(&pers_lock); 3813 return 0; 3814 } 3815 3816 static int is_mddev_idle(mddev_t *mddev) 3817 { 3818 mdk_rdev_t * rdev; 3819 struct list_head *tmp; 3820 int idle; 3821 unsigned long curr_events; 3822 3823 idle = 1; 3824 ITERATE_RDEV(mddev,rdev,tmp) { 3825 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3826 curr_events = disk_stat_read(disk, sectors[0]) + 3827 disk_stat_read(disk, sectors[1]) - 3828 atomic_read(&disk->sync_io); 3829 /* Allow some slack between valud of curr_events and last_events, 3830 * as there are some uninteresting races. 3831 * Note: the following is an unsigned comparison. 3832 */ 3833 if ((curr_events - rdev->last_events + 32) > 64) { 3834 rdev->last_events = curr_events; 3835 idle = 0; 3836 } 3837 } 3838 return idle; 3839 } 3840 3841 void md_done_sync(mddev_t *mddev, int blocks, int ok) 3842 { 3843 /* another "blocks" (512byte) blocks have been synced */ 3844 atomic_sub(blocks, &mddev->recovery_active); 3845 wake_up(&mddev->recovery_wait); 3846 if (!ok) { 3847 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3848 md_wakeup_thread(mddev->thread); 3849 // stop recovery, signal do_sync .... 3850 } 3851 } 3852 3853 3854 /* md_write_start(mddev, bi) 3855 * If we need to update some array metadata (e.g. 'active' flag 3856 * in superblock) before writing, schedule a superblock update 3857 * and wait for it to complete. 3858 */ 3859 void md_write_start(mddev_t *mddev, struct bio *bi) 3860 { 3861 if (bio_data_dir(bi) != WRITE) 3862 return; 3863 3864 atomic_inc(&mddev->writes_pending); 3865 if (mddev->in_sync) { 3866 spin_lock_irq(&mddev->write_lock); 3867 if (mddev->in_sync) { 3868 mddev->in_sync = 0; 3869 mddev->sb_dirty = 1; 3870 md_wakeup_thread(mddev->thread); 3871 } 3872 spin_unlock_irq(&mddev->write_lock); 3873 } 3874 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 3875 } 3876 3877 void md_write_end(mddev_t *mddev) 3878 { 3879 if (atomic_dec_and_test(&mddev->writes_pending)) { 3880 if (mddev->safemode == 2) 3881 md_wakeup_thread(mddev->thread); 3882 else 3883 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 3884 } 3885 } 3886 3887 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3888 3889 #define SYNC_MARKS 10 3890 #define SYNC_MARK_STEP (3*HZ) 3891 static void md_do_sync(mddev_t *mddev) 3892 { 3893 mddev_t *mddev2; 3894 unsigned int currspeed = 0, 3895 window; 3896 sector_t max_sectors,j, io_sectors; 3897 unsigned long mark[SYNC_MARKS]; 3898 sector_t mark_cnt[SYNC_MARKS]; 3899 int last_mark,m; 3900 struct list_head *tmp; 3901 sector_t last_check; 3902 int skipped = 0; 3903 3904 /* just incase thread restarts... */ 3905 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3906 return; 3907 3908 /* we overload curr_resync somewhat here. 3909 * 0 == not engaged in resync at all 3910 * 2 == checking that there is no conflict with another sync 3911 * 1 == like 2, but have yielded to allow conflicting resync to 3912 * commense 3913 * other == active in resync - this many blocks 3914 * 3915 * Before starting a resync we must have set curr_resync to 3916 * 2, and then checked that every "conflicting" array has curr_resync 3917 * less than ours. When we find one that is the same or higher 3918 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 3919 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 3920 * This will mean we have to start checking from the beginning again. 3921 * 3922 */ 3923 3924 do { 3925 mddev->curr_resync = 2; 3926 3927 try_again: 3928 if (signal_pending(current) || 3929 kthread_should_stop()) { 3930 flush_signals(current); 3931 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3932 goto skip; 3933 } 3934 ITERATE_MDDEV(mddev2,tmp) { 3935 if (mddev2 == mddev) 3936 continue; 3937 if (mddev2->curr_resync && 3938 match_mddev_units(mddev,mddev2)) { 3939 DEFINE_WAIT(wq); 3940 if (mddev < mddev2 && mddev->curr_resync == 2) { 3941 /* arbitrarily yield */ 3942 mddev->curr_resync = 1; 3943 wake_up(&resync_wait); 3944 } 3945 if (mddev > mddev2 && mddev->curr_resync == 1) 3946 /* no need to wait here, we can wait the next 3947 * time 'round when curr_resync == 2 3948 */ 3949 continue; 3950 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 3951 if (!signal_pending(current) && 3952 !kthread_should_stop() && 3953 mddev2->curr_resync >= mddev->curr_resync) { 3954 printk(KERN_INFO "md: delaying resync of %s" 3955 " until %s has finished resync (they" 3956 " share one or more physical units)\n", 3957 mdname(mddev), mdname(mddev2)); 3958 mddev_put(mddev2); 3959 schedule(); 3960 finish_wait(&resync_wait, &wq); 3961 goto try_again; 3962 } 3963 finish_wait(&resync_wait, &wq); 3964 } 3965 } 3966 } while (mddev->curr_resync < 2); 3967 3968 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3969 /* resync follows the size requested by the personality, 3970 * which defaults to physical size, but can be virtual size 3971 */ 3972 max_sectors = mddev->resync_max_sectors; 3973 mddev->resync_mismatches = 0; 3974 } else 3975 /* recovery follows the physical size of devices */ 3976 max_sectors = mddev->size << 1; 3977 3978 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3979 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3980 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3981 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 3982 "(but not more than %d KB/sec) for reconstruction.\n", 3983 sysctl_speed_limit_max); 3984 3985 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3986 /* we don't use the checkpoint if there's a bitmap */ 3987 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap 3988 && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3989 j = mddev->recovery_cp; 3990 else 3991 j = 0; 3992 io_sectors = 0; 3993 for (m = 0; m < SYNC_MARKS; m++) { 3994 mark[m] = jiffies; 3995 mark_cnt[m] = io_sectors; 3996 } 3997 last_mark = 0; 3998 mddev->resync_mark = mark[last_mark]; 3999 mddev->resync_mark_cnt = mark_cnt[last_mark]; 4000 4001 /* 4002 * Tune reconstruction: 4003 */ 4004 window = 32*(PAGE_SIZE/512); 4005 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 4006 window/2,(unsigned long long) max_sectors/2); 4007 4008 atomic_set(&mddev->recovery_active, 0); 4009 init_waitqueue_head(&mddev->recovery_wait); 4010 last_check = 0; 4011 4012 if (j>2) { 4013 printk(KERN_INFO 4014 "md: resuming recovery of %s from checkpoint.\n", 4015 mdname(mddev)); 4016 mddev->curr_resync = j; 4017 } 4018 4019 while (j < max_sectors) { 4020 sector_t sectors; 4021 4022 skipped = 0; 4023 sectors = mddev->pers->sync_request(mddev, j, &skipped, 4024 currspeed < sysctl_speed_limit_min); 4025 if (sectors == 0) { 4026 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4027 goto out; 4028 } 4029 4030 if (!skipped) { /* actual IO requested */ 4031 io_sectors += sectors; 4032 atomic_add(sectors, &mddev->recovery_active); 4033 } 4034 4035 j += sectors; 4036 if (j>1) mddev->curr_resync = j; 4037 4038 4039 if (last_check + window > io_sectors || j == max_sectors) 4040 continue; 4041 4042 last_check = io_sectors; 4043 4044 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 4045 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 4046 break; 4047 4048 repeat: 4049 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 4050 /* step marks */ 4051 int next = (last_mark+1) % SYNC_MARKS; 4052 4053 mddev->resync_mark = mark[next]; 4054 mddev->resync_mark_cnt = mark_cnt[next]; 4055 mark[next] = jiffies; 4056 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 4057 last_mark = next; 4058 } 4059 4060 4061 if (signal_pending(current) || kthread_should_stop()) { 4062 /* 4063 * got a signal, exit. 4064 */ 4065 printk(KERN_INFO 4066 "md: md_do_sync() got signal ... exiting\n"); 4067 flush_signals(current); 4068 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4069 goto out; 4070 } 4071 4072 /* 4073 * this loop exits only if either when we are slower than 4074 * the 'hard' speed limit, or the system was IO-idle for 4075 * a jiffy. 4076 * the system might be non-idle CPU-wise, but we only care 4077 * about not overloading the IO subsystem. (things like an 4078 * e2fsck being done on the RAID array should execute fast) 4079 */ 4080 mddev->queue->unplug_fn(mddev->queue); 4081 cond_resched(); 4082 4083 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 4084 /((jiffies-mddev->resync_mark)/HZ +1) +1; 4085 4086 if (currspeed > sysctl_speed_limit_min) { 4087 if ((currspeed > sysctl_speed_limit_max) || 4088 !is_mddev_idle(mddev)) { 4089 msleep_interruptible(250); 4090 goto repeat; 4091 } 4092 } 4093 } 4094 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 4095 /* 4096 * this also signals 'finished resyncing' to md_stop 4097 */ 4098 out: 4099 mddev->queue->unplug_fn(mddev->queue); 4100 4101 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 4102 4103 /* tell personality that we are finished */ 4104 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 4105 4106 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4107 mddev->curr_resync > 2 && 4108 mddev->curr_resync >= mddev->recovery_cp) { 4109 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4110 printk(KERN_INFO 4111 "md: checkpointing recovery of %s.\n", 4112 mdname(mddev)); 4113 mddev->recovery_cp = mddev->curr_resync; 4114 } else 4115 mddev->recovery_cp = MaxSector; 4116 } 4117 4118 skip: 4119 mddev->curr_resync = 0; 4120 wake_up(&resync_wait); 4121 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 4122 md_wakeup_thread(mddev->thread); 4123 } 4124 4125 4126 /* 4127 * This routine is regularly called by all per-raid-array threads to 4128 * deal with generic issues like resync and super-block update. 4129 * Raid personalities that don't have a thread (linear/raid0) do not 4130 * need this as they never do any recovery or update the superblock. 4131 * 4132 * It does not do any resync itself, but rather "forks" off other threads 4133 * to do that as needed. 4134 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 4135 * "->recovery" and create a thread at ->sync_thread. 4136 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 4137 * and wakeups up this thread which will reap the thread and finish up. 4138 * This thread also removes any faulty devices (with nr_pending == 0). 4139 * 4140 * The overall approach is: 4141 * 1/ if the superblock needs updating, update it. 4142 * 2/ If a recovery thread is running, don't do anything else. 4143 * 3/ If recovery has finished, clean up, possibly marking spares active. 4144 * 4/ If there are any faulty devices, remove them. 4145 * 5/ If array is degraded, try to add spares devices 4146 * 6/ If array has spares or is not in-sync, start a resync thread. 4147 */ 4148 void md_check_recovery(mddev_t *mddev) 4149 { 4150 mdk_rdev_t *rdev; 4151 struct list_head *rtmp; 4152 4153 4154 if (mddev->bitmap) 4155 bitmap_daemon_work(mddev->bitmap); 4156 4157 if (mddev->ro) 4158 return; 4159 4160 if (signal_pending(current)) { 4161 if (mddev->pers->sync_request) { 4162 printk(KERN_INFO "md: %s in immediate safe mode\n", 4163 mdname(mddev)); 4164 mddev->safemode = 2; 4165 } 4166 flush_signals(current); 4167 } 4168 4169 if ( ! ( 4170 mddev->sb_dirty || 4171 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 4172 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 4173 (mddev->safemode == 1) || 4174 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 4175 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 4176 )) 4177 return; 4178 4179 if (mddev_trylock(mddev)==0) { 4180 int spares =0; 4181 4182 spin_lock_irq(&mddev->write_lock); 4183 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 4184 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 4185 mddev->in_sync = 1; 4186 mddev->sb_dirty = 1; 4187 } 4188 if (mddev->safemode == 1) 4189 mddev->safemode = 0; 4190 spin_unlock_irq(&mddev->write_lock); 4191 4192 if (mddev->sb_dirty) 4193 md_update_sb(mddev); 4194 4195 4196 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4197 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 4198 /* resync/recovery still happening */ 4199 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4200 goto unlock; 4201 } 4202 if (mddev->sync_thread) { 4203 /* resync has finished, collect result */ 4204 md_unregister_thread(mddev->sync_thread); 4205 mddev->sync_thread = NULL; 4206 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4207 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4208 /* success...*/ 4209 /* activate any spares */ 4210 mddev->pers->spare_active(mddev); 4211 } 4212 md_update_sb(mddev); 4213 4214 /* if array is no-longer degraded, then any saved_raid_disk 4215 * information must be scrapped 4216 */ 4217 if (!mddev->degraded) 4218 ITERATE_RDEV(mddev,rdev,rtmp) 4219 rdev->saved_raid_disk = -1; 4220 4221 mddev->recovery = 0; 4222 /* flag recovery needed just to double check */ 4223 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4224 goto unlock; 4225 } 4226 /* Clear some bits that don't mean anything, but 4227 * might be left set 4228 */ 4229 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4230 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 4231 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 4232 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4233 4234 /* no recovery is running. 4235 * remove any failed drives, then 4236 * add spares if possible. 4237 * Spare are also removed and re-added, to allow 4238 * the personality to fail the re-add. 4239 */ 4240 ITERATE_RDEV(mddev,rdev,rtmp) 4241 if (rdev->raid_disk >= 0 && 4242 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 4243 atomic_read(&rdev->nr_pending)==0) { 4244 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 4245 char nm[20]; 4246 sprintf(nm,"rd%d", rdev->raid_disk); 4247 sysfs_remove_link(&mddev->kobj, nm); 4248 rdev->raid_disk = -1; 4249 } 4250 } 4251 4252 if (mddev->degraded) { 4253 ITERATE_RDEV(mddev,rdev,rtmp) 4254 if (rdev->raid_disk < 0 4255 && !test_bit(Faulty, &rdev->flags)) { 4256 if (mddev->pers->hot_add_disk(mddev,rdev)) { 4257 char nm[20]; 4258 sprintf(nm, "rd%d", rdev->raid_disk); 4259 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 4260 spares++; 4261 } else 4262 break; 4263 } 4264 } 4265 4266 if (spares) { 4267 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4268 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4269 } else if (mddev->recovery_cp < MaxSector) { 4270 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4271 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4272 /* nothing to be done ... */ 4273 goto unlock; 4274 4275 if (mddev->pers->sync_request) { 4276 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4277 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 4278 /* We are adding a device or devices to an array 4279 * which has the bitmap stored on all devices. 4280 * So make sure all bitmap pages get written 4281 */ 4282 bitmap_write_all(mddev->bitmap); 4283 } 4284 mddev->sync_thread = md_register_thread(md_do_sync, 4285 mddev, 4286 "%s_resync"); 4287 if (!mddev->sync_thread) { 4288 printk(KERN_ERR "%s: could not start resync" 4289 " thread...\n", 4290 mdname(mddev)); 4291 /* leave the spares where they are, it shouldn't hurt */ 4292 mddev->recovery = 0; 4293 } else { 4294 md_wakeup_thread(mddev->sync_thread); 4295 } 4296 } 4297 unlock: 4298 mddev_unlock(mddev); 4299 } 4300 } 4301 4302 static int md_notify_reboot(struct notifier_block *this, 4303 unsigned long code, void *x) 4304 { 4305 struct list_head *tmp; 4306 mddev_t *mddev; 4307 4308 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 4309 4310 printk(KERN_INFO "md: stopping all md devices.\n"); 4311 4312 ITERATE_MDDEV(mddev,tmp) 4313 if (mddev_trylock(mddev)==0) 4314 do_md_stop (mddev, 1); 4315 /* 4316 * certain more exotic SCSI devices are known to be 4317 * volatile wrt too early system reboots. While the 4318 * right place to handle this issue is the given 4319 * driver, we do want to have a safe RAID driver ... 4320 */ 4321 mdelay(1000*1); 4322 } 4323 return NOTIFY_DONE; 4324 } 4325 4326 static struct notifier_block md_notifier = { 4327 .notifier_call = md_notify_reboot, 4328 .next = NULL, 4329 .priority = INT_MAX, /* before any real devices */ 4330 }; 4331 4332 static void md_geninit(void) 4333 { 4334 struct proc_dir_entry *p; 4335 4336 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 4337 4338 p = create_proc_entry("mdstat", S_IRUGO, NULL); 4339 if (p) 4340 p->proc_fops = &md_seq_fops; 4341 } 4342 4343 static int __init md_init(void) 4344 { 4345 int minor; 4346 4347 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 4348 " MD_SB_DISKS=%d\n", 4349 MD_MAJOR_VERSION, MD_MINOR_VERSION, 4350 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 4351 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 4352 BITMAP_MINOR); 4353 4354 if (register_blkdev(MAJOR_NR, "md")) 4355 return -1; 4356 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 4357 unregister_blkdev(MAJOR_NR, "md"); 4358 return -1; 4359 } 4360 devfs_mk_dir("md"); 4361 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 4362 md_probe, NULL, NULL); 4363 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 4364 md_probe, NULL, NULL); 4365 4366 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4367 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 4368 S_IFBLK|S_IRUSR|S_IWUSR, 4369 "md/%d", minor); 4370 4371 for (minor=0; minor < MAX_MD_DEVS; ++minor) 4372 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 4373 S_IFBLK|S_IRUSR|S_IWUSR, 4374 "md/mdp%d", minor); 4375 4376 4377 register_reboot_notifier(&md_notifier); 4378 raid_table_header = register_sysctl_table(raid_root_table, 1); 4379 4380 md_geninit(); 4381 return (0); 4382 } 4383 4384 4385 #ifndef MODULE 4386 4387 /* 4388 * Searches all registered partitions for autorun RAID arrays 4389 * at boot time. 4390 */ 4391 static dev_t detected_devices[128]; 4392 static int dev_cnt; 4393 4394 void md_autodetect_dev(dev_t dev) 4395 { 4396 if (dev_cnt >= 0 && dev_cnt < 127) 4397 detected_devices[dev_cnt++] = dev; 4398 } 4399 4400 4401 static void autostart_arrays(int part) 4402 { 4403 mdk_rdev_t *rdev; 4404 int i; 4405 4406 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4407 4408 for (i = 0; i < dev_cnt; i++) { 4409 dev_t dev = detected_devices[i]; 4410 4411 rdev = md_import_device(dev,0, 0); 4412 if (IS_ERR(rdev)) 4413 continue; 4414 4415 if (test_bit(Faulty, &rdev->flags)) { 4416 MD_BUG(); 4417 continue; 4418 } 4419 list_add(&rdev->same_set, &pending_raid_disks); 4420 } 4421 dev_cnt = 0; 4422 4423 autorun_devices(part); 4424 } 4425 4426 #endif 4427 4428 static __exit void md_exit(void) 4429 { 4430 mddev_t *mddev; 4431 struct list_head *tmp; 4432 int i; 4433 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4434 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4435 for (i=0; i < MAX_MD_DEVS; i++) 4436 devfs_remove("md/%d", i); 4437 for (i=0; i < MAX_MD_DEVS; i++) 4438 devfs_remove("md/d%d", i); 4439 4440 devfs_remove("md"); 4441 4442 unregister_blkdev(MAJOR_NR,"md"); 4443 unregister_blkdev(mdp_major, "mdp"); 4444 unregister_reboot_notifier(&md_notifier); 4445 unregister_sysctl_table(raid_table_header); 4446 remove_proc_entry("mdstat", NULL); 4447 ITERATE_MDDEV(mddev,tmp) { 4448 struct gendisk *disk = mddev->gendisk; 4449 if (!disk) 4450 continue; 4451 export_array(mddev); 4452 del_gendisk(disk); 4453 put_disk(disk); 4454 mddev->gendisk = NULL; 4455 mddev_put(mddev); 4456 } 4457 } 4458 4459 module_init(md_init) 4460 module_exit(md_exit) 4461 4462 EXPORT_SYMBOL(register_md_personality); 4463 EXPORT_SYMBOL(unregister_md_personality); 4464 EXPORT_SYMBOL(md_error); 4465 EXPORT_SYMBOL(md_done_sync); 4466 EXPORT_SYMBOL(md_write_start); 4467 EXPORT_SYMBOL(md_write_end); 4468 EXPORT_SYMBOL(md_register_thread); 4469 EXPORT_SYMBOL(md_unregister_thread); 4470 EXPORT_SYMBOL(md_wakeup_thread); 4471 EXPORT_SYMBOL(md_print_devices); 4472 EXPORT_SYMBOL(md_check_recovery); 4473 MODULE_LICENSE("GPL"); 4474 MODULE_ALIAS("md"); 4475 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4476