1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/devfs_fs_kernel.h> 43 #include <linux/buffer_head.h> /* for invalidate_bdev */ 44 #include <linux/suspend.h> 45 46 #include <linux/init.h> 47 48 #include <linux/file.h> 49 50 #ifdef CONFIG_KMOD 51 #include <linux/kmod.h> 52 #endif 53 54 #include <asm/unaligned.h> 55 56 #define MAJOR_NR MD_MAJOR 57 #define MD_DRIVER 58 59 /* 63 partitions with the alternate major number (mdp) */ 60 #define MdpMinorShift 6 61 62 #define DEBUG 0 63 #define dprintk(x...) ((void)(DEBUG && printk(x))) 64 65 66 #ifndef MODULE 67 static void autostart_arrays (int part); 68 #endif 69 70 static mdk_personality_t *pers[MAX_PERSONALITY]; 71 static DEFINE_SPINLOCK(pers_lock); 72 73 /* 74 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 75 * is 1000 KB/sec, so the extra system load does not show up that much. 76 * Increase it if you want to have more _guaranteed_ speed. Note that 77 * the RAID driver will use the maximum available bandwidth if the IO 78 * subsystem is idle. There is also an 'absolute maximum' reconstruction 79 * speed limit - in case reconstruction slows down your system despite 80 * idle IO detection. 81 * 82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
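 *
 * For example (illustrative shell session only, not part of the driver):
 *
 *	# cat /proc/sys/dev/raid/speed_limit_min
 *	1000
 *	# echo 50000 > /proc/sys/dev/raid/speed_limit_min
 *
 * would raise the guaranteed reconstruction rate to 50000 KB/sec.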
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while still holding
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)


static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio, bio->bi_size);
	return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
		list_del(&mddev->all_mddevs);
		blk_put_queue(mddev->queue);
		kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
	mddev_t *mddev, *new = NULL;

 retry:
	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit) {
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			kfree(new);
			return mddev;
		}

	if (new) {
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(*new));

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	init_MUTEX(&new->reconfig_sem);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
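	/*
	 * The caller gets the initial reference.  A typical usage pattern
	 * (sketch only, not a real call site in this file) is:
	 *
	 *	mddev_t *mddev = mddev_find(dev);
	 *	if (mddev) {
	 *		... use the array ...
	 *		mddev_put(mddev);
	 *	}
	 *
	 * mddev_put() frees the mddev again once the last reference is
	 * dropped and no disks remain attached.
	 */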
atomic_set(&new->active, 1); 227 spin_lock_init(&new->write_lock); 228 init_waitqueue_head(&new->sb_wait); 229 230 new->queue = blk_alloc_queue(GFP_KERNEL); 231 if (!new->queue) { 232 kfree(new); 233 return NULL; 234 } 235 236 blk_queue_make_request(new->queue, md_fail_request); 237 238 goto retry; 239 } 240 241 static inline int mddev_lock(mddev_t * mddev) 242 { 243 return down_interruptible(&mddev->reconfig_sem); 244 } 245 246 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 247 { 248 down(&mddev->reconfig_sem); 249 } 250 251 static inline int mddev_trylock(mddev_t * mddev) 252 { 253 return down_trylock(&mddev->reconfig_sem); 254 } 255 256 static inline void mddev_unlock(mddev_t * mddev) 257 { 258 up(&mddev->reconfig_sem); 259 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 if (bio->bi_size) 334 return 1; 335 336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 md_error(rdev->mddev, rdev); 338 339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 wake_up(&rdev->mddev->sb_wait); 341 bio_put(bio); 342 return 0; 343 } 344 345 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 346 sector_t sector, int size, struct page *page) 347 { 348 /* write first size bytes of page to sector of rdev 349 * Increment mddev->pending_writes before returning 350 * and decrement it on completion, waking up sb_wait 351 * if zero is reached. 
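	 *
	 * Callers that need all pending superblock writes to have completed
	 * wait with wait_event(mddev->sb_wait,
	 * atomic_read(&mddev->pending_writes)==0), as md_update_sb() does
	 * further below.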
352 * If an error occurred, call md_error 353 */ 354 struct bio *bio = bio_alloc(GFP_NOIO, 1); 355 356 bio->bi_bdev = rdev->bdev; 357 bio->bi_sector = sector; 358 bio_add_page(bio, page, size, 0); 359 bio->bi_private = rdev; 360 bio->bi_end_io = super_written; 361 atomic_inc(&mddev->pending_writes); 362 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 363 } 364 365 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 366 { 367 if (bio->bi_size) 368 return 1; 369 370 complete((struct completion*)bio->bi_private); 371 return 0; 372 } 373 374 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 375 struct page *page, int rw) 376 { 377 struct bio *bio = bio_alloc(GFP_NOIO, 1); 378 struct completion event; 379 int ret; 380 381 rw |= (1 << BIO_RW_SYNC); 382 383 bio->bi_bdev = bdev; 384 bio->bi_sector = sector; 385 bio_add_page(bio, page, size, 0); 386 init_completion(&event); 387 bio->bi_private = &event; 388 bio->bi_end_io = bi_complete; 389 submit_bio(rw, bio); 390 wait_for_completion(&event); 391 392 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 393 bio_put(bio); 394 return ret; 395 } 396 397 static int read_disk_sb(mdk_rdev_t * rdev, int size) 398 { 399 char b[BDEVNAME_SIZE]; 400 if (!rdev->sb_page) { 401 MD_BUG(); 402 return -EINVAL; 403 } 404 if (rdev->sb_loaded) 405 return 0; 406 407 408 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 409 goto fail; 410 rdev->sb_loaded = 1; 411 return 0; 412 413 fail: 414 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 415 bdevname(rdev->bdev,b)); 416 return -EINVAL; 417 } 418 419 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 420 { 421 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 422 (sb1->set_uuid1 == sb2->set_uuid1) && 423 (sb1->set_uuid2 == sb2->set_uuid2) && 424 (sb1->set_uuid3 == sb2->set_uuid3)) 425 426 return 1; 427 428 return 0; 429 } 430 431 432 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 433 { 434 int ret; 435 mdp_super_t *tmp1, *tmp2; 436 437 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 438 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 439 440 if (!tmp1 || !tmp2) { 441 ret = 0; 442 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 443 goto abort; 444 } 445 446 *tmp1 = *sb1; 447 *tmp2 = *sb2; 448 449 /* 450 * nr_disks is not constant 451 */ 452 tmp1->nr_disks = 0; 453 tmp2->nr_disks = 0; 454 455 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 456 ret = 0; 457 else 458 ret = 1; 459 460 abort: 461 kfree(tmp1); 462 kfree(tmp2); 463 return ret; 464 } 465 466 static unsigned int calc_sb_csum(mdp_super_t * sb) 467 { 468 unsigned int disk_csum, csum; 469 470 disk_csum = sb->sb_csum; 471 sb->sb_csum = 0; 472 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 473 sb->sb_csum = disk_csum; 474 return csum; 475 } 476 477 478 /* 479 * Handle superblock details. 480 * We want to be able to handle multiple superblock formats 481 * so we have a common interface to them all, and an array of 482 * different handlers. 483 * We rely on user-space to write the initial superblock, and support 484 * reading and updating of superblocks. 485 * Interface methods are: 486 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 487 * loads and validates a superblock on dev. 
488 * if refdev != NULL, compare superblocks on both devices 489 * Return: 490 * 0 - dev has a superblock that is compatible with refdev 491 * 1 - dev has a superblock that is compatible and newer than refdev 492 * so dev should be used as the refdev in future 493 * -EINVAL superblock incompatible or invalid 494 * -othererror e.g. -EIO 495 * 496 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 497 * Verify that dev is acceptable into mddev. 498 * The first time, mddev->raid_disks will be 0, and data from 499 * dev should be merged in. Subsequent calls check that dev 500 * is new enough. Return 0 or -EINVAL 501 * 502 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 503 * Update the superblock for rdev with data in mddev 504 * This does not write to disc. 505 * 506 */ 507 508 struct super_type { 509 char *name; 510 struct module *owner; 511 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 512 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 513 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 514 }; 515 516 /* 517 * load_super for 0.90.0 518 */ 519 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 520 { 521 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 522 mdp_super_t *sb; 523 int ret; 524 sector_t sb_offset; 525 526 /* 527 * Calculate the position of the superblock, 528 * it's at the end of the disk. 529 * 530 * It also happens to be a multiple of 4Kb. 531 */ 532 sb_offset = calc_dev_sboffset(rdev->bdev); 533 rdev->sb_offset = sb_offset; 534 535 ret = read_disk_sb(rdev, MD_SB_BYTES); 536 if (ret) return ret; 537 538 ret = -EINVAL; 539 540 bdevname(rdev->bdev, b); 541 sb = (mdp_super_t*)page_address(rdev->sb_page); 542 543 if (sb->md_magic != MD_SB_MAGIC) { 544 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 545 b); 546 goto abort; 547 } 548 549 if (sb->major_version != 0 || 550 sb->minor_version != 90) { 551 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 552 sb->major_version, sb->minor_version, 553 b); 554 goto abort; 555 } 556 557 if (sb->raid_disks <= 0) 558 goto abort; 559 560 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 561 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 562 b); 563 goto abort; 564 } 565 566 rdev->preferred_minor = sb->md_minor; 567 rdev->data_offset = 0; 568 rdev->sb_size = MD_SB_BYTES; 569 570 if (sb->level == LEVEL_MULTIPATH) 571 rdev->desc_nr = -1; 572 else 573 rdev->desc_nr = sb->this_disk.number; 574 575 if (refdev == 0) 576 ret = 1; 577 else { 578 __u64 ev1, ev2; 579 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 580 if (!uuid_equal(refsb, sb)) { 581 printk(KERN_WARNING "md: %s has different UUID to %s\n", 582 b, bdevname(refdev->bdev,b2)); 583 goto abort; 584 } 585 if (!sb_equal(refsb, sb)) { 586 printk(KERN_WARNING "md: %s has same UUID" 587 " but different superblock to %s\n", 588 b, bdevname(refdev->bdev, b2)); 589 goto abort; 590 } 591 ev1 = md_event(sb); 592 ev2 = md_event(refsb); 593 if (ev1 > ev2) 594 ret = 1; 595 else 596 ret = 0; 597 } 598 rdev->size = calc_dev_size(rdev, sb->chunk_size); 599 600 abort: 601 return ret; 602 } 603 604 /* 605 * validate_super for 0.90.0 606 */ 607 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 608 { 609 mdp_disk_t *desc; 610 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 611 612 rdev->raid_disk = -1; 613 rdev->in_sync = 0; 614 if (mddev->raid_disks == 0) { 615 mddev->major_version = 0; 616 mddev->minor_version = sb->minor_version; 617 
		mddev->patch_version = sb->patch_version;
		mddev->persistent = ! sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = md_event(sb);
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_file == NULL) {
			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
				/* FIXME use a better test */
				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = mddev->default_bitmap_offset;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		__u64 ev1 = md_event(sb);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		__u64 ev1 = md_event(sb);
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else /* just a hot-add of a new device, leave raid_disk at -1 */
		return 0;

	if (mddev->level != LEVEL_MULTIPATH) {
		rdev->faulty = 0;
		rdev->flags = 0;
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			rdev->faulty = 1;
		else if (desc->state & (1<<MD_DISK_SYNC) &&
			 desc->raid_disk < mddev->raid_disks) {
			rdev->in_sync = 1;
			rdev->raid_disk = desc->raid_disk;
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		rdev->in_sync = 1;
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data.
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0, failed=0, spare=0, nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->minor_version = mddev->minor_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size  = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<<MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	if (mddev->bitmap && mddev->bitmap_file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			rdev2->desc_nr = rdev2->raid_disk;
		else
			rdev2->desc_nr = next_spare++;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (rdev2->faulty) {
			d->state = (1<<MD_DISK_FAULTY);
			failed++;
		} else if (rdev2->in_sync) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}

	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	unsigned int disk_csum, csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	unsigned int *isuper = (unsigned int*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(unsigned short*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_offset;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
		sb_offset -= 8*2;
		sb_offset &= ~(sector_t)(4*2-1);
		/* convert from sectors to K */
		sb_offset /= 2;
		break;
	case 1:
		sb_offset = 0;
		break;
	case 2:
		sb_offset = 4;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_offset = sb_offset;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;


	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask)+1;

	if (refdev == 0)
		return 1;
	else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb =
			(struct mdp_superblock_1*)page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			return 1;
	}
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
	return 0;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	rdev->raid_disk = -1;
	rdev->in_sync = 0;
	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = le64_to_cpu(sb->size)/2;
		mddev->events = le64_to_cpu(sb->events);
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = 0;
		mddev->default_bitmap_offset = 1024;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL ) {
			if (mddev->level != 1) {
				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		__u64 ev1 = le64_to_cpu(sb->events);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		__u64 ev1 = le64_to_cpu(sb->events);
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else /* just a hot-add of a new device, leave raid_disk at -1 */
		return 0;

	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		rdev->desc_nr = le32_to_cpu(sb->dev_number);
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			rdev->faulty = 0;
			break;
		case 0xfffe: /* faulty */
			rdev->faulty = 1;
			break;
		default:
			rdev->in_sync = 1;
			rdev->faulty = 0;
			rdev->raid_disk = role;
			break;
		}
		rdev->flags = 0;
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		rdev->in_sync = 1;

	return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data.
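	 * Note that sb points at the page super_1_load() filled in, so any
	 * field not explicitly set below keeps the value read from disk.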
*/ 1026 1027 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1028 1029 sb->feature_map = 0; 1030 sb->pad0 = 0; 1031 memset(sb->pad1, 0, sizeof(sb->pad1)); 1032 memset(sb->pad2, 0, sizeof(sb->pad2)); 1033 memset(sb->pad3, 0, sizeof(sb->pad3)); 1034 1035 sb->utime = cpu_to_le64((__u64)mddev->utime); 1036 sb->events = cpu_to_le64(mddev->events); 1037 if (mddev->in_sync) 1038 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1039 else 1040 sb->resync_offset = cpu_to_le64(0); 1041 1042 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1043 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1044 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1045 } 1046 1047 max_dev = 0; 1048 ITERATE_RDEV(mddev,rdev2,tmp) 1049 if (rdev2->desc_nr+1 > max_dev) 1050 max_dev = rdev2->desc_nr+1; 1051 1052 sb->max_dev = cpu_to_le32(max_dev); 1053 for (i=0; i<max_dev;i++) 1054 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1055 1056 ITERATE_RDEV(mddev,rdev2,tmp) { 1057 i = rdev2->desc_nr; 1058 if (rdev2->faulty) 1059 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1060 else if (rdev2->in_sync) 1061 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1062 else 1063 sb->dev_roles[i] = cpu_to_le16(0xffff); 1064 } 1065 1066 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1067 sb->sb_csum = calc_sb_1_csum(sb); 1068 } 1069 1070 1071 static struct super_type super_types[] = { 1072 [0] = { 1073 .name = "0.90.0", 1074 .owner = THIS_MODULE, 1075 .load_super = super_90_load, 1076 .validate_super = super_90_validate, 1077 .sync_super = super_90_sync, 1078 }, 1079 [1] = { 1080 .name = "md-1", 1081 .owner = THIS_MODULE, 1082 .load_super = super_1_load, 1083 .validate_super = super_1_validate, 1084 .sync_super = super_1_sync, 1085 }, 1086 }; 1087 1088 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1089 { 1090 struct list_head *tmp; 1091 mdk_rdev_t *rdev; 1092 1093 ITERATE_RDEV(mddev,rdev,tmp) 1094 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1095 return rdev; 1096 1097 return NULL; 1098 } 1099 1100 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1101 { 1102 struct list_head *tmp; 1103 mdk_rdev_t *rdev; 1104 1105 ITERATE_RDEV(mddev1,rdev,tmp) 1106 if (match_dev_unit(mddev2, rdev)) 1107 return 1; 1108 1109 return 0; 1110 } 1111 1112 static LIST_HEAD(pending_raid_disks); 1113 1114 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1115 { 1116 mdk_rdev_t *same_pdev; 1117 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1118 1119 if (rdev->mddev) { 1120 MD_BUG(); 1121 return -EINVAL; 1122 } 1123 same_pdev = match_dev_unit(mddev, rdev); 1124 if (same_pdev) 1125 printk(KERN_WARNING 1126 "%s: WARNING: %s appears to be on the same physical" 1127 " disk as %s. True\n protection against single-disk" 1128 " failure might be compromised.\n", 1129 mdname(mddev), bdevname(rdev->bdev,b), 1130 bdevname(same_pdev->bdev,b2)); 1131 1132 /* Verify rdev->desc_nr is unique. 
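	 * (desc_nr is the slot this device occupies in the superblock's
	 * disk/role tables, see super_90_sync() and super_1_sync() above.)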
1133 * If it is -1, assign a free number, else 1134 * check number is not in use 1135 */ 1136 if (rdev->desc_nr < 0) { 1137 int choice = 0; 1138 if (mddev->pers) choice = mddev->raid_disks; 1139 while (find_rdev_nr(mddev, choice)) 1140 choice++; 1141 rdev->desc_nr = choice; 1142 } else { 1143 if (find_rdev_nr(mddev, rdev->desc_nr)) 1144 return -EBUSY; 1145 } 1146 1147 list_add(&rdev->same_set, &mddev->disks); 1148 rdev->mddev = mddev; 1149 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1150 return 0; 1151 } 1152 1153 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1154 { 1155 char b[BDEVNAME_SIZE]; 1156 if (!rdev->mddev) { 1157 MD_BUG(); 1158 return; 1159 } 1160 list_del_init(&rdev->same_set); 1161 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1162 rdev->mddev = NULL; 1163 } 1164 1165 /* 1166 * prevent the device from being mounted, repartitioned or 1167 * otherwise reused by a RAID array (or any other kernel 1168 * subsystem), by bd_claiming the device. 1169 */ 1170 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1171 { 1172 int err = 0; 1173 struct block_device *bdev; 1174 char b[BDEVNAME_SIZE]; 1175 1176 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1177 if (IS_ERR(bdev)) { 1178 printk(KERN_ERR "md: could not open %s.\n", 1179 __bdevname(dev, b)); 1180 return PTR_ERR(bdev); 1181 } 1182 err = bd_claim(bdev, rdev); 1183 if (err) { 1184 printk(KERN_ERR "md: could not bd_claim %s.\n", 1185 bdevname(bdev, b)); 1186 blkdev_put(bdev); 1187 return err; 1188 } 1189 rdev->bdev = bdev; 1190 return err; 1191 } 1192 1193 static void unlock_rdev(mdk_rdev_t *rdev) 1194 { 1195 struct block_device *bdev = rdev->bdev; 1196 rdev->bdev = NULL; 1197 if (!bdev) 1198 MD_BUG(); 1199 bd_release(bdev); 1200 blkdev_put(bdev); 1201 } 1202 1203 void md_autodetect_dev(dev_t dev); 1204 1205 static void export_rdev(mdk_rdev_t * rdev) 1206 { 1207 char b[BDEVNAME_SIZE]; 1208 printk(KERN_INFO "md: export_rdev(%s)\n", 1209 bdevname(rdev->bdev,b)); 1210 if (rdev->mddev) 1211 MD_BUG(); 1212 free_disk_sb(rdev); 1213 list_del_init(&rdev->same_set); 1214 #ifndef MODULE 1215 md_autodetect_dev(rdev->bdev->bd_dev); 1216 #endif 1217 unlock_rdev(rdev); 1218 kfree(rdev); 1219 } 1220 1221 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1222 { 1223 unbind_rdev_from_array(rdev); 1224 export_rdev(rdev); 1225 } 1226 1227 static void export_array(mddev_t *mddev) 1228 { 1229 struct list_head *tmp; 1230 mdk_rdev_t *rdev; 1231 1232 ITERATE_RDEV(mddev,rdev,tmp) { 1233 if (!rdev->mddev) { 1234 MD_BUG(); 1235 continue; 1236 } 1237 kick_rdev_from_array(rdev); 1238 } 1239 if (!list_empty(&mddev->disks)) 1240 MD_BUG(); 1241 mddev->raid_disks = 0; 1242 mddev->major_version = 0; 1243 } 1244 1245 static void print_desc(mdp_disk_t *desc) 1246 { 1247 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1248 desc->major,desc->minor,desc->raid_disk,desc->state); 1249 } 1250 1251 static void print_sb(mdp_super_t *sb) 1252 { 1253 int i; 1254 1255 printk(KERN_INFO 1256 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1257 sb->major_version, sb->minor_version, sb->patch_version, 1258 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1259 sb->ctime); 1260 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1261 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1262 sb->md_minor, sb->layout, sb->chunk_size); 1263 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1264 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1265 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1266 
sb->failed_disks, sb->spare_disks, 1267 sb->sb_csum, (unsigned long)sb->events_lo); 1268 1269 printk(KERN_INFO); 1270 for (i = 0; i < MD_SB_DISKS; i++) { 1271 mdp_disk_t *desc; 1272 1273 desc = sb->disks + i; 1274 if (desc->number || desc->major || desc->minor || 1275 desc->raid_disk || (desc->state && (desc->state != 4))) { 1276 printk(" D %2d: ", i); 1277 print_desc(desc); 1278 } 1279 } 1280 printk(KERN_INFO "md: THIS: "); 1281 print_desc(&sb->this_disk); 1282 1283 } 1284 1285 static void print_rdev(mdk_rdev_t *rdev) 1286 { 1287 char b[BDEVNAME_SIZE]; 1288 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1289 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1290 rdev->faulty, rdev->in_sync, rdev->desc_nr); 1291 if (rdev->sb_loaded) { 1292 printk(KERN_INFO "md: rdev superblock:\n"); 1293 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1294 } else 1295 printk(KERN_INFO "md: no rdev superblock!\n"); 1296 } 1297 1298 void md_print_devices(void) 1299 { 1300 struct list_head *tmp, *tmp2; 1301 mdk_rdev_t *rdev; 1302 mddev_t *mddev; 1303 char b[BDEVNAME_SIZE]; 1304 1305 printk("\n"); 1306 printk("md: **********************************\n"); 1307 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1308 printk("md: **********************************\n"); 1309 ITERATE_MDDEV(mddev,tmp) { 1310 1311 if (mddev->bitmap) 1312 bitmap_print_sb(mddev->bitmap); 1313 else 1314 printk("%s: ", mdname(mddev)); 1315 ITERATE_RDEV(mddev,rdev,tmp2) 1316 printk("<%s>", bdevname(rdev->bdev,b)); 1317 printk("\n"); 1318 1319 ITERATE_RDEV(mddev,rdev,tmp2) 1320 print_rdev(rdev); 1321 } 1322 printk("md: **********************************\n"); 1323 printk("\n"); 1324 } 1325 1326 1327 static void sync_sbs(mddev_t * mddev) 1328 { 1329 mdk_rdev_t *rdev; 1330 struct list_head *tmp; 1331 1332 ITERATE_RDEV(mddev,rdev,tmp) { 1333 super_types[mddev->major_version]. 1334 sync_super(mddev, rdev); 1335 rdev->sb_loaded = 1; 1336 } 1337 } 1338 1339 static void md_update_sb(mddev_t * mddev) 1340 { 1341 int err; 1342 struct list_head *tmp; 1343 mdk_rdev_t *rdev; 1344 int sync_req; 1345 1346 repeat: 1347 spin_lock(&mddev->write_lock); 1348 sync_req = mddev->in_sync; 1349 mddev->utime = get_seconds(); 1350 mddev->events ++; 1351 1352 if (!mddev->events) { 1353 /* 1354 * oops, this 64-bit counter should never wrap. 
1355 * Either we are in around ~1 trillion A.C., assuming 1356 * 1 reboot per second, or we have a bug: 1357 */ 1358 MD_BUG(); 1359 mddev->events --; 1360 } 1361 mddev->sb_dirty = 2; 1362 sync_sbs(mddev); 1363 1364 /* 1365 * do not write anything to disk if using 1366 * nonpersistent superblocks 1367 */ 1368 if (!mddev->persistent) { 1369 mddev->sb_dirty = 0; 1370 spin_unlock(&mddev->write_lock); 1371 wake_up(&mddev->sb_wait); 1372 return; 1373 } 1374 spin_unlock(&mddev->write_lock); 1375 1376 dprintk(KERN_INFO 1377 "md: updating %s RAID superblock on device (in sync %d)\n", 1378 mdname(mddev),mddev->in_sync); 1379 1380 err = bitmap_update_sb(mddev->bitmap); 1381 ITERATE_RDEV(mddev,rdev,tmp) { 1382 char b[BDEVNAME_SIZE]; 1383 dprintk(KERN_INFO "md: "); 1384 if (rdev->faulty) 1385 dprintk("(skipping faulty "); 1386 1387 dprintk("%s ", bdevname(rdev->bdev,b)); 1388 if (!rdev->faulty) { 1389 md_super_write(mddev,rdev, 1390 rdev->sb_offset<<1, rdev->sb_size, 1391 rdev->sb_page); 1392 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1393 bdevname(rdev->bdev,b), 1394 (unsigned long long)rdev->sb_offset); 1395 1396 } else 1397 dprintk(")\n"); 1398 if (mddev->level == LEVEL_MULTIPATH) 1399 /* only need to write one superblock... */ 1400 break; 1401 } 1402 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1403 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1404 1405 spin_lock(&mddev->write_lock); 1406 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1407 /* have to write it out again */ 1408 spin_unlock(&mddev->write_lock); 1409 goto repeat; 1410 } 1411 mddev->sb_dirty = 0; 1412 spin_unlock(&mddev->write_lock); 1413 wake_up(&mddev->sb_wait); 1414 1415 } 1416 1417 /* 1418 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1419 * 1420 * mark the device faulty if: 1421 * 1422 * - the device is nonexistent (zero size) 1423 * - the device has no valid superblock 1424 * 1425 * a faulty rdev _never_ has rdev->sb set. 1426 */ 1427 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1428 { 1429 char b[BDEVNAME_SIZE]; 1430 int err; 1431 mdk_rdev_t *rdev; 1432 sector_t size; 1433 1434 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1435 if (!rdev) { 1436 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1437 return ERR_PTR(-ENOMEM); 1438 } 1439 memset(rdev, 0, sizeof(*rdev)); 1440 1441 if ((err = alloc_disk_sb(rdev))) 1442 goto abort_free; 1443 1444 err = lock_rdev(rdev, newdev); 1445 if (err) 1446 goto abort_free; 1447 1448 rdev->desc_nr = -1; 1449 rdev->faulty = 0; 1450 rdev->in_sync = 0; 1451 rdev->data_offset = 0; 1452 atomic_set(&rdev->nr_pending, 0); 1453 1454 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1455 if (!size) { 1456 printk(KERN_WARNING 1457 "md: %s has zero or unknown size, marking faulty!\n", 1458 bdevname(rdev->bdev,b)); 1459 err = -EINVAL; 1460 goto abort_free; 1461 } 1462 1463 if (super_format >= 0) { 1464 err = super_types[super_format]. 
1465 load_super(rdev, NULL, super_minor); 1466 if (err == -EINVAL) { 1467 printk(KERN_WARNING 1468 "md: %s has invalid sb, not importing!\n", 1469 bdevname(rdev->bdev,b)); 1470 goto abort_free; 1471 } 1472 if (err < 0) { 1473 printk(KERN_WARNING 1474 "md: could not read %s's sb, not importing!\n", 1475 bdevname(rdev->bdev,b)); 1476 goto abort_free; 1477 } 1478 } 1479 INIT_LIST_HEAD(&rdev->same_set); 1480 1481 return rdev; 1482 1483 abort_free: 1484 if (rdev->sb_page) { 1485 if (rdev->bdev) 1486 unlock_rdev(rdev); 1487 free_disk_sb(rdev); 1488 } 1489 kfree(rdev); 1490 return ERR_PTR(err); 1491 } 1492 1493 /* 1494 * Check a full RAID array for plausibility 1495 */ 1496 1497 1498 static void analyze_sbs(mddev_t * mddev) 1499 { 1500 int i; 1501 struct list_head *tmp; 1502 mdk_rdev_t *rdev, *freshest; 1503 char b[BDEVNAME_SIZE]; 1504 1505 freshest = NULL; 1506 ITERATE_RDEV(mddev,rdev,tmp) 1507 switch (super_types[mddev->major_version]. 1508 load_super(rdev, freshest, mddev->minor_version)) { 1509 case 1: 1510 freshest = rdev; 1511 break; 1512 case 0: 1513 break; 1514 default: 1515 printk( KERN_ERR \ 1516 "md: fatal superblock inconsistency in %s" 1517 " -- removing from array\n", 1518 bdevname(rdev->bdev,b)); 1519 kick_rdev_from_array(rdev); 1520 } 1521 1522 1523 super_types[mddev->major_version]. 1524 validate_super(mddev, freshest); 1525 1526 i = 0; 1527 ITERATE_RDEV(mddev,rdev,tmp) { 1528 if (rdev != freshest) 1529 if (super_types[mddev->major_version]. 1530 validate_super(mddev, rdev)) { 1531 printk(KERN_WARNING "md: kicking non-fresh %s" 1532 " from array!\n", 1533 bdevname(rdev->bdev,b)); 1534 kick_rdev_from_array(rdev); 1535 continue; 1536 } 1537 if (mddev->level == LEVEL_MULTIPATH) { 1538 rdev->desc_nr = i++; 1539 rdev->raid_disk = rdev->desc_nr; 1540 rdev->in_sync = 1; 1541 } 1542 } 1543 1544 1545 1546 if (mddev->recovery_cp != MaxSector && 1547 mddev->level >= 1) 1548 printk(KERN_ERR "md: %s: raid array is not clean" 1549 " -- starting background reconstruction\n", 1550 mdname(mddev)); 1551 1552 } 1553 1554 int mdp_major = 0; 1555 1556 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1557 { 1558 static DECLARE_MUTEX(disks_sem); 1559 mddev_t *mddev = mddev_find(dev); 1560 struct gendisk *disk; 1561 int partitioned = (MAJOR(dev) != MD_MAJOR); 1562 int shift = partitioned ? 
MdpMinorShift : 0; 1563 int unit = MINOR(dev) >> shift; 1564 1565 if (!mddev) 1566 return NULL; 1567 1568 down(&disks_sem); 1569 if (mddev->gendisk) { 1570 up(&disks_sem); 1571 mddev_put(mddev); 1572 return NULL; 1573 } 1574 disk = alloc_disk(1 << shift); 1575 if (!disk) { 1576 up(&disks_sem); 1577 mddev_put(mddev); 1578 return NULL; 1579 } 1580 disk->major = MAJOR(dev); 1581 disk->first_minor = unit << shift; 1582 if (partitioned) { 1583 sprintf(disk->disk_name, "md_d%d", unit); 1584 sprintf(disk->devfs_name, "md/d%d", unit); 1585 } else { 1586 sprintf(disk->disk_name, "md%d", unit); 1587 sprintf(disk->devfs_name, "md/%d", unit); 1588 } 1589 disk->fops = &md_fops; 1590 disk->private_data = mddev; 1591 disk->queue = mddev->queue; 1592 add_disk(disk); 1593 mddev->gendisk = disk; 1594 up(&disks_sem); 1595 return NULL; 1596 } 1597 1598 void md_wakeup_thread(mdk_thread_t *thread); 1599 1600 static void md_safemode_timeout(unsigned long data) 1601 { 1602 mddev_t *mddev = (mddev_t *) data; 1603 1604 mddev->safemode = 1; 1605 md_wakeup_thread(mddev->thread); 1606 } 1607 1608 1609 static int do_md_run(mddev_t * mddev) 1610 { 1611 int pnum, err; 1612 int chunk_size; 1613 struct list_head *tmp; 1614 mdk_rdev_t *rdev; 1615 struct gendisk *disk; 1616 char b[BDEVNAME_SIZE]; 1617 1618 if (list_empty(&mddev->disks)) 1619 /* cannot run an array with no devices.. */ 1620 return -EINVAL; 1621 1622 if (mddev->pers) 1623 return -EBUSY; 1624 1625 /* 1626 * Analyze all RAID superblock(s) 1627 */ 1628 if (!mddev->raid_disks) 1629 analyze_sbs(mddev); 1630 1631 chunk_size = mddev->chunk_size; 1632 pnum = level_to_pers(mddev->level); 1633 1634 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1635 if (!chunk_size) { 1636 /* 1637 * 'default chunksize' in the old md code used to 1638 * be PAGE_SIZE, baaad. 1639 * we abort here to be on the safe side. We don't 1640 * want to continue the bad practice. 1641 */ 1642 printk(KERN_ERR 1643 "no chunksize specified, see 'man raidtab'\n"); 1644 return -EINVAL; 1645 } 1646 if (chunk_size > MAX_CHUNK_SIZE) { 1647 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1648 chunk_size, MAX_CHUNK_SIZE); 1649 return -EINVAL; 1650 } 1651 /* 1652 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1653 */ 1654 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1655 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1656 return -EINVAL; 1657 } 1658 if (chunk_size < PAGE_SIZE) { 1659 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1660 chunk_size, PAGE_SIZE); 1661 return -EINVAL; 1662 } 1663 1664 /* devices must have minimum size of one chunk */ 1665 ITERATE_RDEV(mddev,rdev,tmp) { 1666 if (rdev->faulty) 1667 continue; 1668 if (rdev->size < chunk_size / 1024) { 1669 printk(KERN_WARNING 1670 "md: Dev %s smaller than chunk_size:" 1671 " %lluk < %dk\n", 1672 bdevname(rdev->bdev,b), 1673 (unsigned long long)rdev->size, 1674 chunk_size / 1024); 1675 return -EINVAL; 1676 } 1677 } 1678 } 1679 1680 #ifdef CONFIG_KMOD 1681 if (!pers[pnum]) 1682 { 1683 request_module("md-personality-%d", pnum); 1684 } 1685 #endif 1686 1687 /* 1688 * Drop all container device buffers, from now on 1689 * the only valid external interface is through the md 1690 * device. 
1691 * Also find largest hardsector size 1692 */ 1693 ITERATE_RDEV(mddev,rdev,tmp) { 1694 if (rdev->faulty) 1695 continue; 1696 sync_blockdev(rdev->bdev); 1697 invalidate_bdev(rdev->bdev, 0); 1698 } 1699 1700 md_probe(mddev->unit, NULL, NULL); 1701 disk = mddev->gendisk; 1702 if (!disk) 1703 return -ENOMEM; 1704 1705 spin_lock(&pers_lock); 1706 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1707 spin_unlock(&pers_lock); 1708 printk(KERN_WARNING "md: personality %d is not loaded!\n", 1709 pnum); 1710 return -EINVAL; 1711 } 1712 1713 mddev->pers = pers[pnum]; 1714 spin_unlock(&pers_lock); 1715 1716 mddev->recovery = 0; 1717 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1718 1719 /* before we start the array running, initialise the bitmap */ 1720 err = bitmap_create(mddev); 1721 if (err) 1722 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 1723 mdname(mddev), err); 1724 else 1725 err = mddev->pers->run(mddev); 1726 if (err) { 1727 printk(KERN_ERR "md: pers->run() failed ...\n"); 1728 module_put(mddev->pers->owner); 1729 mddev->pers = NULL; 1730 bitmap_destroy(mddev); 1731 return err; 1732 } 1733 atomic_set(&mddev->writes_pending,0); 1734 mddev->safemode = 0; 1735 mddev->safemode_timer.function = md_safemode_timeout; 1736 mddev->safemode_timer.data = (unsigned long) mddev; 1737 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 1738 mddev->in_sync = 1; 1739 1740 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1741 md_wakeup_thread(mddev->thread); 1742 1743 if (mddev->sb_dirty) 1744 md_update_sb(mddev); 1745 1746 set_capacity(disk, mddev->array_size<<1); 1747 1748 /* If we call blk_queue_make_request here, it will 1749 * re-initialise max_sectors etc which may have been 1750 * refined inside -> run. So just set the bits we need to set. 1751 * Most initialisation happended when we called 1752 * blk_queue_make_request(..., md_fail_request) 1753 * earlier. 
1754 */ 1755 mddev->queue->queuedata = mddev; 1756 mddev->queue->make_request_fn = mddev->pers->make_request; 1757 1758 mddev->changed = 1; 1759 return 0; 1760 } 1761 1762 static int restart_array(mddev_t *mddev) 1763 { 1764 struct gendisk *disk = mddev->gendisk; 1765 int err; 1766 1767 /* 1768 * Complain if it has no devices 1769 */ 1770 err = -ENXIO; 1771 if (list_empty(&mddev->disks)) 1772 goto out; 1773 1774 if (mddev->pers) { 1775 err = -EBUSY; 1776 if (!mddev->ro) 1777 goto out; 1778 1779 mddev->safemode = 0; 1780 mddev->ro = 0; 1781 set_disk_ro(disk, 0); 1782 1783 printk(KERN_INFO "md: %s switched to read-write mode.\n", 1784 mdname(mddev)); 1785 /* 1786 * Kick recovery or resync if necessary 1787 */ 1788 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1789 md_wakeup_thread(mddev->thread); 1790 err = 0; 1791 } else { 1792 printk(KERN_ERR "md: %s has no personality assigned.\n", 1793 mdname(mddev)); 1794 err = -EINVAL; 1795 } 1796 1797 out: 1798 return err; 1799 } 1800 1801 static int do_md_stop(mddev_t * mddev, int ro) 1802 { 1803 int err = 0; 1804 struct gendisk *disk = mddev->gendisk; 1805 1806 if (mddev->pers) { 1807 if (atomic_read(&mddev->active)>2) { 1808 printk("md: %s still in use.\n",mdname(mddev)); 1809 return -EBUSY; 1810 } 1811 1812 if (mddev->sync_thread) { 1813 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1814 md_unregister_thread(mddev->sync_thread); 1815 mddev->sync_thread = NULL; 1816 } 1817 1818 del_timer_sync(&mddev->safemode_timer); 1819 1820 invalidate_partition(disk, 0); 1821 1822 if (ro) { 1823 err = -ENXIO; 1824 if (mddev->ro) 1825 goto out; 1826 mddev->ro = 1; 1827 } else { 1828 bitmap_flush(mddev); 1829 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1830 if (mddev->ro) 1831 set_disk_ro(disk, 0); 1832 blk_queue_make_request(mddev->queue, md_fail_request); 1833 mddev->pers->stop(mddev); 1834 module_put(mddev->pers->owner); 1835 mddev->pers = NULL; 1836 if (mddev->ro) 1837 mddev->ro = 0; 1838 } 1839 if (!mddev->in_sync) { 1840 /* mark array as shutdown cleanly */ 1841 mddev->in_sync = 1; 1842 md_update_sb(mddev); 1843 } 1844 if (ro) 1845 set_disk_ro(disk, 1); 1846 } 1847 1848 bitmap_destroy(mddev); 1849 if (mddev->bitmap_file) { 1850 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 1851 fput(mddev->bitmap_file); 1852 mddev->bitmap_file = NULL; 1853 } 1854 mddev->bitmap_offset = 0; 1855 1856 /* 1857 * Free resources if final stop 1858 */ 1859 if (!ro) { 1860 struct gendisk *disk; 1861 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 1862 1863 export_array(mddev); 1864 1865 mddev->array_size = 0; 1866 disk = mddev->gendisk; 1867 if (disk) 1868 set_capacity(disk, 0); 1869 mddev->changed = 1; 1870 } else 1871 printk(KERN_INFO "md: %s switched to read-only mode.\n", 1872 mdname(mddev)); 1873 err = 0; 1874 out: 1875 return err; 1876 } 1877 1878 static void autorun_array(mddev_t *mddev) 1879 { 1880 mdk_rdev_t *rdev; 1881 struct list_head *tmp; 1882 int err; 1883 1884 if (list_empty(&mddev->disks)) 1885 return; 1886 1887 printk(KERN_INFO "md: running: "); 1888 1889 ITERATE_RDEV(mddev,rdev,tmp) { 1890 char b[BDEVNAME_SIZE]; 1891 printk("<%s>", bdevname(rdev->bdev,b)); 1892 } 1893 printk("\n"); 1894 1895 err = do_md_run (mddev); 1896 if (err) { 1897 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 1898 do_md_stop (mddev, 0); 1899 } 1900 } 1901 1902 /* 1903 * lets try to run arrays based on all disks that have arrived 1904 * until now. 
(those are in pending_raid_disks) 1905 * 1906 * the method: pick the first pending disk, collect all disks with 1907 * the same UUID, remove all from the pending list and put them into 1908 * the 'same_array' list. Then order this list based on superblock 1909 * update time (freshest comes first), kick out 'old' disks and 1910 * compare superblocks. If everything's fine then run it. 1911 * 1912 * If "unit" is allocated, then bump its reference count 1913 */ 1914 static void autorun_devices(int part) 1915 { 1916 struct list_head candidates; 1917 struct list_head *tmp; 1918 mdk_rdev_t *rdev0, *rdev; 1919 mddev_t *mddev; 1920 char b[BDEVNAME_SIZE]; 1921 1922 printk(KERN_INFO "md: autorun ...\n"); 1923 while (!list_empty(&pending_raid_disks)) { 1924 dev_t dev; 1925 rdev0 = list_entry(pending_raid_disks.next, 1926 mdk_rdev_t, same_set); 1927 1928 printk(KERN_INFO "md: considering %s ...\n", 1929 bdevname(rdev0->bdev,b)); 1930 INIT_LIST_HEAD(&candidates); 1931 ITERATE_RDEV_PENDING(rdev,tmp) 1932 if (super_90_load(rdev, rdev0, 0) >= 0) { 1933 printk(KERN_INFO "md: adding %s ...\n", 1934 bdevname(rdev->bdev,b)); 1935 list_move(&rdev->same_set, &candidates); 1936 } 1937 /* 1938 * now we have a set of devices, with all of them having 1939 * mostly sane superblocks. It's time to allocate the 1940 * mddev. 1941 */ 1942 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 1943 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 1944 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 1945 break; 1946 } 1947 if (part) 1948 dev = MKDEV(mdp_major, 1949 rdev0->preferred_minor << MdpMinorShift); 1950 else 1951 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 1952 1953 md_probe(dev, NULL, NULL); 1954 mddev = mddev_find(dev); 1955 if (!mddev) { 1956 printk(KERN_ERR 1957 "md: cannot allocate memory for md drive.\n"); 1958 break; 1959 } 1960 if (mddev_lock(mddev)) 1961 printk(KERN_WARNING "md: %s locked, cannot run\n", 1962 mdname(mddev)); 1963 else if (mddev->raid_disks || mddev->major_version 1964 || !list_empty(&mddev->disks)) { 1965 printk(KERN_WARNING 1966 "md: %s already running, cannot run %s\n", 1967 mdname(mddev), bdevname(rdev0->bdev,b)); 1968 mddev_unlock(mddev); 1969 } else { 1970 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 1971 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 1972 list_del_init(&rdev->same_set); 1973 if (bind_rdev_to_array(rdev, mddev)) 1974 export_rdev(rdev); 1975 } 1976 autorun_array(mddev); 1977 mddev_unlock(mddev); 1978 } 1979 /* on success, candidates will be empty, on error 1980 * it won't... 1981 */ 1982 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 1983 export_rdev(rdev); 1984 mddev_put(mddev); 1985 } 1986 printk(KERN_INFO "md: ... autorun DONE.\n"); 1987 } 1988 1989 /* 1990 * import RAID devices based on one partition 1991 * if possible, the array gets run as well. 
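 * (autostart_array() below imports the given start device, then every
 * disk listed in its 0.90 superblock, and finally hands the whole set
 * to autorun_devices().)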
1992 */ 1993 1994 static int autostart_array(dev_t startdev) 1995 { 1996 char b[BDEVNAME_SIZE]; 1997 int err = -EINVAL, i; 1998 mdp_super_t *sb = NULL; 1999 mdk_rdev_t *start_rdev = NULL, *rdev; 2000 2001 start_rdev = md_import_device(startdev, 0, 0); 2002 if (IS_ERR(start_rdev)) 2003 return err; 2004 2005 2006 /* NOTE: this can only work for 0.90.0 superblocks */ 2007 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 2008 if (sb->major_version != 0 || 2009 sb->minor_version != 90 ) { 2010 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 2011 export_rdev(start_rdev); 2012 return err; 2013 } 2014 2015 if (start_rdev->faulty) { 2016 printk(KERN_WARNING 2017 "md: can not autostart based on faulty %s!\n", 2018 bdevname(start_rdev->bdev,b)); 2019 export_rdev(start_rdev); 2020 return err; 2021 } 2022 list_add(&start_rdev->same_set, &pending_raid_disks); 2023 2024 for (i = 0; i < MD_SB_DISKS; i++) { 2025 mdp_disk_t *desc = sb->disks + i; 2026 dev_t dev = MKDEV(desc->major, desc->minor); 2027 2028 if (!dev) 2029 continue; 2030 if (dev == startdev) 2031 continue; 2032 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2033 continue; 2034 rdev = md_import_device(dev, 0, 0); 2035 if (IS_ERR(rdev)) 2036 continue; 2037 2038 list_add(&rdev->same_set, &pending_raid_disks); 2039 } 2040 2041 /* 2042 * possibly return codes 2043 */ 2044 autorun_devices(0); 2045 return 0; 2046 2047 } 2048 2049 2050 static int get_version(void __user * arg) 2051 { 2052 mdu_version_t ver; 2053 2054 ver.major = MD_MAJOR_VERSION; 2055 ver.minor = MD_MINOR_VERSION; 2056 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2057 2058 if (copy_to_user(arg, &ver, sizeof(ver))) 2059 return -EFAULT; 2060 2061 return 0; 2062 } 2063 2064 static int get_array_info(mddev_t * mddev, void __user * arg) 2065 { 2066 mdu_array_info_t info; 2067 int nr,working,active,failed,spare; 2068 mdk_rdev_t *rdev; 2069 struct list_head *tmp; 2070 2071 nr=working=active=failed=spare=0; 2072 ITERATE_RDEV(mddev,rdev,tmp) { 2073 nr++; 2074 if (rdev->faulty) 2075 failed++; 2076 else { 2077 working++; 2078 if (rdev->in_sync) 2079 active++; 2080 else 2081 spare++; 2082 } 2083 } 2084 2085 info.major_version = mddev->major_version; 2086 info.minor_version = mddev->minor_version; 2087 info.patch_version = MD_PATCHLEVEL_VERSION; 2088 info.ctime = mddev->ctime; 2089 info.level = mddev->level; 2090 info.size = mddev->size; 2091 info.nr_disks = nr; 2092 info.raid_disks = mddev->raid_disks; 2093 info.md_minor = mddev->md_minor; 2094 info.not_persistent= !mddev->persistent; 2095 2096 info.utime = mddev->utime; 2097 info.state = 0; 2098 if (mddev->in_sync) 2099 info.state = (1<<MD_SB_CLEAN); 2100 if (mddev->bitmap && mddev->bitmap_offset) 2101 info.state = (1<<MD_SB_BITMAP_PRESENT); 2102 info.active_disks = active; 2103 info.working_disks = working; 2104 info.failed_disks = failed; 2105 info.spare_disks = spare; 2106 2107 info.layout = mddev->layout; 2108 info.chunk_size = mddev->chunk_size; 2109 2110 if (copy_to_user(arg, &info, sizeof(info))) 2111 return -EFAULT; 2112 2113 return 0; 2114 } 2115 2116 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 2117 { 2118 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2119 char *ptr, *buf = NULL; 2120 int err = -ENOMEM; 2121 2122 file = kmalloc(sizeof(*file), GFP_KERNEL); 2123 if (!file) 2124 goto out; 2125 2126 /* bitmap disabled, zero the first byte and copy out */ 2127 if (!mddev->bitmap || !mddev->bitmap->file) { 2128 file->pathname[0] = '\0'; 2129 goto copy_out; 2130 } 2131 2132 
buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2133 if (!buf) 2134 goto out; 2135 2136 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2137 if (!ptr) 2138 goto out; 2139 2140 strcpy(file->pathname, ptr); 2141 2142 copy_out: 2143 err = 0; 2144 if (copy_to_user(arg, file, sizeof(*file))) 2145 err = -EFAULT; 2146 out: 2147 kfree(buf); 2148 kfree(file); 2149 return err; 2150 } 2151 2152 static int get_disk_info(mddev_t * mddev, void __user * arg) 2153 { 2154 mdu_disk_info_t info; 2155 unsigned int nr; 2156 mdk_rdev_t *rdev; 2157 2158 if (copy_from_user(&info, arg, sizeof(info))) 2159 return -EFAULT; 2160 2161 nr = info.number; 2162 2163 rdev = find_rdev_nr(mddev, nr); 2164 if (rdev) { 2165 info.major = MAJOR(rdev->bdev->bd_dev); 2166 info.minor = MINOR(rdev->bdev->bd_dev); 2167 info.raid_disk = rdev->raid_disk; 2168 info.state = 0; 2169 if (rdev->faulty) 2170 info.state |= (1<<MD_DISK_FAULTY); 2171 else if (rdev->in_sync) { 2172 info.state |= (1<<MD_DISK_ACTIVE); 2173 info.state |= (1<<MD_DISK_SYNC); 2174 } 2175 if (test_bit(WriteMostly, &rdev->flags)) 2176 info.state |= (1<<MD_DISK_WRITEMOSTLY); 2177 } else { 2178 info.major = info.minor = 0; 2179 info.raid_disk = -1; 2180 info.state = (1<<MD_DISK_REMOVED); 2181 } 2182 2183 if (copy_to_user(arg, &info, sizeof(info))) 2184 return -EFAULT; 2185 2186 return 0; 2187 } 2188 2189 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2190 { 2191 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2192 mdk_rdev_t *rdev; 2193 dev_t dev = MKDEV(info->major,info->minor); 2194 2195 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2196 return -EOVERFLOW; 2197 2198 if (!mddev->raid_disks) { 2199 int err; 2200 /* expecting a device which has a superblock */ 2201 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2202 if (IS_ERR(rdev)) { 2203 printk(KERN_WARNING 2204 "md: md_import_device returned %ld\n", 2205 PTR_ERR(rdev)); 2206 return PTR_ERR(rdev); 2207 } 2208 if (!list_empty(&mddev->disks)) { 2209 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2210 mdk_rdev_t, same_set); 2211 int err = super_types[mddev->major_version] 2212 .load_super(rdev, rdev0, mddev->minor_version); 2213 if (err < 0) { 2214 printk(KERN_WARNING 2215 "md: %s has different UUID to %s\n", 2216 bdevname(rdev->bdev,b), 2217 bdevname(rdev0->bdev,b2)); 2218 export_rdev(rdev); 2219 return -EINVAL; 2220 } 2221 } 2222 err = bind_rdev_to_array(rdev, mddev); 2223 if (err) 2224 export_rdev(rdev); 2225 return err; 2226 } 2227 2228 /* 2229 * add_new_disk can be used once the array is assembled 2230 * to add "hot spares". They must already have a superblock 2231 * written 2232 */ 2233 if (mddev->pers) { 2234 int err; 2235 if (!mddev->pers->hot_add_disk) { 2236 printk(KERN_WARNING 2237 "%s: personality does not support diskops!\n", 2238 mdname(mddev)); 2239 return -EINVAL; 2240 } 2241 if (mddev->persistent) 2242 rdev = md_import_device(dev, mddev->major_version, 2243 mddev->minor_version); 2244 else 2245 rdev = md_import_device(dev, -1, -1); 2246 if (IS_ERR(rdev)) { 2247 printk(KERN_WARNING 2248 "md: md_import_device returned %ld\n", 2249 PTR_ERR(rdev)); 2250 return PTR_ERR(rdev); 2251 } 2252 /* set save_raid_disk if appropriate */ 2253 if (!mddev->persistent) { 2254 if (info->state & (1<<MD_DISK_SYNC) && 2255 info->raid_disk < mddev->raid_disks) 2256 rdev->raid_disk = info->raid_disk; 2257 else 2258 rdev->raid_disk = -1; 2259 } else 2260 super_types[mddev->major_version]. 
2261 validate_super(mddev, rdev); 2262 rdev->saved_raid_disk = rdev->raid_disk; 2263 2264 rdev->in_sync = 0; /* just to be sure */ 2265 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2266 set_bit(WriteMostly, &rdev->flags); 2267 2268 rdev->raid_disk = -1; 2269 err = bind_rdev_to_array(rdev, mddev); 2270 if (err) 2271 export_rdev(rdev); 2272 2273 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2274 md_wakeup_thread(mddev->thread); 2275 return err; 2276 } 2277 2278 /* otherwise, add_new_disk is only allowed 2279 * for major_version==0 superblocks 2280 */ 2281 if (mddev->major_version != 0) { 2282 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2283 mdname(mddev)); 2284 return -EINVAL; 2285 } 2286 2287 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2288 int err; 2289 rdev = md_import_device (dev, -1, 0); 2290 if (IS_ERR(rdev)) { 2291 printk(KERN_WARNING 2292 "md: error, md_import_device() returned %ld\n", 2293 PTR_ERR(rdev)); 2294 return PTR_ERR(rdev); 2295 } 2296 rdev->desc_nr = info->number; 2297 if (info->raid_disk < mddev->raid_disks) 2298 rdev->raid_disk = info->raid_disk; 2299 else 2300 rdev->raid_disk = -1; 2301 2302 rdev->faulty = 0; 2303 if (rdev->raid_disk < mddev->raid_disks) 2304 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); 2305 else 2306 rdev->in_sync = 0; 2307 2308 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 2309 set_bit(WriteMostly, &rdev->flags); 2310 2311 err = bind_rdev_to_array(rdev, mddev); 2312 if (err) { 2313 export_rdev(rdev); 2314 return err; 2315 } 2316 2317 if (!mddev->persistent) { 2318 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2319 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2320 } else 2321 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2322 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2323 2324 if (!mddev->size || (mddev->size > rdev->size)) 2325 mddev->size = rdev->size; 2326 } 2327 2328 return 0; 2329 } 2330 2331 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2332 { 2333 char b[BDEVNAME_SIZE]; 2334 mdk_rdev_t *rdev; 2335 2336 if (!mddev->pers) 2337 return -ENODEV; 2338 2339 rdev = find_rdev(mddev, dev); 2340 if (!rdev) 2341 return -ENXIO; 2342 2343 if (rdev->raid_disk >= 0) 2344 goto busy; 2345 2346 kick_rdev_from_array(rdev); 2347 md_update_sb(mddev); 2348 2349 return 0; 2350 busy: 2351 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 2352 bdevname(rdev->bdev,b), mdname(mddev)); 2353 return -EBUSY; 2354 } 2355 2356 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2357 { 2358 char b[BDEVNAME_SIZE]; 2359 int err; 2360 unsigned int size; 2361 mdk_rdev_t *rdev; 2362 2363 if (!mddev->pers) 2364 return -ENODEV; 2365 2366 if (mddev->major_version != 0) { 2367 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2368 " version-0 superblocks.\n", 2369 mdname(mddev)); 2370 return -EINVAL; 2371 } 2372 if (!mddev->pers->hot_add_disk) { 2373 printk(KERN_WARNING 2374 "%s: personality does not support diskops!\n", 2375 mdname(mddev)); 2376 return -EINVAL; 2377 } 2378 2379 rdev = md_import_device (dev, -1, 0); 2380 if (IS_ERR(rdev)) { 2381 printk(KERN_WARNING 2382 "md: error, md_import_device() returned %ld\n", 2383 PTR_ERR(rdev)); 2384 return -EINVAL; 2385 } 2386 2387 if (mddev->persistent) 2388 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2389 else 2390 rdev->sb_offset = 2391 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2392 2393 size = calc_dev_size(rdev, mddev->chunk_size); 2394 rdev->size = size; 2395 2396 if (size < mddev->size) { 2397 printk(KERN_WARNING 2398 "%s: disk size %llu blocks < array size %llu\n", 2399 mdname(mddev), (unsigned long long)size, 2400 (unsigned long long)mddev->size); 2401 err = -ENOSPC; 2402 goto abort_export; 2403 } 2404 2405 if (rdev->faulty) { 2406 printk(KERN_WARNING 2407 "md: can not hot-add faulty %s disk to %s!\n", 2408 bdevname(rdev->bdev,b), mdname(mddev)); 2409 err = -EINVAL; 2410 goto abort_export; 2411 } 2412 rdev->in_sync = 0; 2413 rdev->desc_nr = -1; 2414 bind_rdev_to_array(rdev, mddev); 2415 2416 /* 2417 * The rest should better be atomic, we can have disk failures 2418 * noticed in interrupt contexts ... 2419 */ 2420 2421 if (rdev->desc_nr == mddev->max_disks) { 2422 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2423 mdname(mddev)); 2424 err = -EBUSY; 2425 goto abort_unbind_export; 2426 } 2427 2428 rdev->raid_disk = -1; 2429 2430 md_update_sb(mddev); 2431 2432 /* 2433 * Kick recovery, maybe this spare has to be added to the 2434 * array immediately. 2435 */ 2436 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2437 md_wakeup_thread(mddev->thread); 2438 2439 return 0; 2440 2441 abort_unbind_export: 2442 unbind_rdev_from_array(rdev); 2443 2444 abort_export: 2445 export_rdev(rdev); 2446 return err; 2447 } 2448 2449 /* similar to deny_write_access, but accounts for our holding a reference 2450 * to the file ourselves */ 2451 static int deny_bitmap_write_access(struct file * file) 2452 { 2453 struct inode *inode = file->f_mapping->host; 2454 2455 spin_lock(&inode->i_lock); 2456 if (atomic_read(&inode->i_writecount) > 1) { 2457 spin_unlock(&inode->i_lock); 2458 return -ETXTBSY; 2459 } 2460 atomic_set(&inode->i_writecount, -1); 2461 spin_unlock(&inode->i_lock); 2462 2463 return 0; 2464 } 2465 2466 static int set_bitmap_file(mddev_t *mddev, int fd) 2467 { 2468 int err; 2469 2470 if (mddev->pers) { 2471 if (!mddev->pers->quiesce) 2472 return -EBUSY; 2473 if (mddev->recovery || mddev->sync_thread) 2474 return -EBUSY; 2475 /* we should be able to change the bitmap.. 
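		 * the checks above make sure the personality can quiesce I/O
		 * and that no resync is currently running, which is all that
		 * the bitmap_create()/bitmap_destroy() calls further down need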
*/ 2476 } 2477 2478 2479 if (fd >= 0) { 2480 if (mddev->bitmap) 2481 return -EEXIST; /* cannot add when bitmap is present */ 2482 mddev->bitmap_file = fget(fd); 2483 2484 if (mddev->bitmap_file == NULL) { 2485 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2486 mdname(mddev)); 2487 return -EBADF; 2488 } 2489 2490 err = deny_bitmap_write_access(mddev->bitmap_file); 2491 if (err) { 2492 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2493 mdname(mddev)); 2494 fput(mddev->bitmap_file); 2495 mddev->bitmap_file = NULL; 2496 return err; 2497 } 2498 mddev->bitmap_offset = 0; /* file overrides offset */ 2499 } else if (mddev->bitmap == NULL) 2500 return -ENOENT; /* cannot remove what isn't there */ 2501 err = 0; 2502 if (mddev->pers) { 2503 mddev->pers->quiesce(mddev, 1); 2504 if (fd >= 0) 2505 err = bitmap_create(mddev); 2506 if (fd < 0 || err) 2507 bitmap_destroy(mddev); 2508 mddev->pers->quiesce(mddev, 0); 2509 } else if (fd < 0) { 2510 if (mddev->bitmap_file) 2511 fput(mddev->bitmap_file); 2512 mddev->bitmap_file = NULL; 2513 } 2514 2515 return err; 2516 } 2517 2518 /* 2519 * set_array_info is used in two different ways. 2520 * The original usage is when creating a new array. 2521 * In this usage, raid_disks is > 0 and it together with 2522 * level, size, not_persistent, layout, chunksize determine the 2523 * shape of the array. 2524 * This will always create an array with a type-0.90.0 superblock. 2525 * The newer usage is when assembling an array. 2526 * In this case raid_disks will be 0, and the major_version field is 2527 * used to determine which style super-blocks are to be found on the devices. 2528 * The minor and patch _version numbers are also kept in case the 2529 * super_block handler wishes to interpret them. 2530 */ 2531 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2532 { 2533 2534 if (info->raid_disks == 0) { 2535 /* just setting version number for superblock loading */ 2536 if (info->major_version < 0 || 2537 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2538 super_types[info->major_version].name == NULL) { 2539 /* maybe try to auto-load a module? */ 2540 printk(KERN_INFO 2541 "md: superblock version %d not known\n", 2542 info->major_version); 2543 return -EINVAL; 2544 } 2545 mddev->major_version = info->major_version; 2546 mddev->minor_version = info->minor_version; 2547 mddev->patch_version = info->patch_version; 2548 return 0; 2549 } 2550 mddev->major_version = MD_MAJOR_VERSION; 2551 mddev->minor_version = MD_MINOR_VERSION; 2552 mddev->patch_version = MD_PATCHLEVEL_VERSION; 2553 mddev->ctime = get_seconds(); 2554 2555 mddev->level = info->level; 2556 mddev->size = info->size; 2557 mddev->raid_disks = info->raid_disks; 2558 /* don't set md_minor, it is determined by which /dev/md* was 2559 * opened 2560 */ 2561 if (info->state & (1<<MD_SB_CLEAN)) 2562 mddev->recovery_cp = MaxSector; 2563 else 2564 mddev->recovery_cp = 0; 2565 mddev->persistent = ! info->not_persistent; 2566 2567 mddev->layout = info->layout; 2568 mddev->chunk_size = info->chunk_size; 2569 2570 mddev->max_disks = MD_SB_DISKS; 2571 2572 mddev->sb_dirty = 1; 2573 2574 /* 2575 * Generate a 128 bit UUID 2576 */ 2577 get_random_bytes(mddev->uuid, 16); 2578 2579 return 0; 2580 } 2581 2582 /* 2583 * update_array_info is used to change the configuration of an 2584 * on-line array. 2585 * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size 2586 * fields in the info are checked against the array.
2587 * Any differences that cannot be handled will cause an error. 2588 * Normally, only one change can be managed at a time. 2589 */ 2590 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2591 { 2592 int rv = 0; 2593 int cnt = 0; 2594 int state = 0; 2595 2596 /* calculate expected state,ignoring low bits */ 2597 if (mddev->bitmap && mddev->bitmap_offset) 2598 state |= (1 << MD_SB_BITMAP_PRESENT); 2599 2600 if (mddev->major_version != info->major_version || 2601 mddev->minor_version != info->minor_version || 2602 /* mddev->patch_version != info->patch_version || */ 2603 mddev->ctime != info->ctime || 2604 mddev->level != info->level || 2605 /* mddev->layout != info->layout || */ 2606 !mddev->persistent != info->not_persistent|| 2607 mddev->chunk_size != info->chunk_size || 2608 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 2609 ((state^info->state) & 0xfffffe00) 2610 ) 2611 return -EINVAL; 2612 /* Check there is only one change */ 2613 if (mddev->size != info->size) cnt++; 2614 if (mddev->raid_disks != info->raid_disks) cnt++; 2615 if (mddev->layout != info->layout) cnt++; 2616 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 2617 if (cnt == 0) return 0; 2618 if (cnt > 1) return -EINVAL; 2619 2620 if (mddev->layout != info->layout) { 2621 /* Change layout 2622 * we don't need to do anything at the md level, the 2623 * personality will take care of it all. 2624 */ 2625 if (mddev->pers->reconfig == NULL) 2626 return -EINVAL; 2627 else 2628 return mddev->pers->reconfig(mddev, info->layout, -1); 2629 } 2630 if (mddev->size != info->size) { 2631 mdk_rdev_t * rdev; 2632 struct list_head *tmp; 2633 if (mddev->pers->resize == NULL) 2634 return -EINVAL; 2635 /* The "size" is the amount of each device that is used. 2636 * This can only make sense for arrays with redundancy. 2637 * linear and raid0 always use whatever space is available 2638 * We can only consider changing the size if no resync 2639 * or reconstruction is happening, and if the new size 2640 * is acceptable. It must fit before the sb_offset or, 2641 * if that is <data_offset, it must fit before the 2642 * size of each device. 2643 * If size is zero, we find the largest size that fits. 
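 * ("size" is given in kilobytes while device space is counted in
 * 512-byte sectors, hence the avail/2 and info->size << 1 conversions
 * in the loop below; for each device the usable range runs from
 * data_offset up to the superblock when the superblock sits past the
 * data area, or up to the end of the device otherwise.)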
2644 */ 2645 if (mddev->sync_thread) 2646 return -EBUSY; 2647 ITERATE_RDEV(mddev,rdev,tmp) { 2648 sector_t avail; 2649 int fit = (info->size == 0); 2650 if (rdev->sb_offset > rdev->data_offset) 2651 avail = (rdev->sb_offset*2) - rdev->data_offset; 2652 else 2653 avail = get_capacity(rdev->bdev->bd_disk) 2654 - rdev->data_offset; 2655 if (fit && (info->size == 0 || info->size > avail/2)) 2656 info->size = avail/2; 2657 if (avail < ((sector_t)info->size << 1)) 2658 return -ENOSPC; 2659 } 2660 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 2661 if (!rv) { 2662 struct block_device *bdev; 2663 2664 bdev = bdget_disk(mddev->gendisk, 0); 2665 if (bdev) { 2666 down(&bdev->bd_inode->i_sem); 2667 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2668 up(&bdev->bd_inode->i_sem); 2669 bdput(bdev); 2670 } 2671 } 2672 } 2673 if (mddev->raid_disks != info->raid_disks) { 2674 /* change the number of raid disks */ 2675 if (mddev->pers->reshape == NULL) 2676 return -EINVAL; 2677 if (info->raid_disks <= 0 || 2678 info->raid_disks >= mddev->max_disks) 2679 return -EINVAL; 2680 if (mddev->sync_thread) 2681 return -EBUSY; 2682 rv = mddev->pers->reshape(mddev, info->raid_disks); 2683 if (!rv) { 2684 struct block_device *bdev; 2685 2686 bdev = bdget_disk(mddev->gendisk, 0); 2687 if (bdev) { 2688 down(&bdev->bd_inode->i_sem); 2689 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2690 up(&bdev->bd_inode->i_sem); 2691 bdput(bdev); 2692 } 2693 } 2694 } 2695 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 2696 if (mddev->pers->quiesce == NULL) 2697 return -EINVAL; 2698 if (mddev->recovery || mddev->sync_thread) 2699 return -EBUSY; 2700 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 2701 /* add the bitmap */ 2702 if (mddev->bitmap) 2703 return -EEXIST; 2704 if (mddev->default_bitmap_offset == 0) 2705 return -EINVAL; 2706 mddev->bitmap_offset = mddev->default_bitmap_offset; 2707 mddev->pers->quiesce(mddev, 1); 2708 rv = bitmap_create(mddev); 2709 if (rv) 2710 bitmap_destroy(mddev); 2711 mddev->pers->quiesce(mddev, 0); 2712 } else { 2713 /* remove the bitmap */ 2714 if (!mddev->bitmap) 2715 return -ENOENT; 2716 if (mddev->bitmap->file) 2717 return -EINVAL; 2718 mddev->pers->quiesce(mddev, 1); 2719 bitmap_destroy(mddev); 2720 mddev->pers->quiesce(mddev, 0); 2721 mddev->bitmap_offset = 0; 2722 } 2723 } 2724 md_update_sb(mddev); 2725 return rv; 2726 } 2727 2728 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 2729 { 2730 mdk_rdev_t *rdev; 2731 2732 if (mddev->pers == NULL) 2733 return -ENODEV; 2734 2735 rdev = find_rdev(mddev, dev); 2736 if (!rdev) 2737 return -ENODEV; 2738 2739 md_error(mddev, rdev); 2740 return 0; 2741 } 2742 2743 static int md_ioctl(struct inode *inode, struct file *file, 2744 unsigned int cmd, unsigned long arg) 2745 { 2746 int err = 0; 2747 void __user *argp = (void __user *)arg; 2748 struct hd_geometry __user *loc = argp; 2749 mddev_t *mddev = NULL; 2750 2751 if (!capable(CAP_SYS_ADMIN)) 2752 return -EACCES; 2753 2754 /* 2755 * Commands dealing with the RAID driver but not any 2756 * particular array: 2757 */ 2758 switch (cmd) 2759 { 2760 case RAID_VERSION: 2761 err = get_version(argp); 2762 goto done; 2763 2764 case PRINT_RAID_DEBUG: 2765 err = 0; 2766 md_print_devices(); 2767 goto done; 2768 2769 #ifndef MODULE 2770 case RAID_AUTORUN: 2771 err = 0; 2772 autostart_arrays(arg); 2773 goto done; 2774 #endif 2775 default:; 2776 } 2777 2778 /* 2779 * Commands creating/starting a new array: 2780 */ 2781 2782 mddev = inode->i_bdev->bd_disk->private_data; 2783 2784 if 
(!mddev) { 2785 BUG(); 2786 goto abort; 2787 } 2788 2789 2790 if (cmd == START_ARRAY) { 2791 /* START_ARRAY doesn't need to lock the array as autostart_array 2792 * does the locking, and it could even be a different array 2793 */ 2794 static int cnt = 3; 2795 if (cnt > 0 ) { 2796 printk(KERN_WARNING 2797 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 2798 "This will not be supported beyond 2.6\n", 2799 current->comm, current->pid); 2800 cnt--; 2801 } 2802 err = autostart_array(new_decode_dev(arg)); 2803 if (err) { 2804 printk(KERN_WARNING "md: autostart failed!\n"); 2805 goto abort; 2806 } 2807 goto done; 2808 } 2809 2810 err = mddev_lock(mddev); 2811 if (err) { 2812 printk(KERN_INFO 2813 "md: ioctl lock interrupted, reason %d, cmd %d\n", 2814 err, cmd); 2815 goto abort; 2816 } 2817 2818 switch (cmd) 2819 { 2820 case SET_ARRAY_INFO: 2821 { 2822 mdu_array_info_t info; 2823 if (!arg) 2824 memset(&info, 0, sizeof(info)); 2825 else if (copy_from_user(&info, argp, sizeof(info))) { 2826 err = -EFAULT; 2827 goto abort_unlock; 2828 } 2829 if (mddev->pers) { 2830 err = update_array_info(mddev, &info); 2831 if (err) { 2832 printk(KERN_WARNING "md: couldn't update" 2833 " array info. %d\n", err); 2834 goto abort_unlock; 2835 } 2836 goto done_unlock; 2837 } 2838 if (!list_empty(&mddev->disks)) { 2839 printk(KERN_WARNING 2840 "md: array %s already has disks!\n", 2841 mdname(mddev)); 2842 err = -EBUSY; 2843 goto abort_unlock; 2844 } 2845 if (mddev->raid_disks) { 2846 printk(KERN_WARNING 2847 "md: array %s already initialised!\n", 2848 mdname(mddev)); 2849 err = -EBUSY; 2850 goto abort_unlock; 2851 } 2852 err = set_array_info(mddev, &info); 2853 if (err) { 2854 printk(KERN_WARNING "md: couldn't set" 2855 " array info. %d\n", err); 2856 goto abort_unlock; 2857 } 2858 } 2859 goto done_unlock; 2860 2861 default:; 2862 } 2863 2864 /* 2865 * Commands querying/configuring an existing array: 2866 */ 2867 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 2868 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 2869 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 2870 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 2871 err = -ENODEV; 2872 goto abort_unlock; 2873 } 2874 2875 /* 2876 * Commands even a read-only array can execute: 2877 */ 2878 switch (cmd) 2879 { 2880 case GET_ARRAY_INFO: 2881 err = get_array_info(mddev, argp); 2882 goto done_unlock; 2883 2884 case GET_BITMAP_FILE: 2885 err = get_bitmap_file(mddev, argp); 2886 goto done_unlock; 2887 2888 case GET_DISK_INFO: 2889 err = get_disk_info(mddev, argp); 2890 goto done_unlock; 2891 2892 case RESTART_ARRAY_RW: 2893 err = restart_array(mddev); 2894 goto done_unlock; 2895 2896 case STOP_ARRAY: 2897 err = do_md_stop (mddev, 0); 2898 goto done_unlock; 2899 2900 case STOP_ARRAY_RO: 2901 err = do_md_stop (mddev, 1); 2902 goto done_unlock; 2903 2904 /* 2905 * We have a problem here : there is no easy way to give a CHS 2906 * virtual geometry. We currently pretend that we have a 2 heads 2907 * 4 sectors (with a BIG number of cylinders...). This drives 2908 * dosfs just mad... 
;-) 2909 */ 2910 case HDIO_GETGEO: 2911 if (!loc) { 2912 err = -EINVAL; 2913 goto abort_unlock; 2914 } 2915 err = put_user (2, (char __user *) &loc->heads); 2916 if (err) 2917 goto abort_unlock; 2918 err = put_user (4, (char __user *) &loc->sectors); 2919 if (err) 2920 goto abort_unlock; 2921 err = put_user(get_capacity(mddev->gendisk)/8, 2922 (short __user *) &loc->cylinders); 2923 if (err) 2924 goto abort_unlock; 2925 err = put_user (get_start_sect(inode->i_bdev), 2926 (long __user *) &loc->start); 2927 goto done_unlock; 2928 } 2929 2930 /* 2931 * The remaining ioctls are changing the state of the 2932 * superblock, so we do not allow read-only arrays 2933 * here: 2934 */ 2935 if (mddev->ro) { 2936 err = -EROFS; 2937 goto abort_unlock; 2938 } 2939 2940 switch (cmd) 2941 { 2942 case ADD_NEW_DISK: 2943 { 2944 mdu_disk_info_t info; 2945 if (copy_from_user(&info, argp, sizeof(info))) 2946 err = -EFAULT; 2947 else 2948 err = add_new_disk(mddev, &info); 2949 goto done_unlock; 2950 } 2951 2952 case HOT_REMOVE_DISK: 2953 err = hot_remove_disk(mddev, new_decode_dev(arg)); 2954 goto done_unlock; 2955 2956 case HOT_ADD_DISK: 2957 err = hot_add_disk(mddev, new_decode_dev(arg)); 2958 goto done_unlock; 2959 2960 case SET_DISK_FAULTY: 2961 err = set_disk_faulty(mddev, new_decode_dev(arg)); 2962 goto done_unlock; 2963 2964 case RUN_ARRAY: 2965 err = do_md_run (mddev); 2966 goto done_unlock; 2967 2968 case SET_BITMAP_FILE: 2969 err = set_bitmap_file(mddev, (int)arg); 2970 goto done_unlock; 2971 2972 default: 2973 if (_IOC_TYPE(cmd) == MD_MAJOR) 2974 printk(KERN_WARNING "md: %s(pid %d) used" 2975 " obsolete MD ioctl, upgrade your" 2976 " software to use new ioctls.\n", 2977 current->comm, current->pid); 2978 err = -EINVAL; 2979 goto abort_unlock; 2980 } 2981 2982 done_unlock: 2983 abort_unlock: 2984 mddev_unlock(mddev); 2985 2986 return err; 2987 done: 2988 if (err) 2989 MD_BUG(); 2990 abort: 2991 return err; 2992 } 2993 2994 static int md_open(struct inode *inode, struct file *file) 2995 { 2996 /* 2997 * Succeed if we can lock the mddev, which confirms that 2998 * it isn't being stopped right now. 2999 */ 3000 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3001 int err; 3002 3003 if ((err = mddev_lock(mddev))) 3004 goto out; 3005 3006 err = 0; 3007 mddev_get(mddev); 3008 mddev_unlock(mddev); 3009 3010 check_disk_change(inode->i_bdev); 3011 out: 3012 return err; 3013 } 3014 3015 static int md_release(struct inode *inode, struct file * file) 3016 { 3017 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 3018 3019 if (!mddev) 3020 BUG(); 3021 mddev_put(mddev); 3022 3023 return 0; 3024 } 3025 3026 static int md_media_changed(struct gendisk *disk) 3027 { 3028 mddev_t *mddev = disk->private_data; 3029 3030 return mddev->changed; 3031 } 3032 3033 static int md_revalidate(struct gendisk *disk) 3034 { 3035 mddev_t *mddev = disk->private_data; 3036 3037 mddev->changed = 0; 3038 return 0; 3039 } 3040 static struct block_device_operations md_fops = 3041 { 3042 .owner = THIS_MODULE, 3043 .open = md_open, 3044 .release = md_release, 3045 .ioctl = md_ioctl, 3046 .media_changed = md_media_changed, 3047 .revalidate_disk= md_revalidate, 3048 }; 3049 3050 static int md_thread(void * arg) 3051 { 3052 mdk_thread_t *thread = arg; 3053 3054 /* 3055 * md_thread is a 'system-thread', its priority should be very 3056 * high. We avoid resource deadlocks individually in each 3057 * raid personality.
(RAID5 does preallocation) We also use RR and 3058 * the very same RT priority as kswapd, thus we will never get 3059 * into a priority inversion deadlock. 3060 * 3061 * we definitely have to have equal or higher priority than 3062 * bdflush, otherwise bdflush will deadlock if there are too 3063 * many dirty RAID5 blocks. 3064 */ 3065 3066 allow_signal(SIGKILL); 3067 complete(thread->event); 3068 while (!kthread_should_stop()) { 3069 void (*run)(mddev_t *); 3070 3071 wait_event_interruptible_timeout(thread->wqueue, 3072 test_bit(THREAD_WAKEUP, &thread->flags) 3073 || kthread_should_stop(), 3074 thread->timeout); 3075 try_to_freeze(); 3076 3077 clear_bit(THREAD_WAKEUP, &thread->flags); 3078 3079 run = thread->run; 3080 if (run) 3081 run(thread->mddev); 3082 } 3083 3084 return 0; 3085 } 3086 3087 void md_wakeup_thread(mdk_thread_t *thread) 3088 { 3089 if (thread) { 3090 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 3091 set_bit(THREAD_WAKEUP, &thread->flags); 3092 wake_up(&thread->wqueue); 3093 } 3094 } 3095 3096 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3097 const char *name) 3098 { 3099 mdk_thread_t *thread; 3100 struct completion event; 3101 3102 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3103 if (!thread) 3104 return NULL; 3105 3106 memset(thread, 0, sizeof(mdk_thread_t)); 3107 init_waitqueue_head(&thread->wqueue); 3108 3109 init_completion(&event); 3110 thread->event = &event; 3111 thread->run = run; 3112 thread->mddev = mddev; 3113 thread->name = name; 3114 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3115 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 3116 if (IS_ERR(thread->tsk)) { 3117 kfree(thread); 3118 return NULL; 3119 } 3120 wait_for_completion(&event); 3121 return thread; 3122 } 3123 3124 void md_unregister_thread(mdk_thread_t *thread) 3125 { 3126 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3127 3128 kthread_stop(thread->tsk); 3129 kfree(thread); 3130 } 3131 3132 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3133 { 3134 if (!mddev) { 3135 MD_BUG(); 3136 return; 3137 } 3138 3139 if (!rdev || rdev->faulty) 3140 return; 3141 /* 3142 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3143 mdname(mddev), 3144 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3145 __builtin_return_address(0),__builtin_return_address(1), 3146 __builtin_return_address(2),__builtin_return_address(3)); 3147 */ 3148 if (!mddev->pers->error_handler) 3149 return; 3150 mddev->pers->error_handler(mddev,rdev); 3151 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3152 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3153 md_wakeup_thread(mddev->thread); 3154 } 3155 3156 /* seq_file implementation /proc/mdstat */ 3157 3158 static void status_unused(struct seq_file *seq) 3159 { 3160 int i = 0; 3161 mdk_rdev_t *rdev; 3162 struct list_head *tmp; 3163 3164 seq_printf(seq, "unused devices: "); 3165 3166 ITERATE_RDEV_PENDING(rdev,tmp) { 3167 char b[BDEVNAME_SIZE]; 3168 i++; 3169 seq_printf(seq, "%s ", 3170 bdevname(rdev->bdev,b)); 3171 } 3172 if (!i) 3173 seq_printf(seq, "<none>"); 3174 3175 seq_printf(seq, "\n"); 3176 } 3177 3178 3179 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3180 { 3181 unsigned long max_blocks, resync, res, dt, db, rt; 3182 3183 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3184 3185 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3186 max_blocks = mddev->resync_max_sectors >> 1; 3187 else 3188 max_blocks = mddev->size; 3189 3190 /* 
3191 * Should not happen. 3192 */ 3193 if (!max_blocks) { 3194 MD_BUG(); 3195 return; 3196 } 3197 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3198 { 3199 int i, x = res/50, y = 20-x; 3200 seq_printf(seq, "["); 3201 for (i = 0; i < x; i++) 3202 seq_printf(seq, "="); 3203 seq_printf(seq, ">"); 3204 for (i = 0; i < y; i++) 3205 seq_printf(seq, "."); 3206 seq_printf(seq, "] "); 3207 } 3208 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3209 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3210 "resync" : "recovery"), 3211 res/10, res % 10, resync, max_blocks); 3212 3213 /* 3214 * We do not want to overflow, so the order of operands and 3215 * the * 100 / 100 trick are important. We do a +1 to be 3216 * safe against division by zero. We only estimate anyway. 3217 * 3218 * dt: time from mark until now 3219 * db: blocks written from mark until now 3220 * rt: remaining time 3221 */ 3222 dt = ((jiffies - mddev->resync_mark) / HZ); 3223 if (!dt) dt++; 3224 db = resync - (mddev->resync_mark_cnt/2); 3225 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3226 3227 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3228 3229 seq_printf(seq, " speed=%ldK/sec", db/dt); 3230 } 3231 3232 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3233 { 3234 struct list_head *tmp; 3235 loff_t l = *pos; 3236 mddev_t *mddev; 3237 3238 if (l >= 0x10000) 3239 return NULL; 3240 if (!l--) 3241 /* header */ 3242 return (void*)1; 3243 3244 spin_lock(&all_mddevs_lock); 3245 list_for_each(tmp,&all_mddevs) 3246 if (!l--) { 3247 mddev = list_entry(tmp, mddev_t, all_mddevs); 3248 mddev_get(mddev); 3249 spin_unlock(&all_mddevs_lock); 3250 return mddev; 3251 } 3252 spin_unlock(&all_mddevs_lock); 3253 if (!l--) 3254 return (void*)2;/* tail */ 3255 return NULL; 3256 } 3257 3258 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3259 { 3260 struct list_head *tmp; 3261 mddev_t *next_mddev, *mddev = v; 3262 3263 ++*pos; 3264 if (v == (void*)2) 3265 return NULL; 3266 3267 spin_lock(&all_mddevs_lock); 3268 if (v == (void*)1) 3269 tmp = all_mddevs.next; 3270 else 3271 tmp = mddev->all_mddevs.next; 3272 if (tmp != &all_mddevs) 3273 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3274 else { 3275 next_mddev = (void*)2; 3276 *pos = 0x10000; 3277 } 3278 spin_unlock(&all_mddevs_lock); 3279 3280 if (v != (void*)1) 3281 mddev_put(mddev); 3282 return next_mddev; 3283 3284 } 3285 3286 static void md_seq_stop(struct seq_file *seq, void *v) 3287 { 3288 mddev_t *mddev = v; 3289 3290 if (mddev && v != (void*)1 && v != (void*)2) 3291 mddev_put(mddev); 3292 } 3293 3294 static int md_seq_show(struct seq_file *seq, void *v) 3295 { 3296 mddev_t *mddev = v; 3297 sector_t size; 3298 struct list_head *tmp2; 3299 mdk_rdev_t *rdev; 3300 int i; 3301 struct bitmap *bitmap; 3302 3303 if (v == (void*)1) { 3304 seq_printf(seq, "Personalities : "); 3305 spin_lock(&pers_lock); 3306 for (i = 0; i < MAX_PERSONALITY; i++) 3307 if (pers[i]) 3308 seq_printf(seq, "[%s] ", pers[i]->name); 3309 3310 spin_unlock(&pers_lock); 3311 seq_printf(seq, "\n"); 3312 return 0; 3313 } 3314 if (v == (void*)2) { 3315 status_unused(seq); 3316 return 0; 3317 } 3318 3319 if (mddev_lock(mddev)!=0) 3320 return -EINTR; 3321 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3322 seq_printf(seq, "%s : %sactive", mdname(mddev), 3323 mddev->pers ? 
"" : "in"); 3324 if (mddev->pers) { 3325 if (mddev->ro) 3326 seq_printf(seq, " (read-only)"); 3327 seq_printf(seq, " %s", mddev->pers->name); 3328 } 3329 3330 size = 0; 3331 ITERATE_RDEV(mddev,rdev,tmp2) { 3332 char b[BDEVNAME_SIZE]; 3333 seq_printf(seq, " %s[%d]", 3334 bdevname(rdev->bdev,b), rdev->desc_nr); 3335 if (test_bit(WriteMostly, &rdev->flags)) 3336 seq_printf(seq, "(W)"); 3337 if (rdev->faulty) { 3338 seq_printf(seq, "(F)"); 3339 continue; 3340 } else if (rdev->raid_disk < 0) 3341 seq_printf(seq, "(S)"); /* spare */ 3342 size += rdev->size; 3343 } 3344 3345 if (!list_empty(&mddev->disks)) { 3346 if (mddev->pers) 3347 seq_printf(seq, "\n %llu blocks", 3348 (unsigned long long)mddev->array_size); 3349 else 3350 seq_printf(seq, "\n %llu blocks", 3351 (unsigned long long)size); 3352 } 3353 if (mddev->persistent) { 3354 if (mddev->major_version != 0 || 3355 mddev->minor_version != 90) { 3356 seq_printf(seq," super %d.%d", 3357 mddev->major_version, 3358 mddev->minor_version); 3359 } 3360 } else 3361 seq_printf(seq, " super non-persistent"); 3362 3363 if (mddev->pers) { 3364 mddev->pers->status (seq, mddev); 3365 seq_printf(seq, "\n "); 3366 if (mddev->curr_resync > 2) { 3367 status_resync (seq, mddev); 3368 seq_printf(seq, "\n "); 3369 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3370 seq_printf(seq, " resync=DELAYED\n "); 3371 } else 3372 seq_printf(seq, "\n "); 3373 3374 if ((bitmap = mddev->bitmap)) { 3375 unsigned long chunk_kb; 3376 unsigned long flags; 3377 spin_lock_irqsave(&bitmap->lock, flags); 3378 chunk_kb = bitmap->chunksize >> 10; 3379 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3380 "%lu%s chunk", 3381 bitmap->pages - bitmap->missing_pages, 3382 bitmap->pages, 3383 (bitmap->pages - bitmap->missing_pages) 3384 << (PAGE_SHIFT - 10), 3385 chunk_kb ? chunk_kb : bitmap->chunksize, 3386 chunk_kb ? 
"KB" : "B"); 3387 if (bitmap->file) { 3388 seq_printf(seq, ", file: "); 3389 seq_path(seq, bitmap->file->f_vfsmnt, 3390 bitmap->file->f_dentry," \t\n"); 3391 } 3392 3393 seq_printf(seq, "\n"); 3394 spin_unlock_irqrestore(&bitmap->lock, flags); 3395 } 3396 3397 seq_printf(seq, "\n"); 3398 } 3399 mddev_unlock(mddev); 3400 3401 return 0; 3402 } 3403 3404 static struct seq_operations md_seq_ops = { 3405 .start = md_seq_start, 3406 .next = md_seq_next, 3407 .stop = md_seq_stop, 3408 .show = md_seq_show, 3409 }; 3410 3411 static int md_seq_open(struct inode *inode, struct file *file) 3412 { 3413 int error; 3414 3415 error = seq_open(file, &md_seq_ops); 3416 return error; 3417 } 3418 3419 static struct file_operations md_seq_fops = { 3420 .open = md_seq_open, 3421 .read = seq_read, 3422 .llseek = seq_lseek, 3423 .release = seq_release, 3424 }; 3425 3426 int register_md_personality(int pnum, mdk_personality_t *p) 3427 { 3428 if (pnum >= MAX_PERSONALITY) { 3429 printk(KERN_ERR 3430 "md: tried to install personality %s as nr %d, but max is %lu\n", 3431 p->name, pnum, MAX_PERSONALITY-1); 3432 return -EINVAL; 3433 } 3434 3435 spin_lock(&pers_lock); 3436 if (pers[pnum]) { 3437 spin_unlock(&pers_lock); 3438 return -EBUSY; 3439 } 3440 3441 pers[pnum] = p; 3442 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3443 spin_unlock(&pers_lock); 3444 return 0; 3445 } 3446 3447 int unregister_md_personality(int pnum) 3448 { 3449 if (pnum >= MAX_PERSONALITY) 3450 return -EINVAL; 3451 3452 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3453 spin_lock(&pers_lock); 3454 pers[pnum] = NULL; 3455 spin_unlock(&pers_lock); 3456 return 0; 3457 } 3458 3459 static int is_mddev_idle(mddev_t *mddev) 3460 { 3461 mdk_rdev_t * rdev; 3462 struct list_head *tmp; 3463 int idle; 3464 unsigned long curr_events; 3465 3466 idle = 1; 3467 ITERATE_RDEV(mddev,rdev,tmp) { 3468 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3469 curr_events = disk_stat_read(disk, sectors[0]) + 3470 disk_stat_read(disk, sectors[1]) - 3471 atomic_read(&disk->sync_io); 3472 /* Allow some slack between valud of curr_events and last_events, 3473 * as there are some uninteresting races. 3474 * Note: the following is an unsigned comparison. 3475 */ 3476 if ((curr_events - rdev->last_events + 32) > 64) { 3477 rdev->last_events = curr_events; 3478 idle = 0; 3479 } 3480 } 3481 return idle; 3482 } 3483 3484 void md_done_sync(mddev_t *mddev, int blocks, int ok) 3485 { 3486 /* another "blocks" (512byte) blocks have been synced */ 3487 atomic_sub(blocks, &mddev->recovery_active); 3488 wake_up(&mddev->recovery_wait); 3489 if (!ok) { 3490 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3491 md_wakeup_thread(mddev->thread); 3492 // stop recovery, signal do_sync .... 3493 } 3494 } 3495 3496 3497 /* md_write_start(mddev, bi) 3498 * If we need to update some array metadata (e.g. 'active' flag 3499 * in superblock) before writing, schedule a superblock update 3500 * and wait for it to complete. 
3501 */ 3502 void md_write_start(mddev_t *mddev, struct bio *bi) 3503 { 3504 if (bio_data_dir(bi) != WRITE) 3505 return; 3506 3507 atomic_inc(&mddev->writes_pending); 3508 if (mddev->in_sync) { 3509 spin_lock(&mddev->write_lock); 3510 if (mddev->in_sync) { 3511 mddev->in_sync = 0; 3512 mddev->sb_dirty = 1; 3513 md_wakeup_thread(mddev->thread); 3514 } 3515 spin_unlock(&mddev->write_lock); 3516 } 3517 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 3518 } 3519 3520 void md_write_end(mddev_t *mddev) 3521 { 3522 if (atomic_dec_and_test(&mddev->writes_pending)) { 3523 if (mddev->safemode == 2) 3524 md_wakeup_thread(mddev->thread); 3525 else 3526 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 3527 } 3528 } 3529 3530 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3531 3532 #define SYNC_MARKS 10 3533 #define SYNC_MARK_STEP (3*HZ) 3534 static void md_do_sync(mddev_t *mddev) 3535 { 3536 mddev_t *mddev2; 3537 unsigned int currspeed = 0, 3538 window; 3539 sector_t max_sectors,j, io_sectors; 3540 unsigned long mark[SYNC_MARKS]; 3541 sector_t mark_cnt[SYNC_MARKS]; 3542 int last_mark,m; 3543 struct list_head *tmp; 3544 sector_t last_check; 3545 int skipped = 0; 3546 3547 /* just in case thread restarts... */ 3548 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3549 return; 3550 3551 /* we overload curr_resync somewhat here. 3552 * 0 == not engaged in resync at all 3553 * 2 == checking that there is no conflict with another sync 3554 * 1 == like 2, but have yielded to allow conflicting resync to 3555 * commence 3556 * other == active in resync - this many blocks 3557 * 3558 * Before starting a resync we must have set curr_resync to 3559 * 2, and then checked that every "conflicting" array has curr_resync 3560 * less than ours. When we find one that is the same or higher 3561 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 3562 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 3563 * This will mean we have to start checking from the beginning again.
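 * Only the mddev with the lower address ever yields, so two arrays
 * that share devices can never both drop to 1 and sit waiting on each
 * other; one of the two always gets to proceed.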
3564 * 3565 */ 3566 3567 do { 3568 mddev->curr_resync = 2; 3569 3570 try_again: 3571 if (signal_pending(current) || 3572 kthread_should_stop()) { 3573 flush_signals(current); 3574 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3575 goto skip; 3576 } 3577 ITERATE_MDDEV(mddev2,tmp) { 3578 if (mddev2 == mddev) 3579 continue; 3580 if (mddev2->curr_resync && 3581 match_mddev_units(mddev,mddev2)) { 3582 DEFINE_WAIT(wq); 3583 if (mddev < mddev2 && mddev->curr_resync == 2) { 3584 /* arbitrarily yield */ 3585 mddev->curr_resync = 1; 3586 wake_up(&resync_wait); 3587 } 3588 if (mddev > mddev2 && mddev->curr_resync == 1) 3589 /* no need to wait here, we can wait the next 3590 * time 'round when curr_resync == 2 3591 */ 3592 continue; 3593 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 3594 if (!signal_pending(current) && 3595 !kthread_should_stop() && 3596 mddev2->curr_resync >= mddev->curr_resync) { 3597 printk(KERN_INFO "md: delaying resync of %s" 3598 " until %s has finished resync (they" 3599 " share one or more physical units)\n", 3600 mdname(mddev), mdname(mddev2)); 3601 mddev_put(mddev2); 3602 schedule(); 3603 finish_wait(&resync_wait, &wq); 3604 goto try_again; 3605 } 3606 finish_wait(&resync_wait, &wq); 3607 } 3608 } 3609 } while (mddev->curr_resync < 2); 3610 3611 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3612 /* resync follows the size requested by the personality, 3613 * which defaults to physical size, but can be virtual size 3614 */ 3615 max_sectors = mddev->resync_max_sectors; 3616 else 3617 /* recovery follows the physical size of devices */ 3618 max_sectors = mddev->size << 1; 3619 3620 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3621 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3622 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3623 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 3624 "(but not more than %d KB/sec) for reconstruction.\n", 3625 sysctl_speed_limit_max); 3626 3627 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3628 /* we don't use the checkpoint if there's a bitmap */ 3629 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) 3630 j = mddev->recovery_cp; 3631 else 3632 j = 0; 3633 io_sectors = 0; 3634 for (m = 0; m < SYNC_MARKS; m++) { 3635 mark[m] = jiffies; 3636 mark_cnt[m] = io_sectors; 3637 } 3638 last_mark = 0; 3639 mddev->resync_mark = mark[last_mark]; 3640 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3641 3642 /* 3643 * Tune reconstruction: 3644 */ 3645 window = 32*(PAGE_SIZE/512); 3646 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 3647 window/2,(unsigned long long) max_sectors/2); 3648 3649 atomic_set(&mddev->recovery_active, 0); 3650 init_waitqueue_head(&mddev->recovery_wait); 3651 last_check = 0; 3652 3653 if (j>2) { 3654 printk(KERN_INFO 3655 "md: resuming recovery of %s from checkpoint.\n", 3656 mdname(mddev)); 3657 mddev->curr_resync = j; 3658 } 3659 3660 while (j < max_sectors) { 3661 sector_t sectors; 3662 3663 skipped = 0; 3664 sectors = mddev->pers->sync_request(mddev, j, &skipped, 3665 currspeed < sysctl_speed_limit_min); 3666 if (sectors == 0) { 3667 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3668 goto out; 3669 } 3670 3671 if (!skipped) { /* actual IO requested */ 3672 io_sectors += sectors; 3673 atomic_add(sectors, &mddev->recovery_active); 3674 } 3675 3676 j += sectors; 3677 if (j>1) mddev->curr_resync = j; 3678 3679 3680 if (last_check + window > io_sectors || j == max_sectors) 3681 continue; 3682 3683 
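		/*
		 * The code below only runs after at least a full 'window' of
		 * I/O has completed since the previous check: update the
		 * rolling marks used for the speed estimate, honour signals,
		 * and throttle against the speed limits.
		 */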
last_check = io_sectors; 3684 3685 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3686 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3687 break; 3688 3689 repeat: 3690 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 3691 /* step marks */ 3692 int next = (last_mark+1) % SYNC_MARKS; 3693 3694 mddev->resync_mark = mark[next]; 3695 mddev->resync_mark_cnt = mark_cnt[next]; 3696 mark[next] = jiffies; 3697 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 3698 last_mark = next; 3699 } 3700 3701 3702 if (signal_pending(current) || kthread_should_stop()) { 3703 /* 3704 * got a signal, exit. 3705 */ 3706 printk(KERN_INFO 3707 "md: md_do_sync() got signal ... exiting\n"); 3708 flush_signals(current); 3709 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3710 goto out; 3711 } 3712 3713 /* 3714 * this loop exits only if we are either slower than 3715 * the 'hard' speed limit, or the system was IO-idle for 3716 * a jiffy. 3717 * the system might be non-idle CPU-wise, but we only care 3718 * about not overloading the IO subsystem. (things like an 3719 * e2fsck being done on the RAID array should execute fast) 3720 */ 3721 mddev->queue->unplug_fn(mddev->queue); 3722 cond_resched(); 3723 3724 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 3725 /((jiffies-mddev->resync_mark)/HZ +1) +1; 3726 3727 if (currspeed > sysctl_speed_limit_min) { 3728 if ((currspeed > sysctl_speed_limit_max) || 3729 !is_mddev_idle(mddev)) { 3730 msleep_interruptible(250); 3731 goto repeat; 3732 } 3733 } 3734 } 3735 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 3736 /* 3737 * this also signals 'finished resyncing' to md_stop 3738 */ 3739 out: 3740 mddev->queue->unplug_fn(mddev->queue); 3741 3742 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 3743 3744 /* tell personality that we are finished */ 3745 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 3746 3747 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3748 mddev->curr_resync > 2 && 3749 mddev->curr_resync >= mddev->recovery_cp) { 3750 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 3751 printk(KERN_INFO 3752 "md: checkpointing recovery of %s.\n", 3753 mdname(mddev)); 3754 mddev->recovery_cp = mddev->curr_resync; 3755 } else 3756 mddev->recovery_cp = MaxSector; 3757 } 3758 3759 skip: 3760 mddev->curr_resync = 0; 3761 wake_up(&resync_wait); 3762 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 3763 md_wakeup_thread(mddev->thread); 3764 } 3765 3766 3767 /* 3768 * This routine is regularly called by all per-raid-array threads to 3769 * deal with generic issues like resync and super-block update. 3770 * Raid personalities that don't have a thread (linear/raid0) do not 3771 * need this as they never do any recovery or update the superblock. 3772 * 3773 * It does not do any resync itself, but rather "forks" off other threads 3774 * to do that as needed. 3775 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 3776 * "->recovery" and create a thread at ->sync_thread. 3777 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 3778 * and wakes up this thread which will reap the thread and finish up. 3779 * This thread also removes any faulty devices (with nr_pending == 0). 3780 * 3781 * The overall approach is: 3782 * 1/ if the superblock needs updating, update it. 3783 * 2/ If a recovery thread is running, don't do anything else. 3784 * 3/ If recovery has finished, clean up, possibly marking spares active.
3785 * 4/ If there are any faulty devices, remove them. 3786 * 5/ If array is degraded, try to add spare devices. 3787 * 6/ If array has spares or is not in-sync, start a resync thread. 3788 */ 3789 void md_check_recovery(mddev_t *mddev) 3790 { 3791 mdk_rdev_t *rdev; 3792 struct list_head *rtmp; 3793 3794 3795 if (mddev->bitmap) 3796 bitmap_daemon_work(mddev->bitmap); 3797 3798 if (mddev->ro) 3799 return; 3800 3801 if (signal_pending(current)) { 3802 if (mddev->pers->sync_request) { 3803 printk(KERN_INFO "md: %s in immediate safe mode\n", 3804 mdname(mddev)); 3805 mddev->safemode = 2; 3806 } 3807 flush_signals(current); 3808 } 3809 3810 if ( ! ( 3811 mddev->sb_dirty || 3812 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 3813 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 3814 (mddev->safemode == 1) || 3815 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 3816 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 3817 )) 3818 return; 3819 3820 if (mddev_trylock(mddev)==0) { 3821 int spares =0; 3822 3823 spin_lock(&mddev->write_lock); 3824 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 3825 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 3826 mddev->in_sync = 1; 3827 mddev->sb_dirty = 1; 3828 } 3829 if (mddev->safemode == 1) 3830 mddev->safemode = 0; 3831 spin_unlock(&mddev->write_lock); 3832 3833 if (mddev->sb_dirty) 3834 md_update_sb(mddev); 3835 3836 3837 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 3838 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 3839 /* resync/recovery still happening */ 3840 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3841 goto unlock; 3842 } 3843 if (mddev->sync_thread) { 3844 /* resync has finished, collect result */ 3845 md_unregister_thread(mddev->sync_thread); 3846 mddev->sync_thread = NULL; 3847 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3848 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 3849 /* success...*/ 3850 /* activate any spares */ 3851 mddev->pers->spare_active(mddev); 3852 } 3853 md_update_sb(mddev); 3854 3855 /* if array is no longer degraded, then any saved_raid_disk 3856 * information must be scrapped 3857 */ 3858 if (!mddev->degraded) 3859 ITERATE_RDEV(mddev,rdev,rtmp) 3860 rdev->saved_raid_disk = -1; 3861 3862 mddev->recovery = 0; 3863 /* flag recovery needed just to double check */ 3864 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3865 goto unlock; 3866 } 3867 if (mddev->recovery) 3868 /* probably just the RECOVERY_NEEDED flag */ 3869 mddev->recovery = 0; 3870 3871 /* no recovery is running. 3872 * remove any failed drives, then 3873 * add spares if possible. 3874 * Spares are also removed and re-added, to allow 3875 * the personality to fail the re-add. 3876 */ 3877 ITERATE_RDEV(mddev,rdev,rtmp) 3878 if (rdev->raid_disk >= 0 && 3879 (rdev->faulty || ! rdev->in_sync) && 3880 atomic_read(&rdev->nr_pending)==0) { 3881 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) 3882 rdev->raid_disk = -1; 3883 } 3884 3885 if (mddev->degraded) { 3886 ITERATE_RDEV(mddev,rdev,rtmp) 3887 if (rdev->raid_disk < 0 3888 && !rdev->faulty) { 3889 if (mddev->pers->hot_add_disk(mddev,rdev)) 3890 spares++; 3891 else 3892 break; 3893 } 3894 } 3895 3896 if (!spares && (mddev->recovery_cp == MaxSector )) { 3897 /* nothing we can do ... */ 3898 goto unlock; 3899 } 3900 if (mddev->pers->sync_request) { 3901 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3902 if (!spares) 3903 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3904 if (spares && mddev->bitmap && !
mddev->bitmap->file) { 3905 /* We are adding a device or devices to an array 3906 * which has the bitmap stored on all devices. 3907 * So make sure all bitmap pages get written 3908 */ 3909 bitmap_write_all(mddev->bitmap); 3910 } 3911 mddev->sync_thread = md_register_thread(md_do_sync, 3912 mddev, 3913 "%s_resync"); 3914 if (!mddev->sync_thread) { 3915 printk(KERN_ERR "%s: could not start resync" 3916 " thread...\n", 3917 mdname(mddev)); 3918 /* leave the spares where they are, it shouldn't hurt */ 3919 mddev->recovery = 0; 3920 } else { 3921 md_wakeup_thread(mddev->sync_thread); 3922 } 3923 } 3924 unlock: 3925 mddev_unlock(mddev); 3926 } 3927 } 3928 3929 static int md_notify_reboot(struct notifier_block *this, 3930 unsigned long code, void *x) 3931 { 3932 struct list_head *tmp; 3933 mddev_t *mddev; 3934 3935 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 3936 3937 printk(KERN_INFO "md: stopping all md devices.\n"); 3938 3939 ITERATE_MDDEV(mddev,tmp) 3940 if (mddev_trylock(mddev)==0) 3941 do_md_stop (mddev, 1); 3942 /* 3943 * certain more exotic SCSI devices are known to be 3944 * volatile wrt too early system reboots. While the 3945 * right place to handle this issue is the given 3946 * driver, we do want to have a safe RAID driver ... 3947 */ 3948 mdelay(1000*1); 3949 } 3950 return NOTIFY_DONE; 3951 } 3952 3953 static struct notifier_block md_notifier = { 3954 .notifier_call = md_notify_reboot, 3955 .next = NULL, 3956 .priority = INT_MAX, /* before any real devices */ 3957 }; 3958 3959 static void md_geninit(void) 3960 { 3961 struct proc_dir_entry *p; 3962 3963 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3964 3965 p = create_proc_entry("mdstat", S_IRUGO, NULL); 3966 if (p) 3967 p->proc_fops = &md_seq_fops; 3968 } 3969 3970 static int __init md_init(void) 3971 { 3972 int minor; 3973 3974 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 3975 " MD_SB_DISKS=%d\n", 3976 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3977 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3978 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, 3979 BITMAP_MINOR); 3980 3981 if (register_blkdev(MAJOR_NR, "md")) 3982 return -1; 3983 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 3984 unregister_blkdev(MAJOR_NR, "md"); 3985 return -1; 3986 } 3987 devfs_mk_dir("md"); 3988 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 3989 md_probe, NULL, NULL); 3990 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 3991 md_probe, NULL, NULL); 3992 3993 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3994 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 3995 S_IFBLK|S_IRUSR|S_IWUSR, 3996 "md/%d", minor); 3997 3998 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3999 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 4000 S_IFBLK|S_IRUSR|S_IWUSR, 4001 "md/mdp%d", minor); 4002 4003 4004 register_reboot_notifier(&md_notifier); 4005 raid_table_header = register_sysctl_table(raid_root_table, 1); 4006 4007 md_geninit(); 4008 return (0); 4009 } 4010 4011 4012 #ifndef MODULE 4013 4014 /* 4015 * Searches all registered partitions for autorun RAID arrays 4016 * at boot time. 
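 *
 * The partition-scanning code calls md_autodetect_dev() for each
 * partition it finds marked as Linux raid autodetect; the devices are
 * remembered in detected_devices[] and autostart_arrays() later
 * imports each one and hands the whole set to autorun_devices().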
4017 */ 4018 static dev_t detected_devices[128]; 4019 static int dev_cnt; 4020 4021 void md_autodetect_dev(dev_t dev) 4022 { 4023 if (dev_cnt >= 0 && dev_cnt < 127) 4024 detected_devices[dev_cnt++] = dev; 4025 } 4026 4027 4028 static void autostart_arrays(int part) 4029 { 4030 mdk_rdev_t *rdev; 4031 int i; 4032 4033 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 4034 4035 for (i = 0; i < dev_cnt; i++) { 4036 dev_t dev = detected_devices[i]; 4037 4038 rdev = md_import_device(dev,0, 0); 4039 if (IS_ERR(rdev)) 4040 continue; 4041 4042 if (rdev->faulty) { 4043 MD_BUG(); 4044 continue; 4045 } 4046 list_add(&rdev->same_set, &pending_raid_disks); 4047 } 4048 dev_cnt = 0; 4049 4050 autorun_devices(part); 4051 } 4052 4053 #endif 4054 4055 static __exit void md_exit(void) 4056 { 4057 mddev_t *mddev; 4058 struct list_head *tmp; 4059 int i; 4060 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 4061 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 4062 for (i=0; i < MAX_MD_DEVS; i++) 4063 devfs_remove("md/%d", i); 4064 for (i=0; i < MAX_MD_DEVS; i++) 4065 devfs_remove("md/d%d", i); 4066 4067 devfs_remove("md"); 4068 4069 unregister_blkdev(MAJOR_NR,"md"); 4070 unregister_blkdev(mdp_major, "mdp"); 4071 unregister_reboot_notifier(&md_notifier); 4072 unregister_sysctl_table(raid_table_header); 4073 remove_proc_entry("mdstat", NULL); 4074 ITERATE_MDDEV(mddev,tmp) { 4075 struct gendisk *disk = mddev->gendisk; 4076 if (!disk) 4077 continue; 4078 export_array(mddev); 4079 del_gendisk(disk); 4080 put_disk(disk); 4081 mddev->gendisk = NULL; 4082 mddev_put(mddev); 4083 } 4084 } 4085 4086 module_init(md_init) 4087 module_exit(md_exit) 4088 4089 EXPORT_SYMBOL(register_md_personality); 4090 EXPORT_SYMBOL(unregister_md_personality); 4091 EXPORT_SYMBOL(md_error); 4092 EXPORT_SYMBOL(md_done_sync); 4093 EXPORT_SYMBOL(md_write_start); 4094 EXPORT_SYMBOL(md_write_end); 4095 EXPORT_SYMBOL(md_register_thread); 4096 EXPORT_SYMBOL(md_unregister_thread); 4097 EXPORT_SYMBOL(md_wakeup_thread); 4098 EXPORT_SYMBOL(md_print_devices); 4099 EXPORT_SYMBOL(md_check_recovery); 4100 MODULE_LICENSE("GPL"); 4101 MODULE_ALIAS("md"); 4102 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 4103
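
/*
 * Illustrative sketch only (not part of the driver): a minimal
 * userspace sequence for creating and starting an array through the
 * ioctls dispatched by md_ioctl() above.  The device numbers, RAID
 * level and lack of error handling here are assumptions made purely
 * for the example.
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int fd = open("/dev/md0", O_RDWR);
 *
 *	mdu_array_info_t info = { 0 };
 *	info.level = 1;				// RAID1
 *	info.raid_disks = 2;
 *	ioctl(fd, SET_ARRAY_INFO, &info);	// -> set_array_info()
 *
 *	mdu_disk_info_t disk = { 0 };
 *	disk.number = 0;
 *	disk.major = 8;				// e.g. /dev/sda1
 *	disk.minor = 1;
 *	disk.raid_disk = 0;
 *	disk.state = (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC);
 *	ioctl(fd, ADD_NEW_DISK, &disk);		// -> add_new_disk()
 *	// ...repeat ADD_NEW_DISK for the second member...
 *
 *	ioctl(fd, RUN_ARRAY, NULL);		// -> do_md_run()
 */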