1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/kernel.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/buffer_head.h> /* for invalidate_bdev */ 43 #include <linux/poll.h> 44 #include <linux/mutex.h> 45 #include <linux/ctype.h> 46 #include <linux/freezer.h> 47 48 #include <linux/init.h> 49 50 #include <linux/file.h> 51 52 #ifdef CONFIG_KMOD 53 #include <linux/kmod.h> 54 #endif 55 56 #include <asm/unaligned.h> 57 58 #define MAJOR_NR MD_MAJOR 59 #define MD_DRIVER 60 61 /* 63 partitions with the alternate major number (mdp) */ 62 #define MdpMinorShift 6 63 64 #define DEBUG 0 65 #define dprintk(x...) ((void)(DEBUG && printk(x))) 66 67 68 #ifndef MODULE 69 static void autostart_arrays (int part); 70 #endif 71 72 static LIST_HEAD(pers_list); 73 static DEFINE_SPINLOCK(pers_lock); 74 75 static void md_print_devices(void); 76 77 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 78 79 /* 80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 81 * is 1000 KB/sec, so the extra system load does not show up that much. 82 * Increase it if you want to have more _guaranteed_ speed. Note that 83 * the RAID driver will use the maximum available bandwidth if the IO 84 * subsystem is idle. There is also an 'absolute maximum' reconstruction 85 * speed limit - in case reconstruction slows down your system despite 86 * idle IO detection. 87 * 88 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 89 * or /sys/block/mdX/md/sync_speed_{min,max} 90 */ 91 92 static int sysctl_speed_limit_min = 1000; 93 static int sysctl_speed_limit_max = 200000; 94 static inline int speed_min(mddev_t *mddev) 95 { 96 return mddev->sync_speed_min ? 97 mddev->sync_speed_min : sysctl_speed_limit_min; 98 } 99 100 static inline int speed_max(mddev_t *mddev) 101 { 102 return mddev->sync_speed_max ? 
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Allows iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put() it.
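 *
 * A minimal usage sketch (illustrative only, not part of this file;
 * md_print_devices() below follows the same pattern):
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp) {
 *		printk(KERN_INFO "md: visiting %s\n", mdname(mddev));
 *	}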
197 */ 198 #define ITERATE_MDDEV(mddev,tmp) \ 199 \ 200 for (({ spin_lock(&all_mddevs_lock); \ 201 tmp = all_mddevs.next; \ 202 mddev = NULL;}); \ 203 ({ if (tmp != &all_mddevs) \ 204 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 205 spin_unlock(&all_mddevs_lock); \ 206 if (mddev) mddev_put(mddev); \ 207 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 208 tmp != &all_mddevs;}); \ 209 ({ spin_lock(&all_mddevs_lock); \ 210 tmp = tmp->next;}) \ 211 ) 212 213 214 static int md_fail_request (struct request_queue *q, struct bio *bio) 215 { 216 bio_io_error(bio, bio->bi_size); 217 return 0; 218 } 219 220 static inline mddev_t *mddev_get(mddev_t *mddev) 221 { 222 atomic_inc(&mddev->active); 223 return mddev; 224 } 225 226 static void mddev_put(mddev_t *mddev) 227 { 228 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 229 return; 230 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 231 list_del(&mddev->all_mddevs); 232 spin_unlock(&all_mddevs_lock); 233 blk_cleanup_queue(mddev->queue); 234 kobject_unregister(&mddev->kobj); 235 } else 236 spin_unlock(&all_mddevs_lock); 237 } 238 239 static mddev_t * mddev_find(dev_t unit) 240 { 241 mddev_t *mddev, *new = NULL; 242 243 retry: 244 spin_lock(&all_mddevs_lock); 245 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 246 if (mddev->unit == unit) { 247 mddev_get(mddev); 248 spin_unlock(&all_mddevs_lock); 249 kfree(new); 250 return mddev; 251 } 252 253 if (new) { 254 list_add(&new->all_mddevs, &all_mddevs); 255 spin_unlock(&all_mddevs_lock); 256 return new; 257 } 258 spin_unlock(&all_mddevs_lock); 259 260 new = kzalloc(sizeof(*new), GFP_KERNEL); 261 if (!new) 262 return NULL; 263 264 new->unit = unit; 265 if (MAJOR(unit) == MD_MAJOR) 266 new->md_minor = MINOR(unit); 267 else 268 new->md_minor = MINOR(unit) >> MdpMinorShift; 269 270 mutex_init(&new->reconfig_mutex); 271 INIT_LIST_HEAD(&new->disks); 272 INIT_LIST_HEAD(&new->all_mddevs); 273 init_timer(&new->safemode_timer); 274 atomic_set(&new->active, 1); 275 spin_lock_init(&new->write_lock); 276 init_waitqueue_head(&new->sb_wait); 277 new->reshape_position = MaxSector; 278 279 new->queue = blk_alloc_queue(GFP_KERNEL); 280 if (!new->queue) { 281 kfree(new); 282 return NULL; 283 } 284 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 285 286 blk_queue_make_request(new->queue, md_fail_request); 287 288 goto retry; 289 } 290 291 static inline int mddev_lock(mddev_t * mddev) 292 { 293 return mutex_lock_interruptible(&mddev->reconfig_mutex); 294 } 295 296 static inline int mddev_trylock(mddev_t * mddev) 297 { 298 return mutex_trylock(&mddev->reconfig_mutex); 299 } 300 301 static inline void mddev_unlock(mddev_t * mddev) 302 { 303 mutex_unlock(&mddev->reconfig_mutex); 304 305 md_wakeup_thread(mddev->thread); 306 } 307 308 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 309 { 310 mdk_rdev_t * rdev; 311 struct list_head *tmp; 312 313 ITERATE_RDEV(mddev,rdev,tmp) { 314 if (rdev->desc_nr == nr) 315 return rdev; 316 } 317 return NULL; 318 } 319 320 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 321 { 322 struct list_head *tmp; 323 mdk_rdev_t *rdev; 324 325 ITERATE_RDEV(mddev,rdev,tmp) { 326 if (rdev->bdev->bd_dev == dev) 327 return rdev; 328 } 329 return NULL; 330 } 331 332 static struct mdk_personality *find_pers(int level, char *clevel) 333 { 334 struct mdk_personality *pers; 335 list_for_each_entry(pers, &pers_list, list) { 336 if (level != LEVEL_NONE && pers->level == level) 337 return pers; 338 if (strcmp(pers->name, clevel)==0) 339 return pers; 340 } 341 return NULL; 
342 } 343 344 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 345 { 346 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 347 return MD_NEW_SIZE_BLOCKS(size); 348 } 349 350 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 351 { 352 sector_t size; 353 354 size = rdev->sb_offset; 355 356 if (chunk_size) 357 size &= ~((sector_t)chunk_size/1024 - 1); 358 return size; 359 } 360 361 static int alloc_disk_sb(mdk_rdev_t * rdev) 362 { 363 if (rdev->sb_page) 364 MD_BUG(); 365 366 rdev->sb_page = alloc_page(GFP_KERNEL); 367 if (!rdev->sb_page) { 368 printk(KERN_ALERT "md: out of memory.\n"); 369 return -EINVAL; 370 } 371 372 return 0; 373 } 374 375 static void free_disk_sb(mdk_rdev_t * rdev) 376 { 377 if (rdev->sb_page) { 378 put_page(rdev->sb_page); 379 rdev->sb_loaded = 0; 380 rdev->sb_page = NULL; 381 rdev->sb_offset = 0; 382 rdev->size = 0; 383 } 384 } 385 386 387 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 388 { 389 mdk_rdev_t *rdev = bio->bi_private; 390 mddev_t *mddev = rdev->mddev; 391 if (bio->bi_size) 392 return 1; 393 394 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 395 printk("md: super_written gets error=%d, uptodate=%d\n", 396 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 397 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 398 md_error(mddev, rdev); 399 } 400 401 if (atomic_dec_and_test(&mddev->pending_writes)) 402 wake_up(&mddev->sb_wait); 403 bio_put(bio); 404 return 0; 405 } 406 407 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 408 { 409 struct bio *bio2 = bio->bi_private; 410 mdk_rdev_t *rdev = bio2->bi_private; 411 mddev_t *mddev = rdev->mddev; 412 if (bio->bi_size) 413 return 1; 414 415 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 416 error == -EOPNOTSUPP) { 417 unsigned long flags; 418 /* barriers don't appear to be supported :-( */ 419 set_bit(BarriersNotsupp, &rdev->flags); 420 mddev->barriers_work = 0; 421 spin_lock_irqsave(&mddev->write_lock, flags); 422 bio2->bi_next = mddev->biolist; 423 mddev->biolist = bio2; 424 spin_unlock_irqrestore(&mddev->write_lock, flags); 425 wake_up(&mddev->sb_wait); 426 bio_put(bio); 427 return 0; 428 } 429 bio_put(bio2); 430 bio->bi_private = rdev; 431 return super_written(bio, bytes_done, error); 432 } 433 434 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 435 sector_t sector, int size, struct page *page) 436 { 437 /* write first size bytes of page to sector of rdev 438 * Increment mddev->pending_writes before returning 439 * and decrement it on completion, waking up sb_wait 440 * if zero is reached. 441 * If an error occurred, call md_error 442 * 443 * As we might need to resubmit the request if BIO_RW_BARRIER 444 * causes ENOTSUPP, we allocate a spare bio... 445 */ 446 struct bio *bio = bio_alloc(GFP_NOIO, 1); 447 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 448 449 bio->bi_bdev = rdev->bdev; 450 bio->bi_sector = sector; 451 bio_add_page(bio, page, size, 0); 452 bio->bi_private = rdev; 453 bio->bi_end_io = super_written; 454 bio->bi_rw = rw; 455 456 atomic_inc(&mddev->pending_writes); 457 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 458 struct bio *rbio; 459 rw |= (1<<BIO_RW_BARRIER); 460 rbio = bio_clone(bio, GFP_NOIO); 461 rbio->bi_private = bio; 462 rbio->bi_end_io = super_written_barrier; 463 submit_bio(rw, rbio); 464 } else 465 submit_bio(rw, bio); 466 } 467 468 void md_super_wait(mddev_t *mddev) 469 { 470 /* wait for all superblock writes that were scheduled to complete. 
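	 * (A typical caller - md_update_sb() below is one - queues one
	 *  md_super_write() per superblock it needs to update and then calls
	 *  md_super_wait() once; we return only when mddev->pending_writes
	 *  has dropped to zero.)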
471 * if any had to be retried (due to BARRIER problems), retry them 472 */ 473 DEFINE_WAIT(wq); 474 for(;;) { 475 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 476 if (atomic_read(&mddev->pending_writes)==0) 477 break; 478 while (mddev->biolist) { 479 struct bio *bio; 480 spin_lock_irq(&mddev->write_lock); 481 bio = mddev->biolist; 482 mddev->biolist = bio->bi_next ; 483 bio->bi_next = NULL; 484 spin_unlock_irq(&mddev->write_lock); 485 submit_bio(bio->bi_rw, bio); 486 } 487 schedule(); 488 } 489 finish_wait(&mddev->sb_wait, &wq); 490 } 491 492 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 493 { 494 if (bio->bi_size) 495 return 1; 496 497 complete((struct completion*)bio->bi_private); 498 return 0; 499 } 500 501 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 502 struct page *page, int rw) 503 { 504 struct bio *bio = bio_alloc(GFP_NOIO, 1); 505 struct completion event; 506 int ret; 507 508 rw |= (1 << BIO_RW_SYNC); 509 510 bio->bi_bdev = bdev; 511 bio->bi_sector = sector; 512 bio_add_page(bio, page, size, 0); 513 init_completion(&event); 514 bio->bi_private = &event; 515 bio->bi_end_io = bi_complete; 516 submit_bio(rw, bio); 517 wait_for_completion(&event); 518 519 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 520 bio_put(bio); 521 return ret; 522 } 523 EXPORT_SYMBOL_GPL(sync_page_io); 524 525 static int read_disk_sb(mdk_rdev_t * rdev, int size) 526 { 527 char b[BDEVNAME_SIZE]; 528 if (!rdev->sb_page) { 529 MD_BUG(); 530 return -EINVAL; 531 } 532 if (rdev->sb_loaded) 533 return 0; 534 535 536 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 537 goto fail; 538 rdev->sb_loaded = 1; 539 return 0; 540 541 fail: 542 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 543 bdevname(rdev->bdev,b)); 544 return -EINVAL; 545 } 546 547 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 548 { 549 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 550 (sb1->set_uuid1 == sb2->set_uuid1) && 551 (sb1->set_uuid2 == sb2->set_uuid2) && 552 (sb1->set_uuid3 == sb2->set_uuid3)) 553 554 return 1; 555 556 return 0; 557 } 558 559 560 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 561 { 562 int ret; 563 mdp_super_t *tmp1, *tmp2; 564 565 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 566 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 567 568 if (!tmp1 || !tmp2) { 569 ret = 0; 570 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 571 goto abort; 572 } 573 574 *tmp1 = *sb1; 575 *tmp2 = *sb2; 576 577 /* 578 * nr_disks is not constant 579 */ 580 tmp1->nr_disks = 0; 581 tmp2->nr_disks = 0; 582 583 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 584 ret = 0; 585 else 586 ret = 1; 587 588 abort: 589 kfree(tmp1); 590 kfree(tmp2); 591 return ret; 592 } 593 594 595 static u32 md_csum_fold(u32 csum) 596 { 597 csum = (csum & 0xffff) + (csum >> 16); 598 return (csum & 0xffff) + (csum >> 16); 599 } 600 601 static unsigned int calc_sb_csum(mdp_super_t * sb) 602 { 603 u64 newcsum = 0; 604 u32 *sb32 = (u32*)sb; 605 int i; 606 unsigned int disk_csum, csum; 607 608 disk_csum = sb->sb_csum; 609 sb->sb_csum = 0; 610 611 for (i = 0; i < MD_SB_BYTES/4 ; i++) 612 newcsum += sb32[i]; 613 csum = (newcsum & 0xffffffff) + (newcsum>>32); 614 615 616 #ifdef CONFIG_ALPHA 617 /* This used to use csum_partial, which was wrong for several 618 * reasons including that different results are returned on 619 * different architectures. 
It isn't critical that we get exactly 620 * the same return value as before (we always csum_fold before 621 * testing, and that removes any differences). However as we 622 * know that csum_partial always returned a 16bit value on 623 * alphas, do a fold to maximise conformity to previous behaviour. 624 */ 625 sb->sb_csum = md_csum_fold(disk_csum); 626 #else 627 sb->sb_csum = disk_csum; 628 #endif 629 return csum; 630 } 631 632 633 /* 634 * Handle superblock details. 635 * We want to be able to handle multiple superblock formats 636 * so we have a common interface to them all, and an array of 637 * different handlers. 638 * We rely on user-space to write the initial superblock, and support 639 * reading and updating of superblocks. 640 * Interface methods are: 641 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 642 * loads and validates a superblock on dev. 643 * if refdev != NULL, compare superblocks on both devices 644 * Return: 645 * 0 - dev has a superblock that is compatible with refdev 646 * 1 - dev has a superblock that is compatible and newer than refdev 647 * so dev should be used as the refdev in future 648 * -EINVAL superblock incompatible or invalid 649 * -othererror e.g. -EIO 650 * 651 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 652 * Verify that dev is acceptable into mddev. 653 * The first time, mddev->raid_disks will be 0, and data from 654 * dev should be merged in. Subsequent calls check that dev 655 * is new enough. Return 0 or -EINVAL 656 * 657 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 658 * Update the superblock for rdev with data in mddev 659 * This does not write to disc. 660 * 661 */ 662 663 struct super_type { 664 char *name; 665 struct module *owner; 666 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 667 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 668 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 669 }; 670 671 /* 672 * load_super for 0.90.0 673 */ 674 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 675 { 676 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 677 mdp_super_t *sb; 678 int ret; 679 sector_t sb_offset; 680 681 /* 682 * Calculate the position of the superblock, 683 * it's at the end of the disk. 684 * 685 * It also happens to be a multiple of 4Kb. 
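	 *
	 * (Worked example, assuming MD_NEW_SIZE_BLOCKS() rounds the size in
	 *  1K blocks down to a 64K boundary and then steps back one 64K
	 *  reservation, as the 0.90 format does: a 10000 KB device gives
	 *  (10000 & ~63) - 64 = 9920, so the superblock starts 9920 KB into
	 *  the device, in the last fully aligned 64 KB it contains.)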
686 */ 687 sb_offset = calc_dev_sboffset(rdev->bdev); 688 rdev->sb_offset = sb_offset; 689 690 ret = read_disk_sb(rdev, MD_SB_BYTES); 691 if (ret) return ret; 692 693 ret = -EINVAL; 694 695 bdevname(rdev->bdev, b); 696 sb = (mdp_super_t*)page_address(rdev->sb_page); 697 698 if (sb->md_magic != MD_SB_MAGIC) { 699 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 700 b); 701 goto abort; 702 } 703 704 if (sb->major_version != 0 || 705 sb->minor_version < 90 || 706 sb->minor_version > 91) { 707 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 708 sb->major_version, sb->minor_version, 709 b); 710 goto abort; 711 } 712 713 if (sb->raid_disks <= 0) 714 goto abort; 715 716 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 717 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 718 b); 719 goto abort; 720 } 721 722 rdev->preferred_minor = sb->md_minor; 723 rdev->data_offset = 0; 724 rdev->sb_size = MD_SB_BYTES; 725 726 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 727 if (sb->level != 1 && sb->level != 4 728 && sb->level != 5 && sb->level != 6 729 && sb->level != 10) { 730 /* FIXME use a better test */ 731 printk(KERN_WARNING 732 "md: bitmaps not supported for this level.\n"); 733 goto abort; 734 } 735 } 736 737 if (sb->level == LEVEL_MULTIPATH) 738 rdev->desc_nr = -1; 739 else 740 rdev->desc_nr = sb->this_disk.number; 741 742 if (refdev == 0) 743 ret = 1; 744 else { 745 __u64 ev1, ev2; 746 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 747 if (!uuid_equal(refsb, sb)) { 748 printk(KERN_WARNING "md: %s has different UUID to %s\n", 749 b, bdevname(refdev->bdev,b2)); 750 goto abort; 751 } 752 if (!sb_equal(refsb, sb)) { 753 printk(KERN_WARNING "md: %s has same UUID" 754 " but different superblock to %s\n", 755 b, bdevname(refdev->bdev, b2)); 756 goto abort; 757 } 758 ev1 = md_event(sb); 759 ev2 = md_event(refsb); 760 if (ev1 > ev2) 761 ret = 1; 762 else 763 ret = 0; 764 } 765 rdev->size = calc_dev_size(rdev, sb->chunk_size); 766 767 if (rdev->size < sb->size && sb->level > 1) 768 /* "this cannot possibly happen" ... */ 769 ret = -EINVAL; 770 771 abort: 772 return ret; 773 } 774 775 /* 776 * validate_super for 0.90.0 777 */ 778 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 779 { 780 mdp_disk_t *desc; 781 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 782 __u64 ev1 = md_event(sb); 783 784 rdev->raid_disk = -1; 785 rdev->flags = 0; 786 if (mddev->raid_disks == 0) { 787 mddev->major_version = 0; 788 mddev->minor_version = sb->minor_version; 789 mddev->patch_version = sb->patch_version; 790 mddev->persistent = ! 
sb->not_persistent; 791 mddev->chunk_size = sb->chunk_size; 792 mddev->ctime = sb->ctime; 793 mddev->utime = sb->utime; 794 mddev->level = sb->level; 795 mddev->clevel[0] = 0; 796 mddev->layout = sb->layout; 797 mddev->raid_disks = sb->raid_disks; 798 mddev->size = sb->size; 799 mddev->events = ev1; 800 mddev->bitmap_offset = 0; 801 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 802 803 if (mddev->minor_version >= 91) { 804 mddev->reshape_position = sb->reshape_position; 805 mddev->delta_disks = sb->delta_disks; 806 mddev->new_level = sb->new_level; 807 mddev->new_layout = sb->new_layout; 808 mddev->new_chunk = sb->new_chunk; 809 } else { 810 mddev->reshape_position = MaxSector; 811 mddev->delta_disks = 0; 812 mddev->new_level = mddev->level; 813 mddev->new_layout = mddev->layout; 814 mddev->new_chunk = mddev->chunk_size; 815 } 816 817 if (sb->state & (1<<MD_SB_CLEAN)) 818 mddev->recovery_cp = MaxSector; 819 else { 820 if (sb->events_hi == sb->cp_events_hi && 821 sb->events_lo == sb->cp_events_lo) { 822 mddev->recovery_cp = sb->recovery_cp; 823 } else 824 mddev->recovery_cp = 0; 825 } 826 827 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 828 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 829 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 830 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 831 832 mddev->max_disks = MD_SB_DISKS; 833 834 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 835 mddev->bitmap_file == NULL) 836 mddev->bitmap_offset = mddev->default_bitmap_offset; 837 838 } else if (mddev->pers == NULL) { 839 /* Insist on good event counter while assembling */ 840 ++ev1; 841 if (ev1 < mddev->events) 842 return -EINVAL; 843 } else if (mddev->bitmap) { 844 /* if adding to array with a bitmap, then we can accept an 845 * older device ... but not too old. 846 */ 847 if (ev1 < mddev->bitmap->events_cleared) 848 return 0; 849 } else { 850 if (ev1 < mddev->events) 851 /* just a hot-add of a new device, leave raid_disk at -1 */ 852 return 0; 853 } 854 855 if (mddev->level != LEVEL_MULTIPATH) { 856 desc = sb->disks + rdev->desc_nr; 857 858 if (desc->state & (1<<MD_DISK_FAULTY)) 859 set_bit(Faulty, &rdev->flags); 860 else if (desc->state & (1<<MD_DISK_SYNC) /* && 861 desc->raid_disk < mddev->raid_disks */) { 862 set_bit(In_sync, &rdev->flags); 863 rdev->raid_disk = desc->raid_disk; 864 } 865 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 866 set_bit(WriteMostly, &rdev->flags); 867 } else /* MULTIPATH are always insync */ 868 set_bit(In_sync, &rdev->flags); 869 return 0; 870 } 871 872 /* 873 * sync_super for 0.90.0 874 */ 875 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 876 { 877 mdp_super_t *sb; 878 struct list_head *tmp; 879 mdk_rdev_t *rdev2; 880 int next_spare = mddev->raid_disks; 881 882 883 /* make rdev->sb match mddev data.. 884 * 885 * 1/ zero out disks 886 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 887 * 3/ any empty disks < next_spare become removed 888 * 889 * disks[0] gets initialised to REMOVED because 890 * we cannot be sure from other fields if it has 891 * been initialised or not. 
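	 *
	 * (Illustration only: for a clean two-disk RAID1 with one spare,
	 *  disks[0] and disks[1] come out ACTIVE|SYNC, the spare lands at
	 *  desc_nr 2 (== raid_disks) with state 0, and any raid slot with
	 *  no device at all is marked REMOVED|FAULTY by the fixup loop
	 *  at the end of this function.)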
892 */ 893 int i; 894 int active=0, working=0,failed=0,spare=0,nr_disks=0; 895 896 rdev->sb_size = MD_SB_BYTES; 897 898 sb = (mdp_super_t*)page_address(rdev->sb_page); 899 900 memset(sb, 0, sizeof(*sb)); 901 902 sb->md_magic = MD_SB_MAGIC; 903 sb->major_version = mddev->major_version; 904 sb->patch_version = mddev->patch_version; 905 sb->gvalid_words = 0; /* ignored */ 906 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 907 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 908 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 909 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 910 911 sb->ctime = mddev->ctime; 912 sb->level = mddev->level; 913 sb->size = mddev->size; 914 sb->raid_disks = mddev->raid_disks; 915 sb->md_minor = mddev->md_minor; 916 sb->not_persistent = !mddev->persistent; 917 sb->utime = mddev->utime; 918 sb->state = 0; 919 sb->events_hi = (mddev->events>>32); 920 sb->events_lo = (u32)mddev->events; 921 922 if (mddev->reshape_position == MaxSector) 923 sb->minor_version = 90; 924 else { 925 sb->minor_version = 91; 926 sb->reshape_position = mddev->reshape_position; 927 sb->new_level = mddev->new_level; 928 sb->delta_disks = mddev->delta_disks; 929 sb->new_layout = mddev->new_layout; 930 sb->new_chunk = mddev->new_chunk; 931 } 932 mddev->minor_version = sb->minor_version; 933 if (mddev->in_sync) 934 { 935 sb->recovery_cp = mddev->recovery_cp; 936 sb->cp_events_hi = (mddev->events>>32); 937 sb->cp_events_lo = (u32)mddev->events; 938 if (mddev->recovery_cp == MaxSector) 939 sb->state = (1<< MD_SB_CLEAN); 940 } else 941 sb->recovery_cp = 0; 942 943 sb->layout = mddev->layout; 944 sb->chunk_size = mddev->chunk_size; 945 946 if (mddev->bitmap && mddev->bitmap_file == NULL) 947 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 948 949 sb->disks[0].state = (1<<MD_DISK_REMOVED); 950 ITERATE_RDEV(mddev,rdev2,tmp) { 951 mdp_disk_t *d; 952 int desc_nr; 953 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 954 && !test_bit(Faulty, &rdev2->flags)) 955 desc_nr = rdev2->raid_disk; 956 else 957 desc_nr = next_spare++; 958 rdev2->desc_nr = desc_nr; 959 d = &sb->disks[rdev2->desc_nr]; 960 nr_disks++; 961 d->number = rdev2->desc_nr; 962 d->major = MAJOR(rdev2->bdev->bd_dev); 963 d->minor = MINOR(rdev2->bdev->bd_dev); 964 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 965 && !test_bit(Faulty, &rdev2->flags)) 966 d->raid_disk = rdev2->raid_disk; 967 else 968 d->raid_disk = rdev2->desc_nr; /* compatibility */ 969 if (test_bit(Faulty, &rdev2->flags)) 970 d->state = (1<<MD_DISK_FAULTY); 971 else if (test_bit(In_sync, &rdev2->flags)) { 972 d->state = (1<<MD_DISK_ACTIVE); 973 d->state |= (1<<MD_DISK_SYNC); 974 active++; 975 working++; 976 } else { 977 d->state = 0; 978 spare++; 979 working++; 980 } 981 if (test_bit(WriteMostly, &rdev2->flags)) 982 d->state |= (1<<MD_DISK_WRITEMOSTLY); 983 } 984 /* now set the "removed" and "faulty" bits on any missing devices */ 985 for (i=0 ; i < mddev->raid_disks ; i++) { 986 mdp_disk_t *d = &sb->disks[i]; 987 if (d->state == 0 && d->number == 0) { 988 d->number = i; 989 d->raid_disk = i; 990 d->state = (1<<MD_DISK_REMOVED); 991 d->state |= (1<<MD_DISK_FAULTY); 992 failed++; 993 } 994 } 995 sb->nr_disks = nr_disks; 996 sb->active_disks = active; 997 sb->working_disks = working; 998 sb->failed_disks = failed; 999 sb->spare_disks = spare; 1000 1001 sb->this_disk = sb->disks[rdev->desc_nr]; 1002 sb->sb_csum = calc_sb_csum(sb); 1003 } 1004 1005 /* 1006 * version 1 superblock 1007 */ 1008 1009 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1010 { 1011 __le32 disk_csum; 
1012 u32 csum; 1013 unsigned long long newcsum; 1014 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1015 __le32 *isuper = (__le32*)sb; 1016 int i; 1017 1018 disk_csum = sb->sb_csum; 1019 sb->sb_csum = 0; 1020 newcsum = 0; 1021 for (i=0; size>=4; size -= 4 ) 1022 newcsum += le32_to_cpu(*isuper++); 1023 1024 if (size == 2) 1025 newcsum += le16_to_cpu(*(__le16*) isuper); 1026 1027 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1028 sb->sb_csum = disk_csum; 1029 return cpu_to_le32(csum); 1030 } 1031 1032 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1033 { 1034 struct mdp_superblock_1 *sb; 1035 int ret; 1036 sector_t sb_offset; 1037 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1038 int bmask; 1039 1040 /* 1041 * Calculate the position of the superblock. 1042 * It is always aligned to a 4K boundary and 1043 * depeding on minor_version, it can be: 1044 * 0: At least 8K, but less than 12K, from end of device 1045 * 1: At start of device 1046 * 2: 4K from start of device. 1047 */ 1048 switch(minor_version) { 1049 case 0: 1050 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1051 sb_offset -= 8*2; 1052 sb_offset &= ~(sector_t)(4*2-1); 1053 /* convert from sectors to K */ 1054 sb_offset /= 2; 1055 break; 1056 case 1: 1057 sb_offset = 0; 1058 break; 1059 case 2: 1060 sb_offset = 4; 1061 break; 1062 default: 1063 return -EINVAL; 1064 } 1065 rdev->sb_offset = sb_offset; 1066 1067 /* superblock is rarely larger than 1K, but it can be larger, 1068 * and it is safe to read 4k, so we do that 1069 */ 1070 ret = read_disk_sb(rdev, 4096); 1071 if (ret) return ret; 1072 1073 1074 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1075 1076 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1077 sb->major_version != cpu_to_le32(1) || 1078 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1079 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1080 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1081 return -EINVAL; 1082 1083 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1084 printk("md: invalid superblock checksum on %s\n", 1085 bdevname(rdev->bdev,b)); 1086 return -EINVAL; 1087 } 1088 if (le64_to_cpu(sb->data_size) < 10) { 1089 printk("md: data_size too small on %s\n", 1090 bdevname(rdev->bdev,b)); 1091 return -EINVAL; 1092 } 1093 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1094 if (sb->level != cpu_to_le32(1) && 1095 sb->level != cpu_to_le32(4) && 1096 sb->level != cpu_to_le32(5) && 1097 sb->level != cpu_to_le32(6) && 1098 sb->level != cpu_to_le32(10)) { 1099 printk(KERN_WARNING 1100 "md: bitmaps not supported for this level.\n"); 1101 return -EINVAL; 1102 } 1103 } 1104 1105 rdev->preferred_minor = 0xffff; 1106 rdev->data_offset = le64_to_cpu(sb->data_offset); 1107 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1108 1109 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1110 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1111 if (rdev->sb_size & bmask) 1112 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1113 1114 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1115 rdev->desc_nr = -1; 1116 else 1117 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1118 1119 if (refdev == 0) 1120 ret = 1; 1121 else { 1122 __u64 ev1, ev2; 1123 struct mdp_superblock_1 *refsb = 1124 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1125 1126 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1127 sb->level != refsb->level || 1128 sb->layout != refsb->layout || 1129 sb->chunksize != refsb->chunksize) { 1130 printk(KERN_WARNING "md: %s 
has strangely different" 1131 " superblock to %s\n", 1132 bdevname(rdev->bdev,b), 1133 bdevname(refdev->bdev,b2)); 1134 return -EINVAL; 1135 } 1136 ev1 = le64_to_cpu(sb->events); 1137 ev2 = le64_to_cpu(refsb->events); 1138 1139 if (ev1 > ev2) 1140 ret = 1; 1141 else 1142 ret = 0; 1143 } 1144 if (minor_version) 1145 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1146 else 1147 rdev->size = rdev->sb_offset; 1148 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1149 return -EINVAL; 1150 rdev->size = le64_to_cpu(sb->data_size)/2; 1151 if (le32_to_cpu(sb->chunksize)) 1152 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1153 1154 if (le64_to_cpu(sb->size) > rdev->size*2) 1155 return -EINVAL; 1156 return ret; 1157 } 1158 1159 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1160 { 1161 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1162 __u64 ev1 = le64_to_cpu(sb->events); 1163 1164 rdev->raid_disk = -1; 1165 rdev->flags = 0; 1166 if (mddev->raid_disks == 0) { 1167 mddev->major_version = 1; 1168 mddev->patch_version = 0; 1169 mddev->persistent = 1; 1170 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1171 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1172 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1173 mddev->level = le32_to_cpu(sb->level); 1174 mddev->clevel[0] = 0; 1175 mddev->layout = le32_to_cpu(sb->layout); 1176 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1177 mddev->size = le64_to_cpu(sb->size)/2; 1178 mddev->events = ev1; 1179 mddev->bitmap_offset = 0; 1180 mddev->default_bitmap_offset = 1024 >> 9; 1181 1182 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1183 memcpy(mddev->uuid, sb->set_uuid, 16); 1184 1185 mddev->max_disks = (4096-256)/2; 1186 1187 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1188 mddev->bitmap_file == NULL ) 1189 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1190 1191 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1192 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1193 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1194 mddev->new_level = le32_to_cpu(sb->new_level); 1195 mddev->new_layout = le32_to_cpu(sb->new_layout); 1196 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1197 } else { 1198 mddev->reshape_position = MaxSector; 1199 mddev->delta_disks = 0; 1200 mddev->new_level = mddev->level; 1201 mddev->new_layout = mddev->layout; 1202 mddev->new_chunk = mddev->chunk_size; 1203 } 1204 1205 } else if (mddev->pers == NULL) { 1206 /* Insist of good event counter while assembling */ 1207 ++ev1; 1208 if (ev1 < mddev->events) 1209 return -EINVAL; 1210 } else if (mddev->bitmap) { 1211 /* If adding to array with a bitmap, then we can accept an 1212 * older device, but not too old. 
1213 */ 1214 if (ev1 < mddev->bitmap->events_cleared) 1215 return 0; 1216 } else { 1217 if (ev1 < mddev->events) 1218 /* just a hot-add of a new device, leave raid_disk at -1 */ 1219 return 0; 1220 } 1221 if (mddev->level != LEVEL_MULTIPATH) { 1222 int role; 1223 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1224 switch(role) { 1225 case 0xffff: /* spare */ 1226 break; 1227 case 0xfffe: /* faulty */ 1228 set_bit(Faulty, &rdev->flags); 1229 break; 1230 default: 1231 if ((le32_to_cpu(sb->feature_map) & 1232 MD_FEATURE_RECOVERY_OFFSET)) 1233 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1234 else 1235 set_bit(In_sync, &rdev->flags); 1236 rdev->raid_disk = role; 1237 break; 1238 } 1239 if (sb->devflags & WriteMostly1) 1240 set_bit(WriteMostly, &rdev->flags); 1241 } else /* MULTIPATH are always insync */ 1242 set_bit(In_sync, &rdev->flags); 1243 1244 return 0; 1245 } 1246 1247 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1248 { 1249 struct mdp_superblock_1 *sb; 1250 struct list_head *tmp; 1251 mdk_rdev_t *rdev2; 1252 int max_dev, i; 1253 /* make rdev->sb match mddev and rdev data. */ 1254 1255 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1256 1257 sb->feature_map = 0; 1258 sb->pad0 = 0; 1259 sb->recovery_offset = cpu_to_le64(0); 1260 memset(sb->pad1, 0, sizeof(sb->pad1)); 1261 memset(sb->pad2, 0, sizeof(sb->pad2)); 1262 memset(sb->pad3, 0, sizeof(sb->pad3)); 1263 1264 sb->utime = cpu_to_le64((__u64)mddev->utime); 1265 sb->events = cpu_to_le64(mddev->events); 1266 if (mddev->in_sync) 1267 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1268 else 1269 sb->resync_offset = cpu_to_le64(0); 1270 1271 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1272 1273 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1274 sb->size = cpu_to_le64(mddev->size<<1); 1275 1276 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1277 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1278 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1279 } 1280 1281 if (rdev->raid_disk >= 0 && 1282 !test_bit(In_sync, &rdev->flags) && 1283 rdev->recovery_offset > 0) { 1284 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1285 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1286 } 1287 1288 if (mddev->reshape_position != MaxSector) { 1289 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1290 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1291 sb->new_layout = cpu_to_le32(mddev->new_layout); 1292 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1293 sb->new_level = cpu_to_le32(mddev->new_level); 1294 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1295 } 1296 1297 max_dev = 0; 1298 ITERATE_RDEV(mddev,rdev2,tmp) 1299 if (rdev2->desc_nr+1 > max_dev) 1300 max_dev = rdev2->desc_nr+1; 1301 1302 if (max_dev > le32_to_cpu(sb->max_dev)) 1303 sb->max_dev = cpu_to_le32(max_dev); 1304 for (i=0; i<max_dev;i++) 1305 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1306 1307 ITERATE_RDEV(mddev,rdev2,tmp) { 1308 i = rdev2->desc_nr; 1309 if (test_bit(Faulty, &rdev2->flags)) 1310 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1311 else if (test_bit(In_sync, &rdev2->flags)) 1312 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1313 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1314 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1315 else 1316 sb->dev_roles[i] = cpu_to_le16(0xffff); 1317 } 1318 1319 sb->sb_csum = calc_sb_1_csum(sb); 1320 } 1321 1322 1323 static struct super_type super_types[] = { 1324 [0] = 
{ 1325 .name = "0.90.0", 1326 .owner = THIS_MODULE, 1327 .load_super = super_90_load, 1328 .validate_super = super_90_validate, 1329 .sync_super = super_90_sync, 1330 }, 1331 [1] = { 1332 .name = "md-1", 1333 .owner = THIS_MODULE, 1334 .load_super = super_1_load, 1335 .validate_super = super_1_validate, 1336 .sync_super = super_1_sync, 1337 }, 1338 }; 1339 1340 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1341 { 1342 struct list_head *tmp, *tmp2; 1343 mdk_rdev_t *rdev, *rdev2; 1344 1345 ITERATE_RDEV(mddev1,rdev,tmp) 1346 ITERATE_RDEV(mddev2, rdev2, tmp2) 1347 if (rdev->bdev->bd_contains == 1348 rdev2->bdev->bd_contains) 1349 return 1; 1350 1351 return 0; 1352 } 1353 1354 static LIST_HEAD(pending_raid_disks); 1355 1356 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1357 { 1358 char b[BDEVNAME_SIZE]; 1359 struct kobject *ko; 1360 char *s; 1361 int err; 1362 1363 if (rdev->mddev) { 1364 MD_BUG(); 1365 return -EINVAL; 1366 } 1367 /* make sure rdev->size exceeds mddev->size */ 1368 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1369 if (mddev->pers) { 1370 /* Cannot change size, so fail 1371 * If mddev->level <= 0, then we don't care 1372 * about aligning sizes (e.g. linear) 1373 */ 1374 if (mddev->level > 0) 1375 return -ENOSPC; 1376 } else 1377 mddev->size = rdev->size; 1378 } 1379 1380 /* Verify rdev->desc_nr is unique. 1381 * If it is -1, assign a free number, else 1382 * check number is not in use 1383 */ 1384 if (rdev->desc_nr < 0) { 1385 int choice = 0; 1386 if (mddev->pers) choice = mddev->raid_disks; 1387 while (find_rdev_nr(mddev, choice)) 1388 choice++; 1389 rdev->desc_nr = choice; 1390 } else { 1391 if (find_rdev_nr(mddev, rdev->desc_nr)) 1392 return -EBUSY; 1393 } 1394 bdevname(rdev->bdev,b); 1395 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1396 return -ENOMEM; 1397 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1398 *s = '!'; 1399 1400 rdev->mddev = mddev; 1401 printk(KERN_INFO "md: bind<%s>\n", b); 1402 1403 rdev->kobj.parent = &mddev->kobj; 1404 if ((err = kobject_add(&rdev->kobj))) 1405 goto fail; 1406 1407 if (rdev->bdev->bd_part) 1408 ko = &rdev->bdev->bd_part->kobj; 1409 else 1410 ko = &rdev->bdev->bd_disk->kobj; 1411 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1412 kobject_del(&rdev->kobj); 1413 goto fail; 1414 } 1415 list_add(&rdev->same_set, &mddev->disks); 1416 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1417 return 0; 1418 1419 fail: 1420 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1421 b, mdname(mddev)); 1422 return err; 1423 } 1424 1425 static void delayed_delete(struct work_struct *ws) 1426 { 1427 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1428 kobject_del(&rdev->kobj); 1429 } 1430 1431 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1432 { 1433 char b[BDEVNAME_SIZE]; 1434 if (!rdev->mddev) { 1435 MD_BUG(); 1436 return; 1437 } 1438 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1439 list_del_init(&rdev->same_set); 1440 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1441 rdev->mddev = NULL; 1442 sysfs_remove_link(&rdev->kobj, "block"); 1443 1444 /* We need to delay this, otherwise we can deadlock when 1445 * writing to 'remove' to "dev/state" 1446 */ 1447 INIT_WORK(&rdev->del_work, delayed_delete); 1448 schedule_work(&rdev->del_work); 1449 } 1450 1451 /* 1452 * prevent the device from being mounted, repartitioned or 1453 * otherwise reused by a RAID array (or any other kernel 1454 * subsystem), by bd_claiming the 
device. 1455 */ 1456 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1457 { 1458 int err = 0; 1459 struct block_device *bdev; 1460 char b[BDEVNAME_SIZE]; 1461 1462 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1463 if (IS_ERR(bdev)) { 1464 printk(KERN_ERR "md: could not open %s.\n", 1465 __bdevname(dev, b)); 1466 return PTR_ERR(bdev); 1467 } 1468 err = bd_claim(bdev, rdev); 1469 if (err) { 1470 printk(KERN_ERR "md: could not bd_claim %s.\n", 1471 bdevname(bdev, b)); 1472 blkdev_put(bdev); 1473 return err; 1474 } 1475 rdev->bdev = bdev; 1476 return err; 1477 } 1478 1479 static void unlock_rdev(mdk_rdev_t *rdev) 1480 { 1481 struct block_device *bdev = rdev->bdev; 1482 rdev->bdev = NULL; 1483 if (!bdev) 1484 MD_BUG(); 1485 bd_release(bdev); 1486 blkdev_put(bdev); 1487 } 1488 1489 void md_autodetect_dev(dev_t dev); 1490 1491 static void export_rdev(mdk_rdev_t * rdev) 1492 { 1493 char b[BDEVNAME_SIZE]; 1494 printk(KERN_INFO "md: export_rdev(%s)\n", 1495 bdevname(rdev->bdev,b)); 1496 if (rdev->mddev) 1497 MD_BUG(); 1498 free_disk_sb(rdev); 1499 list_del_init(&rdev->same_set); 1500 #ifndef MODULE 1501 md_autodetect_dev(rdev->bdev->bd_dev); 1502 #endif 1503 unlock_rdev(rdev); 1504 kobject_put(&rdev->kobj); 1505 } 1506 1507 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1508 { 1509 unbind_rdev_from_array(rdev); 1510 export_rdev(rdev); 1511 } 1512 1513 static void export_array(mddev_t *mddev) 1514 { 1515 struct list_head *tmp; 1516 mdk_rdev_t *rdev; 1517 1518 ITERATE_RDEV(mddev,rdev,tmp) { 1519 if (!rdev->mddev) { 1520 MD_BUG(); 1521 continue; 1522 } 1523 kick_rdev_from_array(rdev); 1524 } 1525 if (!list_empty(&mddev->disks)) 1526 MD_BUG(); 1527 mddev->raid_disks = 0; 1528 mddev->major_version = 0; 1529 } 1530 1531 static void print_desc(mdp_disk_t *desc) 1532 { 1533 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1534 desc->major,desc->minor,desc->raid_disk,desc->state); 1535 } 1536 1537 static void print_sb(mdp_super_t *sb) 1538 { 1539 int i; 1540 1541 printk(KERN_INFO 1542 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1543 sb->major_version, sb->minor_version, sb->patch_version, 1544 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1545 sb->ctime); 1546 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1547 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1548 sb->md_minor, sb->layout, sb->chunk_size); 1549 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1550 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1551 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1552 sb->failed_disks, sb->spare_disks, 1553 sb->sb_csum, (unsigned long)sb->events_lo); 1554 1555 printk(KERN_INFO); 1556 for (i = 0; i < MD_SB_DISKS; i++) { 1557 mdp_disk_t *desc; 1558 1559 desc = sb->disks + i; 1560 if (desc->number || desc->major || desc->minor || 1561 desc->raid_disk || (desc->state && (desc->state != 4))) { 1562 printk(" D %2d: ", i); 1563 print_desc(desc); 1564 } 1565 } 1566 printk(KERN_INFO "md: THIS: "); 1567 print_desc(&sb->this_disk); 1568 1569 } 1570 1571 static void print_rdev(mdk_rdev_t *rdev) 1572 { 1573 char b[BDEVNAME_SIZE]; 1574 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1575 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1576 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1577 rdev->desc_nr); 1578 if (rdev->sb_loaded) { 1579 printk(KERN_INFO "md: rdev superblock:\n"); 1580 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1581 } else 1582 printk(KERN_INFO "md: no rdev superblock!\n"); 1583 } 1584 
1585 static void md_print_devices(void) 1586 { 1587 struct list_head *tmp, *tmp2; 1588 mdk_rdev_t *rdev; 1589 mddev_t *mddev; 1590 char b[BDEVNAME_SIZE]; 1591 1592 printk("\n"); 1593 printk("md: **********************************\n"); 1594 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1595 printk("md: **********************************\n"); 1596 ITERATE_MDDEV(mddev,tmp) { 1597 1598 if (mddev->bitmap) 1599 bitmap_print_sb(mddev->bitmap); 1600 else 1601 printk("%s: ", mdname(mddev)); 1602 ITERATE_RDEV(mddev,rdev,tmp2) 1603 printk("<%s>", bdevname(rdev->bdev,b)); 1604 printk("\n"); 1605 1606 ITERATE_RDEV(mddev,rdev,tmp2) 1607 print_rdev(rdev); 1608 } 1609 printk("md: **********************************\n"); 1610 printk("\n"); 1611 } 1612 1613 1614 static void sync_sbs(mddev_t * mddev, int nospares) 1615 { 1616 /* Update each superblock (in-memory image), but 1617 * if we are allowed to, skip spares which already 1618 * have the right event counter, or have one earlier 1619 * (which would mean they aren't being marked as dirty 1620 * with the rest of the array) 1621 */ 1622 mdk_rdev_t *rdev; 1623 struct list_head *tmp; 1624 1625 ITERATE_RDEV(mddev,rdev,tmp) { 1626 if (rdev->sb_events == mddev->events || 1627 (nospares && 1628 rdev->raid_disk < 0 && 1629 (rdev->sb_events&1)==0 && 1630 rdev->sb_events+1 == mddev->events)) { 1631 /* Don't update this superblock */ 1632 rdev->sb_loaded = 2; 1633 } else { 1634 super_types[mddev->major_version]. 1635 sync_super(mddev, rdev); 1636 rdev->sb_loaded = 1; 1637 } 1638 } 1639 } 1640 1641 static void md_update_sb(mddev_t * mddev, int force_change) 1642 { 1643 struct list_head *tmp; 1644 mdk_rdev_t *rdev; 1645 int sync_req; 1646 int nospares = 0; 1647 1648 repeat: 1649 spin_lock_irq(&mddev->write_lock); 1650 1651 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1652 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1653 force_change = 1; 1654 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1655 /* just a clean<-> dirty transition, possibly leave spares alone, 1656 * though if events isn't the right even/odd, we will have to do 1657 * spares after all 1658 */ 1659 nospares = 1; 1660 if (force_change) 1661 nospares = 0; 1662 if (mddev->degraded) 1663 /* If the array is degraded, then skipping spares is both 1664 * dangerous and fairly pointless. 1665 * Dangerous because a device that was removed from the array 1666 * might have a event_count that still looks up-to-date, 1667 * so it can be re-added without a resync. 1668 * Pointless because if there are any spares to skip, 1669 * then a recovery will happen and soon that array won't 1670 * be degraded any more and the spare can go back to sleep then. 1671 */ 1672 nospares = 0; 1673 1674 sync_req = mddev->in_sync; 1675 mddev->utime = get_seconds(); 1676 1677 /* If this is just a dirty<->clean transition, and the array is clean 1678 * and 'events' is odd, we can roll back to the previous clean state */ 1679 if (nospares 1680 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1681 && (mddev->events & 1) 1682 && mddev->events != 1) 1683 mddev->events--; 1684 else { 1685 /* otherwise we have to go forward and ... */ 1686 mddev->events ++; 1687 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1688 /* .. 
if the array isn't clean, insist on an odd 'events' */
			if ((mddev->events&1)==0) {
				mddev->events++;
				nospares = 0;
			}
		} else {
			/* otherwise insist on an even 'events' (for clean states) */
			if ((mddev->events&1)) {
				mddev->events++;
				nospares = 0;
			}
		}
	}

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}
	sync_sbs(mddev, nospares);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent) {
		clear_bit(MD_CHANGE_PENDING, &mddev->flags);
		spin_unlock_irq(&mddev->write_lock);
		wake_up(&mddev->sb_wait);
		return;
	}
	spin_unlock_irq(&mddev->write_lock);

	dprintk(KERN_INFO
		"md: updating %s RAID superblock on device (in sync %d)\n",
		mdname(mddev),mddev->in_sync);

	bitmap_update_sb(mddev->bitmap);
	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		dprintk(KERN_INFO "md: ");
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */
		if (test_bit(Faulty, &rdev->flags))
			dprintk("(skipping faulty ");

		dprintk("%s ", bdevname(rdev->bdev,b));
		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_offset<<1, rdev->sb_size,
				       rdev->sb_page);
			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
				bdevname(rdev->bdev,b),
				(unsigned long long)rdev->sb_offset);
			rdev->sb_events = mddev->events;

		} else
			dprintk(")\n");
		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync != sync_req ||
	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
		/* have to write it out again */
		spin_unlock_irq(&mddev->write_lock);
		goto repeat;
	}
	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
	spin_unlock_irq(&mddev->write_lock);
	wake_up(&mddev->sb_wait);

}

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.
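	 * (e.g. a write of "faulty\n", as produced by 'echo faulty' into a
	 *  sysfs attribute, matches the string "faulty").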
They must either be the same, or cmd can 1777 * have a trailing newline 1778 */ 1779 while (*cmd && *str && *cmd == *str) { 1780 cmd++; 1781 str++; 1782 } 1783 if (*cmd == '\n') 1784 cmd++; 1785 if (*str || *cmd) 1786 return 0; 1787 return 1; 1788 } 1789 1790 struct rdev_sysfs_entry { 1791 struct attribute attr; 1792 ssize_t (*show)(mdk_rdev_t *, char *); 1793 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1794 }; 1795 1796 static ssize_t 1797 state_show(mdk_rdev_t *rdev, char *page) 1798 { 1799 char *sep = ""; 1800 int len=0; 1801 1802 if (test_bit(Faulty, &rdev->flags)) { 1803 len+= sprintf(page+len, "%sfaulty",sep); 1804 sep = ","; 1805 } 1806 if (test_bit(In_sync, &rdev->flags)) { 1807 len += sprintf(page+len, "%sin_sync",sep); 1808 sep = ","; 1809 } 1810 if (test_bit(WriteMostly, &rdev->flags)) { 1811 len += sprintf(page+len, "%swrite_mostly",sep); 1812 sep = ","; 1813 } 1814 if (!test_bit(Faulty, &rdev->flags) && 1815 !test_bit(In_sync, &rdev->flags)) { 1816 len += sprintf(page+len, "%sspare", sep); 1817 sep = ","; 1818 } 1819 return len+sprintf(page+len, "\n"); 1820 } 1821 1822 static ssize_t 1823 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1824 { 1825 /* can write 1826 * faulty - simulates and error 1827 * remove - disconnects the device 1828 * writemostly - sets write_mostly 1829 * -writemostly - clears write_mostly 1830 */ 1831 int err = -EINVAL; 1832 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1833 md_error(rdev->mddev, rdev); 1834 err = 0; 1835 } else if (cmd_match(buf, "remove")) { 1836 if (rdev->raid_disk >= 0) 1837 err = -EBUSY; 1838 else { 1839 mddev_t *mddev = rdev->mddev; 1840 kick_rdev_from_array(rdev); 1841 if (mddev->pers) 1842 md_update_sb(mddev, 1); 1843 md_new_event(mddev); 1844 err = 0; 1845 } 1846 } else if (cmd_match(buf, "writemostly")) { 1847 set_bit(WriteMostly, &rdev->flags); 1848 err = 0; 1849 } else if (cmd_match(buf, "-writemostly")) { 1850 clear_bit(WriteMostly, &rdev->flags); 1851 err = 0; 1852 } 1853 return err ? 
err : len; 1854 } 1855 static struct rdev_sysfs_entry rdev_state = 1856 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1857 1858 static ssize_t 1859 super_show(mdk_rdev_t *rdev, char *page) 1860 { 1861 if (rdev->sb_loaded && rdev->sb_size) { 1862 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1863 return rdev->sb_size; 1864 } else 1865 return 0; 1866 } 1867 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1868 1869 static ssize_t 1870 errors_show(mdk_rdev_t *rdev, char *page) 1871 { 1872 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1873 } 1874 1875 static ssize_t 1876 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1877 { 1878 char *e; 1879 unsigned long n = simple_strtoul(buf, &e, 10); 1880 if (*buf && (*e == 0 || *e == '\n')) { 1881 atomic_set(&rdev->corrected_errors, n); 1882 return len; 1883 } 1884 return -EINVAL; 1885 } 1886 static struct rdev_sysfs_entry rdev_errors = 1887 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1888 1889 static ssize_t 1890 slot_show(mdk_rdev_t *rdev, char *page) 1891 { 1892 if (rdev->raid_disk < 0) 1893 return sprintf(page, "none\n"); 1894 else 1895 return sprintf(page, "%d\n", rdev->raid_disk); 1896 } 1897 1898 static ssize_t 1899 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1900 { 1901 char *e; 1902 int slot = simple_strtoul(buf, &e, 10); 1903 if (strncmp(buf, "none", 4)==0) 1904 slot = -1; 1905 else if (e==buf || (*e && *e!= '\n')) 1906 return -EINVAL; 1907 if (rdev->mddev->pers) 1908 /* Cannot set slot in active array (yet) */ 1909 return -EBUSY; 1910 if (slot >= rdev->mddev->raid_disks) 1911 return -ENOSPC; 1912 rdev->raid_disk = slot; 1913 /* assume it is working */ 1914 rdev->flags = 0; 1915 set_bit(In_sync, &rdev->flags); 1916 return len; 1917 } 1918 1919 1920 static struct rdev_sysfs_entry rdev_slot = 1921 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1922 1923 static ssize_t 1924 offset_show(mdk_rdev_t *rdev, char *page) 1925 { 1926 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1927 } 1928 1929 static ssize_t 1930 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1931 { 1932 char *e; 1933 unsigned long long offset = simple_strtoull(buf, &e, 10); 1934 if (e==buf || (*e && *e != '\n')) 1935 return -EINVAL; 1936 if (rdev->mddev->pers) 1937 return -EBUSY; 1938 rdev->data_offset = offset; 1939 return len; 1940 } 1941 1942 static struct rdev_sysfs_entry rdev_offset = 1943 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1944 1945 static ssize_t 1946 rdev_size_show(mdk_rdev_t *rdev, char *page) 1947 { 1948 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1949 } 1950 1951 static ssize_t 1952 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1953 { 1954 char *e; 1955 unsigned long long size = simple_strtoull(buf, &e, 10); 1956 if (e==buf || (*e && *e != '\n')) 1957 return -EINVAL; 1958 if (rdev->mddev->pers) 1959 return -EBUSY; 1960 rdev->size = size; 1961 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1962 rdev->mddev->size = size; 1963 return len; 1964 } 1965 1966 static struct rdev_sysfs_entry rdev_size = 1967 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1968 1969 static struct attribute *rdev_default_attrs[] = { 1970 &rdev_state.attr, 1971 &rdev_super.attr, 1972 &rdev_errors.attr, 1973 &rdev_slot.attr, 1974 &rdev_offset.attr, 1975 &rdev_size.attr, 1976 NULL, 1977 }; 1978 static ssize_t 1979 rdev_attr_show(struct kobject *kobj, struct attribute *attr, 
char *page) 1980 { 1981 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1982 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1983 1984 if (!entry->show) 1985 return -EIO; 1986 return entry->show(rdev, page); 1987 } 1988 1989 static ssize_t 1990 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1991 const char *page, size_t length) 1992 { 1993 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1994 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1995 1996 if (!entry->store) 1997 return -EIO; 1998 if (!capable(CAP_SYS_ADMIN)) 1999 return -EACCES; 2000 return entry->store(rdev, page, length); 2001 } 2002 2003 static void rdev_free(struct kobject *ko) 2004 { 2005 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2006 kfree(rdev); 2007 } 2008 static struct sysfs_ops rdev_sysfs_ops = { 2009 .show = rdev_attr_show, 2010 .store = rdev_attr_store, 2011 }; 2012 static struct kobj_type rdev_ktype = { 2013 .release = rdev_free, 2014 .sysfs_ops = &rdev_sysfs_ops, 2015 .default_attrs = rdev_default_attrs, 2016 }; 2017 2018 /* 2019 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2020 * 2021 * mark the device faulty if: 2022 * 2023 * - the device is nonexistent (zero size) 2024 * - the device has no valid superblock 2025 * 2026 * a faulty rdev _never_ has rdev->sb set. 2027 */ 2028 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2029 { 2030 char b[BDEVNAME_SIZE]; 2031 int err; 2032 mdk_rdev_t *rdev; 2033 sector_t size; 2034 2035 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2036 if (!rdev) { 2037 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2038 return ERR_PTR(-ENOMEM); 2039 } 2040 2041 if ((err = alloc_disk_sb(rdev))) 2042 goto abort_free; 2043 2044 err = lock_rdev(rdev, newdev); 2045 if (err) 2046 goto abort_free; 2047 2048 rdev->kobj.parent = NULL; 2049 rdev->kobj.ktype = &rdev_ktype; 2050 kobject_init(&rdev->kobj); 2051 2052 rdev->desc_nr = -1; 2053 rdev->saved_raid_disk = -1; 2054 rdev->raid_disk = -1; 2055 rdev->flags = 0; 2056 rdev->data_offset = 0; 2057 rdev->sb_events = 0; 2058 atomic_set(&rdev->nr_pending, 0); 2059 atomic_set(&rdev->read_errors, 0); 2060 atomic_set(&rdev->corrected_errors, 0); 2061 2062 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2063 if (!size) { 2064 printk(KERN_WARNING 2065 "md: %s has zero or unknown size, marking faulty!\n", 2066 bdevname(rdev->bdev,b)); 2067 err = -EINVAL; 2068 goto abort_free; 2069 } 2070 2071 if (super_format >= 0) { 2072 err = super_types[super_format]. 
2073 load_super(rdev, NULL, super_minor); 2074 if (err == -EINVAL) { 2075 printk(KERN_WARNING 2076 "md: %s does not have a valid v%d.%d " 2077 "superblock, not importing!\n", 2078 bdevname(rdev->bdev,b), 2079 super_format, super_minor); 2080 goto abort_free; 2081 } 2082 if (err < 0) { 2083 printk(KERN_WARNING 2084 "md: could not read %s's sb, not importing!\n", 2085 bdevname(rdev->bdev,b)); 2086 goto abort_free; 2087 } 2088 } 2089 INIT_LIST_HEAD(&rdev->same_set); 2090 2091 return rdev; 2092 2093 abort_free: 2094 if (rdev->sb_page) { 2095 if (rdev->bdev) 2096 unlock_rdev(rdev); 2097 free_disk_sb(rdev); 2098 } 2099 kfree(rdev); 2100 return ERR_PTR(err); 2101 } 2102 2103 /* 2104 * Check a full RAID array for plausibility 2105 */ 2106 2107 2108 static void analyze_sbs(mddev_t * mddev) 2109 { 2110 int i; 2111 struct list_head *tmp; 2112 mdk_rdev_t *rdev, *freshest; 2113 char b[BDEVNAME_SIZE]; 2114 2115 freshest = NULL; 2116 ITERATE_RDEV(mddev,rdev,tmp) 2117 switch (super_types[mddev->major_version]. 2118 load_super(rdev, freshest, mddev->minor_version)) { 2119 case 1: 2120 freshest = rdev; 2121 break; 2122 case 0: 2123 break; 2124 default: 2125 printk( KERN_ERR \ 2126 "md: fatal superblock inconsistency in %s" 2127 " -- removing from array\n", 2128 bdevname(rdev->bdev,b)); 2129 kick_rdev_from_array(rdev); 2130 } 2131 2132 2133 super_types[mddev->major_version]. 2134 validate_super(mddev, freshest); 2135 2136 i = 0; 2137 ITERATE_RDEV(mddev,rdev,tmp) { 2138 if (rdev != freshest) 2139 if (super_types[mddev->major_version]. 2140 validate_super(mddev, rdev)) { 2141 printk(KERN_WARNING "md: kicking non-fresh %s" 2142 " from array!\n", 2143 bdevname(rdev->bdev,b)); 2144 kick_rdev_from_array(rdev); 2145 continue; 2146 } 2147 if (mddev->level == LEVEL_MULTIPATH) { 2148 rdev->desc_nr = i++; 2149 rdev->raid_disk = rdev->desc_nr; 2150 set_bit(In_sync, &rdev->flags); 2151 } else if (rdev->raid_disk >= mddev->raid_disks) { 2152 rdev->raid_disk = -1; 2153 clear_bit(In_sync, &rdev->flags); 2154 } 2155 } 2156 2157 2158 2159 if (mddev->recovery_cp != MaxSector && 2160 mddev->level >= 1) 2161 printk(KERN_ERR "md: %s: raid array is not clean" 2162 " -- starting background reconstruction\n", 2163 mdname(mddev)); 2164 2165 } 2166 2167 static ssize_t 2168 safe_delay_show(mddev_t *mddev, char *page) 2169 { 2170 int msec = (mddev->safemode_delay*1000)/HZ; 2171 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2172 } 2173 static ssize_t 2174 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2175 { 2176 int scale=1; 2177 int dot=0; 2178 int i; 2179 unsigned long msec; 2180 char buf[30]; 2181 char *e; 2182 /* remove a period, and count digits after it */ 2183 if (len >= sizeof(buf)) 2184 return -EINVAL; 2185 strlcpy(buf, cbuf, len); 2186 buf[len] = 0; 2187 for (i=0; i<len; i++) { 2188 if (dot) { 2189 if (isdigit(buf[i])) { 2190 buf[i-1] = buf[i]; 2191 scale *= 10; 2192 } 2193 buf[i] = 0; 2194 } else if (buf[i] == '.') { 2195 dot=1; 2196 buf[i] = 0; 2197 } 2198 } 2199 msec = simple_strtoul(buf, &e, 10); 2200 if (e == buf || (*e && *e != '\n')) 2201 return -EINVAL; 2202 msec = (msec * 1000) / scale; 2203 if (msec == 0) 2204 mddev->safemode_delay = 0; 2205 else { 2206 mddev->safemode_delay = (msec*HZ)/1000; 2207 if (mddev->safemode_delay == 0) 2208 mddev->safemode_delay = 1; 2209 } 2210 return len; 2211 } 2212 static struct md_sysfs_entry md_safe_delay = 2213 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2214 2215 static ssize_t 2216 level_show(mddev_t *mddev, char *page) 
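/* (usage note: 'level' reports the running personality; for an array that
 * has not been started yet it can be written to request a personality for
 * the subsequent run, e.g. (illustrative device name)
 *     echo raid1 > /sys/block/md0/md/level
 * level_store() below only records the request, it does not convert a
 * live array.) */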
2217 { 2218 struct mdk_personality *p = mddev->pers; 2219 if (p) 2220 return sprintf(page, "%s\n", p->name); 2221 else if (mddev->clevel[0]) 2222 return sprintf(page, "%s\n", mddev->clevel); 2223 else if (mddev->level != LEVEL_NONE) 2224 return sprintf(page, "%d\n", mddev->level); 2225 else 2226 return 0; 2227 } 2228 2229 static ssize_t 2230 level_store(mddev_t *mddev, const char *buf, size_t len) 2231 { 2232 int rv = len; 2233 if (mddev->pers) 2234 return -EBUSY; 2235 if (len == 0) 2236 return 0; 2237 if (len >= sizeof(mddev->clevel)) 2238 return -ENOSPC; 2239 strncpy(mddev->clevel, buf, len); 2240 if (mddev->clevel[len-1] == '\n') 2241 len--; 2242 mddev->clevel[len] = 0; 2243 mddev->level = LEVEL_NONE; 2244 return rv; 2245 } 2246 2247 static struct md_sysfs_entry md_level = 2248 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2249 2250 2251 static ssize_t 2252 layout_show(mddev_t *mddev, char *page) 2253 { 2254 /* just a number, not meaningful for all levels */ 2255 if (mddev->reshape_position != MaxSector && 2256 mddev->layout != mddev->new_layout) 2257 return sprintf(page, "%d (%d)\n", 2258 mddev->new_layout, mddev->layout); 2259 return sprintf(page, "%d\n", mddev->layout); 2260 } 2261 2262 static ssize_t 2263 layout_store(mddev_t *mddev, const char *buf, size_t len) 2264 { 2265 char *e; 2266 unsigned long n = simple_strtoul(buf, &e, 10); 2267 2268 if (!*buf || (*e && *e != '\n')) 2269 return -EINVAL; 2270 2271 if (mddev->pers) 2272 return -EBUSY; 2273 if (mddev->reshape_position != MaxSector) 2274 mddev->new_layout = n; 2275 else 2276 mddev->layout = n; 2277 return len; 2278 } 2279 static struct md_sysfs_entry md_layout = 2280 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2281 2282 2283 static ssize_t 2284 raid_disks_show(mddev_t *mddev, char *page) 2285 { 2286 if (mddev->raid_disks == 0) 2287 return 0; 2288 if (mddev->reshape_position != MaxSector && 2289 mddev->delta_disks != 0) 2290 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2291 mddev->raid_disks - mddev->delta_disks); 2292 return sprintf(page, "%d\n", mddev->raid_disks); 2293 } 2294 2295 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2296 2297 static ssize_t 2298 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2299 { 2300 char *e; 2301 int rv = 0; 2302 unsigned long n = simple_strtoul(buf, &e, 10); 2303 2304 if (!*buf || (*e && *e != '\n')) 2305 return -EINVAL; 2306 2307 if (mddev->pers) 2308 rv = update_raid_disks(mddev, n); 2309 else if (mddev->reshape_position != MaxSector) { 2310 int olddisks = mddev->raid_disks - mddev->delta_disks; 2311 mddev->delta_disks = n - olddisks; 2312 mddev->raid_disks = n; 2313 } else 2314 mddev->raid_disks = n; 2315 return rv ? 
rv : len; 2316 } 2317 static struct md_sysfs_entry md_raid_disks = 2318 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2319 2320 static ssize_t 2321 chunk_size_show(mddev_t *mddev, char *page) 2322 { 2323 if (mddev->reshape_position != MaxSector && 2324 mddev->chunk_size != mddev->new_chunk) 2325 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2326 mddev->chunk_size); 2327 return sprintf(page, "%d\n", mddev->chunk_size); 2328 } 2329 2330 static ssize_t 2331 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2332 { 2333 /* can only set chunk_size if array is not yet active */ 2334 char *e; 2335 unsigned long n = simple_strtoul(buf, &e, 10); 2336 2337 if (!*buf || (*e && *e != '\n')) 2338 return -EINVAL; 2339 2340 if (mddev->pers) 2341 return -EBUSY; 2342 else if (mddev->reshape_position != MaxSector) 2343 mddev->new_chunk = n; 2344 else 2345 mddev->chunk_size = n; 2346 return len; 2347 } 2348 static struct md_sysfs_entry md_chunk_size = 2349 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2350 2351 static ssize_t 2352 resync_start_show(mddev_t *mddev, char *page) 2353 { 2354 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2355 } 2356 2357 static ssize_t 2358 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2359 { 2360 /* can only set chunk_size if array is not yet active */ 2361 char *e; 2362 unsigned long long n = simple_strtoull(buf, &e, 10); 2363 2364 if (mddev->pers) 2365 return -EBUSY; 2366 if (!*buf || (*e && *e != '\n')) 2367 return -EINVAL; 2368 2369 mddev->recovery_cp = n; 2370 return len; 2371 } 2372 static struct md_sysfs_entry md_resync_start = 2373 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2374 2375 /* 2376 * The array state can be: 2377 * 2378 * clear 2379 * No devices, no size, no level 2380 * Equivalent to STOP_ARRAY ioctl 2381 * inactive 2382 * May have some settings, but array is not active 2383 * all IO results in error 2384 * When written, doesn't tear down array, but just stops it 2385 * suspended (not supported yet) 2386 * All IO requests will block. The array can be reconfigured. 2387 * Writing this, if accepted, will block until array is quiessent 2388 * readonly 2389 * no resync can happen. no superblocks get written. 2390 * write requests fail 2391 * read-auto 2392 * like readonly, but behaves like 'clean' on a write request. 2393 * 2394 * clean - no pending writes, but otherwise active. 2395 * When written to inactive array, starts without resync 2396 * If a write request arrives then 2397 * if metadata is known, mark 'dirty' and switch to 'active'. 2398 * if not known, block and switch to write-pending 2399 * If written to an active array that has pending writes, then fails. 2400 * active 2401 * fully active: IO and resync can be happening. 2402 * When written to inactive array, starts with resync 2403 * 2404 * write-pending 2405 * clean, but writes are blocked waiting for 'active' to be written. 2406 * 2407 * active-idle 2408 * like active, but no writes have been seen for a while (100msec). 
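 *
 * As an illustrative usage note (the device name is just an example), the
 * current state can be read with
 *     cat /sys/block/md0/md/array_state
 * and an assembled but inactive array can be started read-only with
 *     echo readonly > /sys/block/md0/md/array_state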
2409 * 2410 */ 2411 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2412 write_pending, active_idle, bad_word}; 2413 static char *array_states[] = { 2414 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2415 "write-pending", "active-idle", NULL }; 2416 2417 static int match_word(const char *word, char **list) 2418 { 2419 int n; 2420 for (n=0; list[n]; n++) 2421 if (cmd_match(word, list[n])) 2422 break; 2423 return n; 2424 } 2425 2426 static ssize_t 2427 array_state_show(mddev_t *mddev, char *page) 2428 { 2429 enum array_state st = inactive; 2430 2431 if (mddev->pers) 2432 switch(mddev->ro) { 2433 case 1: 2434 st = readonly; 2435 break; 2436 case 2: 2437 st = read_auto; 2438 break; 2439 case 0: 2440 if (mddev->in_sync) 2441 st = clean; 2442 else if (mddev->safemode) 2443 st = active_idle; 2444 else 2445 st = active; 2446 } 2447 else { 2448 if (list_empty(&mddev->disks) && 2449 mddev->raid_disks == 0 && 2450 mddev->size == 0) 2451 st = clear; 2452 else 2453 st = inactive; 2454 } 2455 return sprintf(page, "%s\n", array_states[st]); 2456 } 2457 2458 static int do_md_stop(mddev_t * mddev, int ro); 2459 static int do_md_run(mddev_t * mddev); 2460 static int restart_array(mddev_t *mddev); 2461 2462 static ssize_t 2463 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2464 { 2465 int err = -EINVAL; 2466 enum array_state st = match_word(buf, array_states); 2467 switch(st) { 2468 case bad_word: 2469 break; 2470 case clear: 2471 /* stopping an active array */ 2472 if (mddev->pers) { 2473 if (atomic_read(&mddev->active) > 1) 2474 return -EBUSY; 2475 err = do_md_stop(mddev, 0); 2476 } 2477 break; 2478 case inactive: 2479 /* stopping an active array */ 2480 if (mddev->pers) { 2481 if (atomic_read(&mddev->active) > 1) 2482 return -EBUSY; 2483 err = do_md_stop(mddev, 2); 2484 } 2485 break; 2486 case suspended: 2487 break; /* not supported yet */ 2488 case readonly: 2489 if (mddev->pers) 2490 err = do_md_stop(mddev, 1); 2491 else { 2492 mddev->ro = 1; 2493 err = do_md_run(mddev); 2494 } 2495 break; 2496 case read_auto: 2497 /* stopping an active array */ 2498 if (mddev->pers) { 2499 err = do_md_stop(mddev, 1); 2500 if (err == 0) 2501 mddev->ro = 2; /* FIXME mark devices writable */ 2502 } else { 2503 mddev->ro = 2; 2504 err = do_md_run(mddev); 2505 } 2506 break; 2507 case clean: 2508 if (mddev->pers) { 2509 restart_array(mddev); 2510 spin_lock_irq(&mddev->write_lock); 2511 if (atomic_read(&mddev->writes_pending) == 0) { 2512 mddev->in_sync = 1; 2513 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2514 } 2515 spin_unlock_irq(&mddev->write_lock); 2516 } else { 2517 mddev->ro = 0; 2518 mddev->recovery_cp = MaxSector; 2519 err = do_md_run(mddev); 2520 } 2521 break; 2522 case active: 2523 if (mddev->pers) { 2524 restart_array(mddev); 2525 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2526 wake_up(&mddev->sb_wait); 2527 err = 0; 2528 } else { 2529 mddev->ro = 0; 2530 err = do_md_run(mddev); 2531 } 2532 break; 2533 case write_pending: 2534 case active_idle: 2535 /* these cannot be set */ 2536 break; 2537 } 2538 if (err) 2539 return err; 2540 else 2541 return len; 2542 } 2543 static struct md_sysfs_entry md_array_state = 2544 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2545 2546 static ssize_t 2547 null_show(mddev_t *mddev, char *page) 2548 { 2549 return -EINVAL; 2550 } 2551 2552 static ssize_t 2553 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2554 { 2555 /* buf must be %d:%d\n? 
giving major and minor numbers */ 2556 /* The new device is added to the array. 2557 * If the array has a persistent superblock, we read the 2558 * superblock to initialise info and check validity. 2559 * Otherwise, only checking done is that in bind_rdev_to_array, 2560 * which mainly checks size. 2561 */ 2562 char *e; 2563 int major = simple_strtoul(buf, &e, 10); 2564 int minor; 2565 dev_t dev; 2566 mdk_rdev_t *rdev; 2567 int err; 2568 2569 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2570 return -EINVAL; 2571 minor = simple_strtoul(e+1, &e, 10); 2572 if (*e && *e != '\n') 2573 return -EINVAL; 2574 dev = MKDEV(major, minor); 2575 if (major != MAJOR(dev) || 2576 minor != MINOR(dev)) 2577 return -EOVERFLOW; 2578 2579 2580 if (mddev->persistent) { 2581 rdev = md_import_device(dev, mddev->major_version, 2582 mddev->minor_version); 2583 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2584 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2585 mdk_rdev_t, same_set); 2586 err = super_types[mddev->major_version] 2587 .load_super(rdev, rdev0, mddev->minor_version); 2588 if (err < 0) 2589 goto out; 2590 } 2591 } else 2592 rdev = md_import_device(dev, -1, -1); 2593 2594 if (IS_ERR(rdev)) 2595 return PTR_ERR(rdev); 2596 err = bind_rdev_to_array(rdev, mddev); 2597 out: 2598 if (err) 2599 export_rdev(rdev); 2600 return err ? err : len; 2601 } 2602 2603 static struct md_sysfs_entry md_new_device = 2604 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2605 2606 static ssize_t 2607 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2608 { 2609 char *end; 2610 unsigned long chunk, end_chunk; 2611 2612 if (!mddev->bitmap) 2613 goto out; 2614 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 2615 while (*buf) { 2616 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2617 if (buf == end) break; 2618 if (*end == '-') { /* range */ 2619 buf = end + 1; 2620 end_chunk = simple_strtoul(buf, &end, 0); 2621 if (buf == end) break; 2622 } 2623 if (*end && !isspace(*end)) break; 2624 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2625 buf = end; 2626 while (isspace(*buf)) buf++; 2627 } 2628 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2629 out: 2630 return len; 2631 } 2632 2633 static struct md_sysfs_entry md_bitmap = 2634 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2635 2636 static ssize_t 2637 size_show(mddev_t *mddev, char *page) 2638 { 2639 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2640 } 2641 2642 static int update_size(mddev_t *mddev, unsigned long size); 2643 2644 static ssize_t 2645 size_store(mddev_t *mddev, const char *buf, size_t len) 2646 { 2647 /* If array is inactive, we can reduce the component size, but 2648 * not increase it (except from 0). 2649 * If array is active, we can try an on-line resize 2650 */ 2651 char *e; 2652 int err = 0; 2653 unsigned long long size = simple_strtoull(buf, &e, 10); 2654 if (!*buf || *buf == '\n' || 2655 (*e && *e != '\n')) 2656 return -EINVAL; 2657 2658 if (mddev->pers) { 2659 err = update_size(mddev, size); 2660 md_update_sb(mddev, 1); 2661 } else { 2662 if (mddev->size == 0 || 2663 mddev->size > size) 2664 mddev->size = size; 2665 else 2666 err = -ENOSPC; 2667 } 2668 return err ? err : len; 2669 } 2670 2671 static struct md_sysfs_entry md_size = 2672 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2673 2674 2675 /* Metdata version. 
2676 * This is either 'none' for arrays with externally managed metadata, 2677 * or N.M for internally known formats 2678 */ 2679 static ssize_t 2680 metadata_show(mddev_t *mddev, char *page) 2681 { 2682 if (mddev->persistent) 2683 return sprintf(page, "%d.%d\n", 2684 mddev->major_version, mddev->minor_version); 2685 else 2686 return sprintf(page, "none\n"); 2687 } 2688 2689 static ssize_t 2690 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2691 { 2692 int major, minor; 2693 char *e; 2694 if (!list_empty(&mddev->disks)) 2695 return -EBUSY; 2696 2697 if (cmd_match(buf, "none")) { 2698 mddev->persistent = 0; 2699 mddev->major_version = 0; 2700 mddev->minor_version = 90; 2701 return len; 2702 } 2703 major = simple_strtoul(buf, &e, 10); 2704 if (e==buf || *e != '.') 2705 return -EINVAL; 2706 buf = e+1; 2707 minor = simple_strtoul(buf, &e, 10); 2708 if (e==buf || (*e && *e != '\n') ) 2709 return -EINVAL; 2710 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 2711 return -ENOENT; 2712 mddev->major_version = major; 2713 mddev->minor_version = minor; 2714 mddev->persistent = 1; 2715 return len; 2716 } 2717 2718 static struct md_sysfs_entry md_metadata = 2719 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2720 2721 static ssize_t 2722 action_show(mddev_t *mddev, char *page) 2723 { 2724 char *type = "idle"; 2725 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2726 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2727 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2728 type = "reshape"; 2729 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2730 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2731 type = "resync"; 2732 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2733 type = "check"; 2734 else 2735 type = "repair"; 2736 } else 2737 type = "recover"; 2738 } 2739 return sprintf(page, "%s\n", type); 2740 } 2741 2742 static ssize_t 2743 action_store(mddev_t *mddev, const char *page, size_t len) 2744 { 2745 if (!mddev->pers || !mddev->pers->sync_request) 2746 return -EINVAL; 2747 2748 if (cmd_match(page, "idle")) { 2749 if (mddev->sync_thread) { 2750 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2751 md_unregister_thread(mddev->sync_thread); 2752 mddev->sync_thread = NULL; 2753 mddev->recovery = 0; 2754 } 2755 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2756 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2757 return -EBUSY; 2758 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2759 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2760 else if (cmd_match(page, "reshape")) { 2761 int err; 2762 if (mddev->pers->start_reshape == NULL) 2763 return -EINVAL; 2764 err = mddev->pers->start_reshape(mddev); 2765 if (err) 2766 return err; 2767 } else { 2768 if (cmd_match(page, "check")) 2769 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2770 else if (!cmd_match(page, "repair")) 2771 return -EINVAL; 2772 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2773 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2774 } 2775 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2776 md_wakeup_thread(mddev->thread); 2777 return len; 2778 } 2779 2780 static ssize_t 2781 mismatch_cnt_show(mddev_t *mddev, char *page) 2782 { 2783 return sprintf(page, "%llu\n", 2784 (unsigned long long) mddev->resync_mismatches); 2785 } 2786 2787 static struct md_sysfs_entry md_scan_mode = 2788 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2789 2790 2791 static struct md_sysfs_entry md_mismatches = 
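/* read-only: exposes mddev->resync_mismatches, roughly the number of
 * sectors that the last 'check' or 'repair' pass found to be inconsistent */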
__ATTR_RO(mismatch_cnt); 2792 2793 static ssize_t 2794 sync_min_show(mddev_t *mddev, char *page) 2795 { 2796 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2797 mddev->sync_speed_min ? "local": "system"); 2798 } 2799 2800 static ssize_t 2801 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2802 { 2803 int min; 2804 char *e; 2805 if (strncmp(buf, "system", 6)==0) { 2806 mddev->sync_speed_min = 0; 2807 return len; 2808 } 2809 min = simple_strtoul(buf, &e, 10); 2810 if (buf == e || (*e && *e != '\n') || min <= 0) 2811 return -EINVAL; 2812 mddev->sync_speed_min = min; 2813 return len; 2814 } 2815 2816 static struct md_sysfs_entry md_sync_min = 2817 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2818 2819 static ssize_t 2820 sync_max_show(mddev_t *mddev, char *page) 2821 { 2822 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2823 mddev->sync_speed_max ? "local": "system"); 2824 } 2825 2826 static ssize_t 2827 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2828 { 2829 int max; 2830 char *e; 2831 if (strncmp(buf, "system", 6)==0) { 2832 mddev->sync_speed_max = 0; 2833 return len; 2834 } 2835 max = simple_strtoul(buf, &e, 10); 2836 if (buf == e || (*e && *e != '\n') || max <= 0) 2837 return -EINVAL; 2838 mddev->sync_speed_max = max; 2839 return len; 2840 } 2841 2842 static struct md_sysfs_entry md_sync_max = 2843 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2844 2845 2846 static ssize_t 2847 sync_speed_show(mddev_t *mddev, char *page) 2848 { 2849 unsigned long resync, dt, db; 2850 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2851 dt = ((jiffies - mddev->resync_mark) / HZ); 2852 if (!dt) dt++; 2853 db = resync - (mddev->resync_mark_cnt); 2854 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2855 } 2856 2857 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2858 2859 static ssize_t 2860 sync_completed_show(mddev_t *mddev, char *page) 2861 { 2862 unsigned long max_blocks, resync; 2863 2864 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2865 max_blocks = mddev->resync_max_sectors; 2866 else 2867 max_blocks = mddev->size << 1; 2868 2869 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2870 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2871 } 2872 2873 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2874 2875 static ssize_t 2876 suspend_lo_show(mddev_t *mddev, char *page) 2877 { 2878 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2879 } 2880 2881 static ssize_t 2882 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2883 { 2884 char *e; 2885 unsigned long long new = simple_strtoull(buf, &e, 10); 2886 2887 if (mddev->pers->quiesce == NULL) 2888 return -EINVAL; 2889 if (buf == e || (*e && *e != '\n')) 2890 return -EINVAL; 2891 if (new >= mddev->suspend_hi || 2892 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2893 mddev->suspend_lo = new; 2894 mddev->pers->quiesce(mddev, 2); 2895 return len; 2896 } else 2897 return -EINVAL; 2898 } 2899 static struct md_sysfs_entry md_suspend_lo = 2900 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2901 2902 2903 static ssize_t 2904 suspend_hi_show(mddev_t *mddev, char *page) 2905 { 2906 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2907 } 2908 2909 static ssize_t 2910 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2911 { 2912 char *e; 2913 unsigned long long new = 
simple_strtoull(buf, &e, 10); 2914 2915 if (mddev->pers->quiesce == NULL) 2916 return -EINVAL; 2917 if (buf == e || (*e && *e != '\n')) 2918 return -EINVAL; 2919 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2920 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2921 mddev->suspend_hi = new; 2922 mddev->pers->quiesce(mddev, 1); 2923 mddev->pers->quiesce(mddev, 0); 2924 return len; 2925 } else 2926 return -EINVAL; 2927 } 2928 static struct md_sysfs_entry md_suspend_hi = 2929 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2930 2931 static ssize_t 2932 reshape_position_show(mddev_t *mddev, char *page) 2933 { 2934 if (mddev->reshape_position != MaxSector) 2935 return sprintf(page, "%llu\n", 2936 (unsigned long long)mddev->reshape_position); 2937 strcpy(page, "none\n"); 2938 return 5; 2939 } 2940 2941 static ssize_t 2942 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 2943 { 2944 char *e; 2945 unsigned long long new = simple_strtoull(buf, &e, 10); 2946 if (mddev->pers) 2947 return -EBUSY; 2948 if (buf == e || (*e && *e != '\n')) 2949 return -EINVAL; 2950 mddev->reshape_position = new; 2951 mddev->delta_disks = 0; 2952 mddev->new_level = mddev->level; 2953 mddev->new_layout = mddev->layout; 2954 mddev->new_chunk = mddev->chunk_size; 2955 return len; 2956 } 2957 2958 static struct md_sysfs_entry md_reshape_position = 2959 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 2960 reshape_position_store); 2961 2962 2963 static struct attribute *md_default_attrs[] = { 2964 &md_level.attr, 2965 &md_layout.attr, 2966 &md_raid_disks.attr, 2967 &md_chunk_size.attr, 2968 &md_size.attr, 2969 &md_resync_start.attr, 2970 &md_metadata.attr, 2971 &md_new_device.attr, 2972 &md_safe_delay.attr, 2973 &md_array_state.attr, 2974 &md_reshape_position.attr, 2975 NULL, 2976 }; 2977 2978 static struct attribute *md_redundancy_attrs[] = { 2979 &md_scan_mode.attr, 2980 &md_mismatches.attr, 2981 &md_sync_min.attr, 2982 &md_sync_max.attr, 2983 &md_sync_speed.attr, 2984 &md_sync_completed.attr, 2985 &md_suspend_lo.attr, 2986 &md_suspend_hi.attr, 2987 &md_bitmap.attr, 2988 NULL, 2989 }; 2990 static struct attribute_group md_redundancy_group = { 2991 .name = NULL, 2992 .attrs = md_redundancy_attrs, 2993 }; 2994 2995 2996 static ssize_t 2997 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2998 { 2999 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3000 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3001 ssize_t rv; 3002 3003 if (!entry->show) 3004 return -EIO; 3005 rv = mddev_lock(mddev); 3006 if (!rv) { 3007 rv = entry->show(mddev, page); 3008 mddev_unlock(mddev); 3009 } 3010 return rv; 3011 } 3012 3013 static ssize_t 3014 md_attr_store(struct kobject *kobj, struct attribute *attr, 3015 const char *page, size_t length) 3016 { 3017 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3018 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3019 ssize_t rv; 3020 3021 if (!entry->store) 3022 return -EIO; 3023 if (!capable(CAP_SYS_ADMIN)) 3024 return -EACCES; 3025 rv = mddev_lock(mddev); 3026 if (!rv) { 3027 rv = entry->store(mddev, page, length); 3028 mddev_unlock(mddev); 3029 } 3030 return rv; 3031 } 3032 3033 static void md_free(struct kobject *ko) 3034 { 3035 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3036 kfree(mddev); 3037 } 3038 3039 static struct sysfs_ops md_sysfs_ops = { 3040 .show = md_attr_show, 3041 .store = 
md_attr_store, 3042 }; 3043 static struct kobj_type md_ktype = { 3044 .release = md_free, 3045 .sysfs_ops = &md_sysfs_ops, 3046 .default_attrs = md_default_attrs, 3047 }; 3048 3049 int mdp_major = 0; 3050 3051 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3052 { 3053 static DEFINE_MUTEX(disks_mutex); 3054 mddev_t *mddev = mddev_find(dev); 3055 struct gendisk *disk; 3056 int partitioned = (MAJOR(dev) != MD_MAJOR); 3057 int shift = partitioned ? MdpMinorShift : 0; 3058 int unit = MINOR(dev) >> shift; 3059 3060 if (!mddev) 3061 return NULL; 3062 3063 mutex_lock(&disks_mutex); 3064 if (mddev->gendisk) { 3065 mutex_unlock(&disks_mutex); 3066 mddev_put(mddev); 3067 return NULL; 3068 } 3069 disk = alloc_disk(1 << shift); 3070 if (!disk) { 3071 mutex_unlock(&disks_mutex); 3072 mddev_put(mddev); 3073 return NULL; 3074 } 3075 disk->major = MAJOR(dev); 3076 disk->first_minor = unit << shift; 3077 if (partitioned) 3078 sprintf(disk->disk_name, "md_d%d", unit); 3079 else 3080 sprintf(disk->disk_name, "md%d", unit); 3081 disk->fops = &md_fops; 3082 disk->private_data = mddev; 3083 disk->queue = mddev->queue; 3084 add_disk(disk); 3085 mddev->gendisk = disk; 3086 mutex_unlock(&disks_mutex); 3087 mddev->kobj.parent = &disk->kobj; 3088 mddev->kobj.k_name = NULL; 3089 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 3090 mddev->kobj.ktype = &md_ktype; 3091 if (kobject_register(&mddev->kobj)) 3092 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3093 disk->disk_name); 3094 return NULL; 3095 } 3096 3097 static void md_safemode_timeout(unsigned long data) 3098 { 3099 mddev_t *mddev = (mddev_t *) data; 3100 3101 mddev->safemode = 1; 3102 md_wakeup_thread(mddev->thread); 3103 } 3104 3105 static int start_dirty_degraded; 3106 3107 static int do_md_run(mddev_t * mddev) 3108 { 3109 int err; 3110 int chunk_size; 3111 struct list_head *tmp; 3112 mdk_rdev_t *rdev; 3113 struct gendisk *disk; 3114 struct mdk_personality *pers; 3115 char b[BDEVNAME_SIZE]; 3116 3117 if (list_empty(&mddev->disks)) 3118 /* cannot run an array with no devices.. 
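 * (do_md_run() is the common start-up path; e.g. the RUN_ARRAY ioctl and
 * writes to the sysfs 'array_state' attribute both end up here)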
*/ 3119 return -EINVAL; 3120 3121 if (mddev->pers) 3122 return -EBUSY; 3123 3124 /* 3125 * Analyze all RAID superblock(s) 3126 */ 3127 if (!mddev->raid_disks) 3128 analyze_sbs(mddev); 3129 3130 chunk_size = mddev->chunk_size; 3131 3132 if (chunk_size) { 3133 if (chunk_size > MAX_CHUNK_SIZE) { 3134 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3135 chunk_size, MAX_CHUNK_SIZE); 3136 return -EINVAL; 3137 } 3138 /* 3139 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3140 */ 3141 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3142 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3143 return -EINVAL; 3144 } 3145 if (chunk_size < PAGE_SIZE) { 3146 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3147 chunk_size, PAGE_SIZE); 3148 return -EINVAL; 3149 } 3150 3151 /* devices must have minimum size of one chunk */ 3152 ITERATE_RDEV(mddev,rdev,tmp) { 3153 if (test_bit(Faulty, &rdev->flags)) 3154 continue; 3155 if (rdev->size < chunk_size / 1024) { 3156 printk(KERN_WARNING 3157 "md: Dev %s smaller than chunk_size:" 3158 " %lluk < %dk\n", 3159 bdevname(rdev->bdev,b), 3160 (unsigned long long)rdev->size, 3161 chunk_size / 1024); 3162 return -EINVAL; 3163 } 3164 } 3165 } 3166 3167 #ifdef CONFIG_KMOD 3168 if (mddev->level != LEVEL_NONE) 3169 request_module("md-level-%d", mddev->level); 3170 else if (mddev->clevel[0]) 3171 request_module("md-%s", mddev->clevel); 3172 #endif 3173 3174 /* 3175 * Drop all container device buffers, from now on 3176 * the only valid external interface is through the md 3177 * device. 3178 */ 3179 ITERATE_RDEV(mddev,rdev,tmp) { 3180 if (test_bit(Faulty, &rdev->flags)) 3181 continue; 3182 sync_blockdev(rdev->bdev); 3183 invalidate_bdev(rdev->bdev); 3184 3185 /* perform some consistency tests on the device. 3186 * We don't want the data to overlap the metadata, 3187 * Internal Bitmap issues has handled elsewhere. 3188 */ 3189 if (rdev->data_offset < rdev->sb_offset) { 3190 if (mddev->size && 3191 rdev->data_offset + mddev->size*2 3192 > rdev->sb_offset*2) { 3193 printk("md: %s: data overlaps metadata\n", 3194 mdname(mddev)); 3195 return -EINVAL; 3196 } 3197 } else { 3198 if (rdev->sb_offset*2 + rdev->sb_size/512 3199 > rdev->data_offset) { 3200 printk("md: %s: metadata overlaps data\n", 3201 mdname(mddev)); 3202 return -EINVAL; 3203 } 3204 } 3205 } 3206 3207 md_probe(mddev->unit, NULL, NULL); 3208 disk = mddev->gendisk; 3209 if (!disk) 3210 return -ENOMEM; 3211 3212 spin_lock(&pers_lock); 3213 pers = find_pers(mddev->level, mddev->clevel); 3214 if (!pers || !try_module_get(pers->owner)) { 3215 spin_unlock(&pers_lock); 3216 if (mddev->level != LEVEL_NONE) 3217 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3218 mddev->level); 3219 else 3220 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3221 mddev->clevel); 3222 return -EINVAL; 3223 } 3224 mddev->pers = pers; 3225 spin_unlock(&pers_lock); 3226 mddev->level = pers->level; 3227 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3228 3229 if (mddev->reshape_position != MaxSector && 3230 pers->start_reshape == NULL) { 3231 /* This personality cannot handle reshaping... */ 3232 mddev->pers = NULL; 3233 module_put(pers->owner); 3234 return -EINVAL; 3235 } 3236 3237 if (pers->sync_request) { 3238 /* Warn if this is a potentially silly 3239 * configuration. 
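 * (typically two members of the same redundant array sitting on
 * partitions of one physical disk, so a single disk failure can still
 * take out multiple array members)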
3240 */ 3241 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3242 mdk_rdev_t *rdev2; 3243 struct list_head *tmp2; 3244 int warned = 0; 3245 ITERATE_RDEV(mddev, rdev, tmp) { 3246 ITERATE_RDEV(mddev, rdev2, tmp2) { 3247 if (rdev < rdev2 && 3248 rdev->bdev->bd_contains == 3249 rdev2->bdev->bd_contains) { 3250 printk(KERN_WARNING 3251 "%s: WARNING: %s appears to be" 3252 " on the same physical disk as" 3253 " %s.\n", 3254 mdname(mddev), 3255 bdevname(rdev->bdev,b), 3256 bdevname(rdev2->bdev,b2)); 3257 warned = 1; 3258 } 3259 } 3260 } 3261 if (warned) 3262 printk(KERN_WARNING 3263 "True protection against single-disk" 3264 " failure might be compromised.\n"); 3265 } 3266 3267 mddev->recovery = 0; 3268 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3269 mddev->barriers_work = 1; 3270 mddev->ok_start_degraded = start_dirty_degraded; 3271 3272 if (start_readonly) 3273 mddev->ro = 2; /* read-only, but switch on first write */ 3274 3275 err = mddev->pers->run(mddev); 3276 if (!err && mddev->pers->sync_request) { 3277 err = bitmap_create(mddev); 3278 if (err) { 3279 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3280 mdname(mddev), err); 3281 mddev->pers->stop(mddev); 3282 } 3283 } 3284 if (err) { 3285 printk(KERN_ERR "md: pers->run() failed ...\n"); 3286 module_put(mddev->pers->owner); 3287 mddev->pers = NULL; 3288 bitmap_destroy(mddev); 3289 return err; 3290 } 3291 if (mddev->pers->sync_request) { 3292 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3293 printk(KERN_WARNING 3294 "md: cannot register extra attributes for %s\n", 3295 mdname(mddev)); 3296 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3297 mddev->ro = 0; 3298 3299 atomic_set(&mddev->writes_pending,0); 3300 mddev->safemode = 0; 3301 mddev->safemode_timer.function = md_safemode_timeout; 3302 mddev->safemode_timer.data = (unsigned long) mddev; 3303 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3304 mddev->in_sync = 1; 3305 3306 ITERATE_RDEV(mddev,rdev,tmp) 3307 if (rdev->raid_disk >= 0) { 3308 char nm[20]; 3309 sprintf(nm, "rd%d", rdev->raid_disk); 3310 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3311 printk("md: cannot register %s for %s\n", 3312 nm, mdname(mddev)); 3313 } 3314 3315 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3316 3317 if (mddev->flags) 3318 md_update_sb(mddev, 0); 3319 3320 set_capacity(disk, mddev->array_size<<1); 3321 3322 /* If we call blk_queue_make_request here, it will 3323 * re-initialise max_sectors etc which may have been 3324 * refined inside -> run. So just set the bits we need to set. 3325 * Most initialisation happended when we called 3326 * blk_queue_make_request(..., md_fail_request) 3327 * earlier. 3328 */ 3329 mddev->queue->queuedata = mddev; 3330 mddev->queue->make_request_fn = mddev->pers->make_request; 3331 3332 /* If there is a partially-recovered drive we need to 3333 * start recovery here. 
If we leave it to md_check_recovery, 3334 * it will remove the drives and not do the right thing 3335 */ 3336 if (mddev->degraded && !mddev->sync_thread) { 3337 struct list_head *rtmp; 3338 int spares = 0; 3339 ITERATE_RDEV(mddev,rdev,rtmp) 3340 if (rdev->raid_disk >= 0 && 3341 !test_bit(In_sync, &rdev->flags) && 3342 !test_bit(Faulty, &rdev->flags)) 3343 /* complete an interrupted recovery */ 3344 spares++; 3345 if (spares && mddev->pers->sync_request) { 3346 mddev->recovery = 0; 3347 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3348 mddev->sync_thread = md_register_thread(md_do_sync, 3349 mddev, 3350 "%s_resync"); 3351 if (!mddev->sync_thread) { 3352 printk(KERN_ERR "%s: could not start resync" 3353 " thread...\n", 3354 mdname(mddev)); 3355 /* leave the spares where they are, it shouldn't hurt */ 3356 mddev->recovery = 0; 3357 } 3358 } 3359 } 3360 md_wakeup_thread(mddev->thread); 3361 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3362 3363 mddev->changed = 1; 3364 md_new_event(mddev); 3365 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3366 return 0; 3367 } 3368 3369 static int restart_array(mddev_t *mddev) 3370 { 3371 struct gendisk *disk = mddev->gendisk; 3372 int err; 3373 3374 /* 3375 * Complain if it has no devices 3376 */ 3377 err = -ENXIO; 3378 if (list_empty(&mddev->disks)) 3379 goto out; 3380 3381 if (mddev->pers) { 3382 err = -EBUSY; 3383 if (!mddev->ro) 3384 goto out; 3385 3386 mddev->safemode = 0; 3387 mddev->ro = 0; 3388 set_disk_ro(disk, 0); 3389 3390 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3391 mdname(mddev)); 3392 /* 3393 * Kick recovery or resync if necessary 3394 */ 3395 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3396 md_wakeup_thread(mddev->thread); 3397 md_wakeup_thread(mddev->sync_thread); 3398 err = 0; 3399 } else 3400 err = -EINVAL; 3401 3402 out: 3403 return err; 3404 } 3405 3406 /* similar to deny_write_access, but accounts for our holding a reference 3407 * to the file ourselves */ 3408 static int deny_bitmap_write_access(struct file * file) 3409 { 3410 struct inode *inode = file->f_mapping->host; 3411 3412 spin_lock(&inode->i_lock); 3413 if (atomic_read(&inode->i_writecount) > 1) { 3414 spin_unlock(&inode->i_lock); 3415 return -ETXTBSY; 3416 } 3417 atomic_set(&inode->i_writecount, -1); 3418 spin_unlock(&inode->i_lock); 3419 3420 return 0; 3421 } 3422 3423 static void restore_bitmap_write_access(struct file *file) 3424 { 3425 struct inode *inode = file->f_mapping->host; 3426 3427 spin_lock(&inode->i_lock); 3428 atomic_set(&inode->i_writecount, 1); 3429 spin_unlock(&inode->i_lock); 3430 } 3431 3432 /* mode: 3433 * 0 - completely stop and dis-assemble array 3434 * 1 - switch to readonly 3435 * 2 - stop but do not disassemble array 3436 */ 3437 static int do_md_stop(mddev_t * mddev, int mode) 3438 { 3439 int err = 0; 3440 struct gendisk *disk = mddev->gendisk; 3441 3442 if (mddev->pers) { 3443 if (atomic_read(&mddev->active)>2) { 3444 printk("md: %s still in use.\n",mdname(mddev)); 3445 return -EBUSY; 3446 } 3447 3448 if (mddev->sync_thread) { 3449 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3450 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3451 md_unregister_thread(mddev->sync_thread); 3452 mddev->sync_thread = NULL; 3453 } 3454 3455 del_timer_sync(&mddev->safemode_timer); 3456 3457 invalidate_partition(disk, 0); 3458 3459 switch(mode) { 3460 case 1: /* readonly */ 3461 err = -ENXIO; 3462 if (mddev->ro==1) 3463 goto out; 3464 mddev->ro = 1; 3465 break; 3466 case 0: /* disassemble */ 3467 case 2: /* stop */ 
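/* modes 0 and 2 share the teardown below; mode 0 additionally
 * disassembles the array further down in this function (member
 * devices are exported and the array geometry is cleared). */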
3468 bitmap_flush(mddev); 3469 md_super_wait(mddev); 3470 if (mddev->ro) 3471 set_disk_ro(disk, 0); 3472 blk_queue_make_request(mddev->queue, md_fail_request); 3473 mddev->pers->stop(mddev); 3474 mddev->queue->merge_bvec_fn = NULL; 3475 mddev->queue->unplug_fn = NULL; 3476 mddev->queue->issue_flush_fn = NULL; 3477 mddev->queue->backing_dev_info.congested_fn = NULL; 3478 if (mddev->pers->sync_request) 3479 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3480 3481 module_put(mddev->pers->owner); 3482 mddev->pers = NULL; 3483 3484 set_capacity(disk, 0); 3485 mddev->changed = 1; 3486 3487 if (mddev->ro) 3488 mddev->ro = 0; 3489 } 3490 if (!mddev->in_sync || mddev->flags) { 3491 /* mark array as shutdown cleanly */ 3492 mddev->in_sync = 1; 3493 md_update_sb(mddev, 1); 3494 } 3495 if (mode == 1) 3496 set_disk_ro(disk, 1); 3497 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3498 } 3499 3500 /* 3501 * Free resources if final stop 3502 */ 3503 if (mode == 0) { 3504 mdk_rdev_t *rdev; 3505 struct list_head *tmp; 3506 3507 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3508 3509 bitmap_destroy(mddev); 3510 if (mddev->bitmap_file) { 3511 restore_bitmap_write_access(mddev->bitmap_file); 3512 fput(mddev->bitmap_file); 3513 mddev->bitmap_file = NULL; 3514 } 3515 mddev->bitmap_offset = 0; 3516 3517 ITERATE_RDEV(mddev,rdev,tmp) 3518 if (rdev->raid_disk >= 0) { 3519 char nm[20]; 3520 sprintf(nm, "rd%d", rdev->raid_disk); 3521 sysfs_remove_link(&mddev->kobj, nm); 3522 } 3523 3524 /* make sure all delayed_delete calls have finished */ 3525 flush_scheduled_work(); 3526 3527 export_array(mddev); 3528 3529 mddev->array_size = 0; 3530 mddev->size = 0; 3531 mddev->raid_disks = 0; 3532 mddev->recovery_cp = 0; 3533 mddev->reshape_position = MaxSector; 3534 3535 } else if (mddev->pers) 3536 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3537 mdname(mddev)); 3538 err = 0; 3539 md_new_event(mddev); 3540 out: 3541 return err; 3542 } 3543 3544 #ifndef MODULE 3545 static void autorun_array(mddev_t *mddev) 3546 { 3547 mdk_rdev_t *rdev; 3548 struct list_head *tmp; 3549 int err; 3550 3551 if (list_empty(&mddev->disks)) 3552 return; 3553 3554 printk(KERN_INFO "md: running: "); 3555 3556 ITERATE_RDEV(mddev,rdev,tmp) { 3557 char b[BDEVNAME_SIZE]; 3558 printk("<%s>", bdevname(rdev->bdev,b)); 3559 } 3560 printk("\n"); 3561 3562 err = do_md_run (mddev); 3563 if (err) { 3564 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3565 do_md_stop (mddev, 0); 3566 } 3567 } 3568 3569 /* 3570 * lets try to run arrays based on all disks that have arrived 3571 * until now. (those are in pending_raid_disks) 3572 * 3573 * the method: pick the first pending disk, collect all disks with 3574 * the same UUID, remove all from the pending list and put them into 3575 * the 'same_array' list. Then order this list based on superblock 3576 * update time (freshest comes first), kick out 'old' disks and 3577 * compare superblocks. If everything's fine then run it. 
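 *
 * (this is the legacy in-kernel autostart path; it only understands 0.90
 * superblocks, as the super_90_load() call below shows, and arrays with
 * newer metadata are expected to be assembled from user space)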
3578 * 3579 * If "unit" is allocated, then bump its reference count 3580 */ 3581 static void autorun_devices(int part) 3582 { 3583 struct list_head *tmp; 3584 mdk_rdev_t *rdev0, *rdev; 3585 mddev_t *mddev; 3586 char b[BDEVNAME_SIZE]; 3587 3588 printk(KERN_INFO "md: autorun ...\n"); 3589 while (!list_empty(&pending_raid_disks)) { 3590 int unit; 3591 dev_t dev; 3592 LIST_HEAD(candidates); 3593 rdev0 = list_entry(pending_raid_disks.next, 3594 mdk_rdev_t, same_set); 3595 3596 printk(KERN_INFO "md: considering %s ...\n", 3597 bdevname(rdev0->bdev,b)); 3598 INIT_LIST_HEAD(&candidates); 3599 ITERATE_RDEV_PENDING(rdev,tmp) 3600 if (super_90_load(rdev, rdev0, 0) >= 0) { 3601 printk(KERN_INFO "md: adding %s ...\n", 3602 bdevname(rdev->bdev,b)); 3603 list_move(&rdev->same_set, &candidates); 3604 } 3605 /* 3606 * now we have a set of devices, with all of them having 3607 * mostly sane superblocks. It's time to allocate the 3608 * mddev. 3609 */ 3610 if (part) { 3611 dev = MKDEV(mdp_major, 3612 rdev0->preferred_minor << MdpMinorShift); 3613 unit = MINOR(dev) >> MdpMinorShift; 3614 } else { 3615 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3616 unit = MINOR(dev); 3617 } 3618 if (rdev0->preferred_minor != unit) { 3619 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3620 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3621 break; 3622 } 3623 3624 md_probe(dev, NULL, NULL); 3625 mddev = mddev_find(dev); 3626 if (!mddev) { 3627 printk(KERN_ERR 3628 "md: cannot allocate memory for md drive.\n"); 3629 break; 3630 } 3631 if (mddev_lock(mddev)) 3632 printk(KERN_WARNING "md: %s locked, cannot run\n", 3633 mdname(mddev)); 3634 else if (mddev->raid_disks || mddev->major_version 3635 || !list_empty(&mddev->disks)) { 3636 printk(KERN_WARNING 3637 "md: %s already running, cannot run %s\n", 3638 mdname(mddev), bdevname(rdev0->bdev,b)); 3639 mddev_unlock(mddev); 3640 } else { 3641 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3642 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3643 list_del_init(&rdev->same_set); 3644 if (bind_rdev_to_array(rdev, mddev)) 3645 export_rdev(rdev); 3646 } 3647 autorun_array(mddev); 3648 mddev_unlock(mddev); 3649 } 3650 /* on success, candidates will be empty, on error 3651 * it won't... 3652 */ 3653 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3654 export_rdev(rdev); 3655 mddev_put(mddev); 3656 } 3657 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 3658 } 3659 #endif /* !MODULE */ 3660 3661 static int get_version(void __user * arg) 3662 { 3663 mdu_version_t ver; 3664 3665 ver.major = MD_MAJOR_VERSION; 3666 ver.minor = MD_MINOR_VERSION; 3667 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3668 3669 if (copy_to_user(arg, &ver, sizeof(ver))) 3670 return -EFAULT; 3671 3672 return 0; 3673 } 3674 3675 static int get_array_info(mddev_t * mddev, void __user * arg) 3676 { 3677 mdu_array_info_t info; 3678 int nr,working,active,failed,spare; 3679 mdk_rdev_t *rdev; 3680 struct list_head *tmp; 3681 3682 nr=working=active=failed=spare=0; 3683 ITERATE_RDEV(mddev,rdev,tmp) { 3684 nr++; 3685 if (test_bit(Faulty, &rdev->flags)) 3686 failed++; 3687 else { 3688 working++; 3689 if (test_bit(In_sync, &rdev->flags)) 3690 active++; 3691 else 3692 spare++; 3693 } 3694 } 3695 3696 info.major_version = mddev->major_version; 3697 info.minor_version = mddev->minor_version; 3698 info.patch_version = MD_PATCHLEVEL_VERSION; 3699 info.ctime = mddev->ctime; 3700 info.level = mddev->level; 3701 info.size = mddev->size; 3702 if (info.size != mddev->size) /* overflow */ 3703 info.size = -1; 3704 info.nr_disks = nr; 3705 info.raid_disks = mddev->raid_disks; 3706 info.md_minor = mddev->md_minor; 3707 info.not_persistent= !mddev->persistent; 3708 3709 info.utime = mddev->utime; 3710 info.state = 0; 3711 if (mddev->in_sync) 3712 info.state = (1<<MD_SB_CLEAN); 3713 if (mddev->bitmap && mddev->bitmap_offset) 3714 info.state = (1<<MD_SB_BITMAP_PRESENT); 3715 info.active_disks = active; 3716 info.working_disks = working; 3717 info.failed_disks = failed; 3718 info.spare_disks = spare; 3719 3720 info.layout = mddev->layout; 3721 info.chunk_size = mddev->chunk_size; 3722 3723 if (copy_to_user(arg, &info, sizeof(info))) 3724 return -EFAULT; 3725 3726 return 0; 3727 } 3728 3729 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3730 { 3731 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3732 char *ptr, *buf = NULL; 3733 int err = -ENOMEM; 3734 3735 md_allow_write(mddev); 3736 3737 file = kmalloc(sizeof(*file), GFP_KERNEL); 3738 if (!file) 3739 goto out; 3740 3741 /* bitmap disabled, zero the first byte and copy out */ 3742 if (!mddev->bitmap || !mddev->bitmap->file) { 3743 file->pathname[0] = '\0'; 3744 goto copy_out; 3745 } 3746 3747 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3748 if (!buf) 3749 goto out; 3750 3751 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3752 if (!ptr) 3753 goto out; 3754 3755 strcpy(file->pathname, ptr); 3756 3757 copy_out: 3758 err = 0; 3759 if (copy_to_user(arg, file, sizeof(*file))) 3760 err = -EFAULT; 3761 out: 3762 kfree(buf); 3763 kfree(file); 3764 return err; 3765 } 3766 3767 static int get_disk_info(mddev_t * mddev, void __user * arg) 3768 { 3769 mdu_disk_info_t info; 3770 unsigned int nr; 3771 mdk_rdev_t *rdev; 3772 3773 if (copy_from_user(&info, arg, sizeof(info))) 3774 return -EFAULT; 3775 3776 nr = info.number; 3777 3778 rdev = find_rdev_nr(mddev, nr); 3779 if (rdev) { 3780 info.major = MAJOR(rdev->bdev->bd_dev); 3781 info.minor = MINOR(rdev->bdev->bd_dev); 3782 info.raid_disk = rdev->raid_disk; 3783 info.state = 0; 3784 if (test_bit(Faulty, &rdev->flags)) 3785 info.state |= (1<<MD_DISK_FAULTY); 3786 else if (test_bit(In_sync, &rdev->flags)) { 3787 info.state |= (1<<MD_DISK_ACTIVE); 3788 info.state |= (1<<MD_DISK_SYNC); 3789 } 3790 if (test_bit(WriteMostly, &rdev->flags)) 3791 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3792 } else { 3793 info.major = info.minor = 0; 3794 
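/* no rdev with that number is bound to this array,
 * so report the slot as empty/removed */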
info.raid_disk = -1; 3795 info.state = (1<<MD_DISK_REMOVED); 3796 } 3797 3798 if (copy_to_user(arg, &info, sizeof(info))) 3799 return -EFAULT; 3800 3801 return 0; 3802 } 3803 3804 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3805 { 3806 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3807 mdk_rdev_t *rdev; 3808 dev_t dev = MKDEV(info->major,info->minor); 3809 3810 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3811 return -EOVERFLOW; 3812 3813 if (!mddev->raid_disks) { 3814 int err; 3815 /* expecting a device which has a superblock */ 3816 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3817 if (IS_ERR(rdev)) { 3818 printk(KERN_WARNING 3819 "md: md_import_device returned %ld\n", 3820 PTR_ERR(rdev)); 3821 return PTR_ERR(rdev); 3822 } 3823 if (!list_empty(&mddev->disks)) { 3824 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3825 mdk_rdev_t, same_set); 3826 int err = super_types[mddev->major_version] 3827 .load_super(rdev, rdev0, mddev->minor_version); 3828 if (err < 0) { 3829 printk(KERN_WARNING 3830 "md: %s has different UUID to %s\n", 3831 bdevname(rdev->bdev,b), 3832 bdevname(rdev0->bdev,b2)); 3833 export_rdev(rdev); 3834 return -EINVAL; 3835 } 3836 } 3837 err = bind_rdev_to_array(rdev, mddev); 3838 if (err) 3839 export_rdev(rdev); 3840 return err; 3841 } 3842 3843 /* 3844 * add_new_disk can be used once the array is assembled 3845 * to add "hot spares". They must already have a superblock 3846 * written 3847 */ 3848 if (mddev->pers) { 3849 int err; 3850 if (!mddev->pers->hot_add_disk) { 3851 printk(KERN_WARNING 3852 "%s: personality does not support diskops!\n", 3853 mdname(mddev)); 3854 return -EINVAL; 3855 } 3856 if (mddev->persistent) 3857 rdev = md_import_device(dev, mddev->major_version, 3858 mddev->minor_version); 3859 else 3860 rdev = md_import_device(dev, -1, -1); 3861 if (IS_ERR(rdev)) { 3862 printk(KERN_WARNING 3863 "md: md_import_device returned %ld\n", 3864 PTR_ERR(rdev)); 3865 return PTR_ERR(rdev); 3866 } 3867 /* set save_raid_disk if appropriate */ 3868 if (!mddev->persistent) { 3869 if (info->state & (1<<MD_DISK_SYNC) && 3870 info->raid_disk < mddev->raid_disks) 3871 rdev->raid_disk = info->raid_disk; 3872 else 3873 rdev->raid_disk = -1; 3874 } else 3875 super_types[mddev->major_version]. 3876 validate_super(mddev, rdev); 3877 rdev->saved_raid_disk = rdev->raid_disk; 3878 3879 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3880 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3881 set_bit(WriteMostly, &rdev->flags); 3882 3883 rdev->raid_disk = -1; 3884 err = bind_rdev_to_array(rdev, mddev); 3885 if (!err && !mddev->pers->hot_remove_disk) { 3886 /* If there is hot_add_disk but no hot_remove_disk 3887 * then added disks for geometry changes, 3888 * and should be added immediately. 3889 */ 3890 super_types[mddev->major_version]. 
3891 validate_super(mddev, rdev); 3892 err = mddev->pers->hot_add_disk(mddev, rdev); 3893 if (err) 3894 unbind_rdev_from_array(rdev); 3895 } 3896 if (err) 3897 export_rdev(rdev); 3898 3899 md_update_sb(mddev, 1); 3900 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3901 md_wakeup_thread(mddev->thread); 3902 return err; 3903 } 3904 3905 /* otherwise, add_new_disk is only allowed 3906 * for major_version==0 superblocks 3907 */ 3908 if (mddev->major_version != 0) { 3909 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3910 mdname(mddev)); 3911 return -EINVAL; 3912 } 3913 3914 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3915 int err; 3916 rdev = md_import_device (dev, -1, 0); 3917 if (IS_ERR(rdev)) { 3918 printk(KERN_WARNING 3919 "md: error, md_import_device() returned %ld\n", 3920 PTR_ERR(rdev)); 3921 return PTR_ERR(rdev); 3922 } 3923 rdev->desc_nr = info->number; 3924 if (info->raid_disk < mddev->raid_disks) 3925 rdev->raid_disk = info->raid_disk; 3926 else 3927 rdev->raid_disk = -1; 3928 3929 rdev->flags = 0; 3930 3931 if (rdev->raid_disk < mddev->raid_disks) 3932 if (info->state & (1<<MD_DISK_SYNC)) 3933 set_bit(In_sync, &rdev->flags); 3934 3935 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3936 set_bit(WriteMostly, &rdev->flags); 3937 3938 if (!mddev->persistent) { 3939 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3940 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3941 } else 3942 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3943 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3944 3945 err = bind_rdev_to_array(rdev, mddev); 3946 if (err) { 3947 export_rdev(rdev); 3948 return err; 3949 } 3950 } 3951 3952 return 0; 3953 } 3954 3955 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3956 { 3957 char b[BDEVNAME_SIZE]; 3958 mdk_rdev_t *rdev; 3959 3960 if (!mddev->pers) 3961 return -ENODEV; 3962 3963 rdev = find_rdev(mddev, dev); 3964 if (!rdev) 3965 return -ENXIO; 3966 3967 if (rdev->raid_disk >= 0) 3968 goto busy; 3969 3970 kick_rdev_from_array(rdev); 3971 md_update_sb(mddev, 1); 3972 md_new_event(mddev); 3973 3974 return 0; 3975 busy: 3976 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 3977 bdevname(rdev->bdev,b), mdname(mddev)); 3978 return -EBUSY; 3979 } 3980 3981 static int hot_add_disk(mddev_t * mddev, dev_t dev) 3982 { 3983 char b[BDEVNAME_SIZE]; 3984 int err; 3985 unsigned int size; 3986 mdk_rdev_t *rdev; 3987 3988 if (!mddev->pers) 3989 return -ENODEV; 3990 3991 if (mddev->major_version != 0) { 3992 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3993 " version-0 superblocks.\n", 3994 mdname(mddev)); 3995 return -EINVAL; 3996 } 3997 if (!mddev->pers->hot_add_disk) { 3998 printk(KERN_WARNING 3999 "%s: personality does not support diskops!\n", 4000 mdname(mddev)); 4001 return -EINVAL; 4002 } 4003 4004 rdev = md_import_device (dev, -1, 0); 4005 if (IS_ERR(rdev)) { 4006 printk(KERN_WARNING 4007 "md: error, md_import_device() returned %ld\n", 4008 PTR_ERR(rdev)); 4009 return -EINVAL; 4010 } 4011 4012 if (mddev->persistent) 4013 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 4014 else 4015 rdev->sb_offset = 4016 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 4017 4018 size = calc_dev_size(rdev, mddev->chunk_size); 4019 rdev->size = size; 4020 4021 if (test_bit(Faulty, &rdev->flags)) { 4022 printk(KERN_WARNING 4023 "md: can not hot-add faulty %s disk to %s!\n", 4024 bdevname(rdev->bdev,b), mdname(mddev)); 4025 err = -EINVAL; 4026 goto abort_export; 4027 } 4028 clear_bit(In_sync, &rdev->flags); 4029 rdev->desc_nr = -1; 4030 rdev->saved_raid_disk = -1; 4031 err = bind_rdev_to_array(rdev, mddev); 4032 if (err) 4033 goto abort_export; 4034 4035 /* 4036 * The rest should better be atomic, we can have disk failures 4037 * noticed in interrupt contexts ... 4038 */ 4039 4040 if (rdev->desc_nr == mddev->max_disks) { 4041 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 4042 mdname(mddev)); 4043 err = -EBUSY; 4044 goto abort_unbind_export; 4045 } 4046 4047 rdev->raid_disk = -1; 4048 4049 md_update_sb(mddev, 1); 4050 4051 /* 4052 * Kick recovery, maybe this spare has to be added to the 4053 * array immediately. 4054 */ 4055 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4056 md_wakeup_thread(mddev->thread); 4057 md_new_event(mddev); 4058 return 0; 4059 4060 abort_unbind_export: 4061 unbind_rdev_from_array(rdev); 4062 4063 abort_export: 4064 export_rdev(rdev); 4065 return err; 4066 } 4067 4068 static int set_bitmap_file(mddev_t *mddev, int fd) 4069 { 4070 int err; 4071 4072 if (mddev->pers) { 4073 if (!mddev->pers->quiesce) 4074 return -EBUSY; 4075 if (mddev->recovery || mddev->sync_thread) 4076 return -EBUSY; 4077 /* we should be able to change the bitmap.. 
*/ 4078 } 4079 4080 4081 if (fd >= 0) { 4082 if (mddev->bitmap) 4083 return -EEXIST; /* cannot add when bitmap is present */ 4084 mddev->bitmap_file = fget(fd); 4085 4086 if (mddev->bitmap_file == NULL) { 4087 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4088 mdname(mddev)); 4089 return -EBADF; 4090 } 4091 4092 err = deny_bitmap_write_access(mddev->bitmap_file); 4093 if (err) { 4094 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4095 mdname(mddev)); 4096 fput(mddev->bitmap_file); 4097 mddev->bitmap_file = NULL; 4098 return err; 4099 } 4100 mddev->bitmap_offset = 0; /* file overrides offset */ 4101 } else if (mddev->bitmap == NULL) 4102 return -ENOENT; /* cannot remove what isn't there */ 4103 err = 0; 4104 if (mddev->pers) { 4105 mddev->pers->quiesce(mddev, 1); 4106 if (fd >= 0) 4107 err = bitmap_create(mddev); 4108 if (fd < 0 || err) { 4109 bitmap_destroy(mddev); 4110 fd = -1; /* make sure to put the file */ 4111 } 4112 mddev->pers->quiesce(mddev, 0); 4113 } 4114 if (fd < 0) { 4115 if (mddev->bitmap_file) { 4116 restore_bitmap_write_access(mddev->bitmap_file); 4117 fput(mddev->bitmap_file); 4118 } 4119 mddev->bitmap_file = NULL; 4120 } 4121 4122 return err; 4123 } 4124 4125 /* 4126 * set_array_info is used two different ways 4127 * The original usage is when creating a new array. 4128 * In this usage, raid_disks is > 0 and it together with 4129 * level, size, not_persistent,layout,chunksize determine the 4130 * shape of the array. 4131 * This will always create an array with a type-0.90.0 superblock. 4132 * The newer usage is when assembling an array. 4133 * In this case raid_disks will be 0, and the major_version field is 4134 * use to determine which style super-blocks are to be found on the devices. 4135 * The minor and patch _version numbers are also kept incase the 4136 * super_block handler wishes to interpret them. 4137 */ 4138 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4139 { 4140 4141 if (info->raid_disks == 0) { 4142 /* just setting version number for superblock loading */ 4143 if (info->major_version < 0 || 4144 info->major_version >= ARRAY_SIZE(super_types) || 4145 super_types[info->major_version].name == NULL) { 4146 /* maybe try to auto-load a module? */ 4147 printk(KERN_INFO 4148 "md: superblock version %d not known\n", 4149 info->major_version); 4150 return -EINVAL; 4151 } 4152 mddev->major_version = info->major_version; 4153 mddev->minor_version = info->minor_version; 4154 mddev->patch_version = info->patch_version; 4155 mddev->persistent = !info->not_persistent; 4156 return 0; 4157 } 4158 mddev->major_version = MD_MAJOR_VERSION; 4159 mddev->minor_version = MD_MINOR_VERSION; 4160 mddev->patch_version = MD_PATCHLEVEL_VERSION; 4161 mddev->ctime = get_seconds(); 4162 4163 mddev->level = info->level; 4164 mddev->clevel[0] = 0; 4165 mddev->size = info->size; 4166 mddev->raid_disks = info->raid_disks; 4167 /* don't set md_minor, it is determined by which /dev/md* was 4168 * openned 4169 */ 4170 if (info->state & (1<<MD_SB_CLEAN)) 4171 mddev->recovery_cp = MaxSector; 4172 else 4173 mddev->recovery_cp = 0; 4174 mddev->persistent = ! 
info->not_persistent; 4175 4176 mddev->layout = info->layout; 4177 mddev->chunk_size = info->chunk_size; 4178 4179 mddev->max_disks = MD_SB_DISKS; 4180 4181 mddev->flags = 0; 4182 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4183 4184 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4185 mddev->bitmap_offset = 0; 4186 4187 mddev->reshape_position = MaxSector; 4188 4189 /* 4190 * Generate a 128 bit UUID 4191 */ 4192 get_random_bytes(mddev->uuid, 16); 4193 4194 mddev->new_level = mddev->level; 4195 mddev->new_chunk = mddev->chunk_size; 4196 mddev->new_layout = mddev->layout; 4197 mddev->delta_disks = 0; 4198 4199 return 0; 4200 } 4201 4202 static int update_size(mddev_t *mddev, unsigned long size) 4203 { 4204 mdk_rdev_t * rdev; 4205 int rv; 4206 struct list_head *tmp; 4207 int fit = (size == 0); 4208 4209 if (mddev->pers->resize == NULL) 4210 return -EINVAL; 4211 /* The "size" is the amount of each device that is used. 4212 * This can only make sense for arrays with redundancy. 4213 * linear and raid0 always use whatever space is available 4214 * We can only consider changing the size if no resync 4215 * or reconstruction is happening, and if the new size 4216 * is acceptable. It must fit before the sb_offset or, 4217 * if that is <data_offset, it must fit before the 4218 * size of each device. 4219 * If size is zero, we find the largest size that fits. 4220 */ 4221 if (mddev->sync_thread) 4222 return -EBUSY; 4223 ITERATE_RDEV(mddev,rdev,tmp) { 4224 sector_t avail; 4225 avail = rdev->size * 2; 4226 4227 if (fit && (size == 0 || size > avail/2)) 4228 size = avail/2; 4229 if (avail < ((sector_t)size << 1)) 4230 return -ENOSPC; 4231 } 4232 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4233 if (!rv) { 4234 struct block_device *bdev; 4235 4236 bdev = bdget_disk(mddev->gendisk, 0); 4237 if (bdev) { 4238 mutex_lock(&bdev->bd_inode->i_mutex); 4239 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4240 mutex_unlock(&bdev->bd_inode->i_mutex); 4241 bdput(bdev); 4242 } 4243 } 4244 return rv; 4245 } 4246 4247 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4248 { 4249 int rv; 4250 /* change the number of raid disks */ 4251 if (mddev->pers->check_reshape == NULL) 4252 return -EINVAL; 4253 if (raid_disks <= 0 || 4254 raid_disks >= mddev->max_disks) 4255 return -EINVAL; 4256 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4257 return -EBUSY; 4258 mddev->delta_disks = raid_disks - mddev->raid_disks; 4259 4260 rv = mddev->pers->check_reshape(mddev); 4261 return rv; 4262 } 4263 4264 4265 /* 4266 * update_array_info is used to change the configuration of an 4267 * on-line array. 4268 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4269 * fields in the info are checked against the array. 4270 * Any differences that cannot be handled will cause an error. 4271 * Normally, only one change can be managed at a time. 
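 *
 * A hypothetical userspace sketch of the "one change at a time" rule
 * (device path and flow are illustrative, using the definitions from
 * md_u.h): read the current shape with GET_ARRAY_INFO, modify exactly one
 * field, and write it back through SET_ARRAY_INFO, e.g. to request the
 * internal bitmap:
 *
 *	mdu_array_info_t info;
 *	int fd = open("/dev/md0", O_RDWR);
 *	ioctl(fd, GET_ARRAY_INFO, &info);
 *	info.state |= (1 << MD_SB_BITMAP_PRESENT);
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 * Changing, say, raid_disks and layout in the same call would make
 * cnt > 1 below and return -EINVAL.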
4272 */ 4273 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4274 { 4275 int rv = 0; 4276 int cnt = 0; 4277 int state = 0; 4278 4279 /* calculate expected state,ignoring low bits */ 4280 if (mddev->bitmap && mddev->bitmap_offset) 4281 state |= (1 << MD_SB_BITMAP_PRESENT); 4282 4283 if (mddev->major_version != info->major_version || 4284 mddev->minor_version != info->minor_version || 4285 /* mddev->patch_version != info->patch_version || */ 4286 mddev->ctime != info->ctime || 4287 mddev->level != info->level || 4288 /* mddev->layout != info->layout || */ 4289 !mddev->persistent != info->not_persistent|| 4290 mddev->chunk_size != info->chunk_size || 4291 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4292 ((state^info->state) & 0xfffffe00) 4293 ) 4294 return -EINVAL; 4295 /* Check there is only one change */ 4296 if (info->size >= 0 && mddev->size != info->size) cnt++; 4297 if (mddev->raid_disks != info->raid_disks) cnt++; 4298 if (mddev->layout != info->layout) cnt++; 4299 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4300 if (cnt == 0) return 0; 4301 if (cnt > 1) return -EINVAL; 4302 4303 if (mddev->layout != info->layout) { 4304 /* Change layout 4305 * we don't need to do anything at the md level, the 4306 * personality will take care of it all. 4307 */ 4308 if (mddev->pers->reconfig == NULL) 4309 return -EINVAL; 4310 else 4311 return mddev->pers->reconfig(mddev, info->layout, -1); 4312 } 4313 if (info->size >= 0 && mddev->size != info->size) 4314 rv = update_size(mddev, info->size); 4315 4316 if (mddev->raid_disks != info->raid_disks) 4317 rv = update_raid_disks(mddev, info->raid_disks); 4318 4319 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4320 if (mddev->pers->quiesce == NULL) 4321 return -EINVAL; 4322 if (mddev->recovery || mddev->sync_thread) 4323 return -EBUSY; 4324 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4325 /* add the bitmap */ 4326 if (mddev->bitmap) 4327 return -EEXIST; 4328 if (mddev->default_bitmap_offset == 0) 4329 return -EINVAL; 4330 mddev->bitmap_offset = mddev->default_bitmap_offset; 4331 mddev->pers->quiesce(mddev, 1); 4332 rv = bitmap_create(mddev); 4333 if (rv) 4334 bitmap_destroy(mddev); 4335 mddev->pers->quiesce(mddev, 0); 4336 } else { 4337 /* remove the bitmap */ 4338 if (!mddev->bitmap) 4339 return -ENOENT; 4340 if (mddev->bitmap->file) 4341 return -EINVAL; 4342 mddev->pers->quiesce(mddev, 1); 4343 bitmap_destroy(mddev); 4344 mddev->pers->quiesce(mddev, 0); 4345 mddev->bitmap_offset = 0; 4346 } 4347 } 4348 md_update_sb(mddev, 1); 4349 return rv; 4350 } 4351 4352 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4353 { 4354 mdk_rdev_t *rdev; 4355 4356 if (mddev->pers == NULL) 4357 return -ENODEV; 4358 4359 rdev = find_rdev(mddev, dev); 4360 if (!rdev) 4361 return -ENODEV; 4362 4363 md_error(mddev, rdev); 4364 return 0; 4365 } 4366 4367 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4368 { 4369 mddev_t *mddev = bdev->bd_disk->private_data; 4370 4371 geo->heads = 2; 4372 geo->sectors = 4; 4373 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4374 return 0; 4375 } 4376 4377 static int md_ioctl(struct inode *inode, struct file *file, 4378 unsigned int cmd, unsigned long arg) 4379 { 4380 int err = 0; 4381 void __user *argp = (void __user *)arg; 4382 mddev_t *mddev = NULL; 4383 4384 if (!capable(CAP_SYS_ADMIN)) 4385 return -EACCES; 4386 4387 /* 4388 * Commands dealing with the RAID driver but not any 4389 * particular array: 4390 */ 4391 switch (cmd) 4392 
{ 4393 case RAID_VERSION: 4394 err = get_version(argp); 4395 goto done; 4396 4397 case PRINT_RAID_DEBUG: 4398 err = 0; 4399 md_print_devices(); 4400 goto done; 4401 4402 #ifndef MODULE 4403 case RAID_AUTORUN: 4404 err = 0; 4405 autostart_arrays(arg); 4406 goto done; 4407 #endif 4408 default:; 4409 } 4410 4411 /* 4412 * Commands creating/starting a new array: 4413 */ 4414 4415 mddev = inode->i_bdev->bd_disk->private_data; 4416 4417 if (!mddev) { 4418 BUG(); 4419 goto abort; 4420 } 4421 4422 err = mddev_lock(mddev); 4423 if (err) { 4424 printk(KERN_INFO 4425 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4426 err, cmd); 4427 goto abort; 4428 } 4429 4430 switch (cmd) 4431 { 4432 case SET_ARRAY_INFO: 4433 { 4434 mdu_array_info_t info; 4435 if (!arg) 4436 memset(&info, 0, sizeof(info)); 4437 else if (copy_from_user(&info, argp, sizeof(info))) { 4438 err = -EFAULT; 4439 goto abort_unlock; 4440 } 4441 if (mddev->pers) { 4442 err = update_array_info(mddev, &info); 4443 if (err) { 4444 printk(KERN_WARNING "md: couldn't update" 4445 " array info. %d\n", err); 4446 goto abort_unlock; 4447 } 4448 goto done_unlock; 4449 } 4450 if (!list_empty(&mddev->disks)) { 4451 printk(KERN_WARNING 4452 "md: array %s already has disks!\n", 4453 mdname(mddev)); 4454 err = -EBUSY; 4455 goto abort_unlock; 4456 } 4457 if (mddev->raid_disks) { 4458 printk(KERN_WARNING 4459 "md: array %s already initialised!\n", 4460 mdname(mddev)); 4461 err = -EBUSY; 4462 goto abort_unlock; 4463 } 4464 err = set_array_info(mddev, &info); 4465 if (err) { 4466 printk(KERN_WARNING "md: couldn't set" 4467 " array info. %d\n", err); 4468 goto abort_unlock; 4469 } 4470 } 4471 goto done_unlock; 4472 4473 default:; 4474 } 4475 4476 /* 4477 * Commands querying/configuring an existing array: 4478 */ 4479 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4480 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4481 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4482 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4483 && cmd != GET_BITMAP_FILE) { 4484 err = -ENODEV; 4485 goto abort_unlock; 4486 } 4487 4488 /* 4489 * Commands even a read-only array can execute: 4490 */ 4491 switch (cmd) 4492 { 4493 case GET_ARRAY_INFO: 4494 err = get_array_info(mddev, argp); 4495 goto done_unlock; 4496 4497 case GET_BITMAP_FILE: 4498 err = get_bitmap_file(mddev, argp); 4499 goto done_unlock; 4500 4501 case GET_DISK_INFO: 4502 err = get_disk_info(mddev, argp); 4503 goto done_unlock; 4504 4505 case RESTART_ARRAY_RW: 4506 err = restart_array(mddev); 4507 goto done_unlock; 4508 4509 case STOP_ARRAY: 4510 err = do_md_stop (mddev, 0); 4511 goto done_unlock; 4512 4513 case STOP_ARRAY_RO: 4514 err = do_md_stop (mddev, 1); 4515 goto done_unlock; 4516 4517 /* 4518 * We have a problem here : there is no easy way to give a CHS 4519 * virtual geometry. We currently pretend that we have a 2 heads 4520 * 4 sectors (with a BIG number of cylinders...). This drives 4521 * dosfs just mad... ;-) 4522 */ 4523 } 4524 4525 /* 4526 * The remaining ioctls are changing the state of the 4527 * superblock, so we do not allow them on read-only arrays. 4528 * However non-MD ioctls (e.g. get-size) will still come through 4529 * here and hit the 'default' below, so only disallow 4530 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
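 *
 * ('md' ioctls here means commands whose _IOC_TYPE() is MD_MAJOR, which is
 * how the commands in md_u.h are built - e.g. ADD_NEW_DISK is defined with
 * _IOW(MD_MAJOR, ...). Only those trip the check below; on an
 * auto-read-only array (ro == 2) they flip it back to read-write first,
 * while anything else falls through untouched.)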
4531 */ 4532 if (_IOC_TYPE(cmd) == MD_MAJOR && 4533 mddev->ro && mddev->pers) { 4534 if (mddev->ro == 2) { 4535 mddev->ro = 0; 4536 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4537 md_wakeup_thread(mddev->thread); 4538 4539 } else { 4540 err = -EROFS; 4541 goto abort_unlock; 4542 } 4543 } 4544 4545 switch (cmd) 4546 { 4547 case ADD_NEW_DISK: 4548 { 4549 mdu_disk_info_t info; 4550 if (copy_from_user(&info, argp, sizeof(info))) 4551 err = -EFAULT; 4552 else 4553 err = add_new_disk(mddev, &info); 4554 goto done_unlock; 4555 } 4556 4557 case HOT_REMOVE_DISK: 4558 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4559 goto done_unlock; 4560 4561 case HOT_ADD_DISK: 4562 err = hot_add_disk(mddev, new_decode_dev(arg)); 4563 goto done_unlock; 4564 4565 case SET_DISK_FAULTY: 4566 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4567 goto done_unlock; 4568 4569 case RUN_ARRAY: 4570 err = do_md_run (mddev); 4571 goto done_unlock; 4572 4573 case SET_BITMAP_FILE: 4574 err = set_bitmap_file(mddev, (int)arg); 4575 goto done_unlock; 4576 4577 default: 4578 err = -EINVAL; 4579 goto abort_unlock; 4580 } 4581 4582 done_unlock: 4583 abort_unlock: 4584 mddev_unlock(mddev); 4585 4586 return err; 4587 done: 4588 if (err) 4589 MD_BUG(); 4590 abort: 4591 return err; 4592 } 4593 4594 static int md_open(struct inode *inode, struct file *file) 4595 { 4596 /* 4597 * Succeed if we can lock the mddev, which confirms that 4598 * it isn't being stopped right now. 4599 */ 4600 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4601 int err; 4602 4603 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 4604 goto out; 4605 4606 err = 0; 4607 mddev_get(mddev); 4608 mddev_unlock(mddev); 4609 4610 check_disk_change(inode->i_bdev); 4611 out: 4612 return err; 4613 } 4614 4615 static int md_release(struct inode *inode, struct file * file) 4616 { 4617 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4618 4619 BUG_ON(!mddev); 4620 mddev_put(mddev); 4621 4622 return 0; 4623 } 4624 4625 static int md_media_changed(struct gendisk *disk) 4626 { 4627 mddev_t *mddev = disk->private_data; 4628 4629 return mddev->changed; 4630 } 4631 4632 static int md_revalidate(struct gendisk *disk) 4633 { 4634 mddev_t *mddev = disk->private_data; 4635 4636 mddev->changed = 0; 4637 return 0; 4638 } 4639 static struct block_device_operations md_fops = 4640 { 4641 .owner = THIS_MODULE, 4642 .open = md_open, 4643 .release = md_release, 4644 .ioctl = md_ioctl, 4645 .getgeo = md_getgeo, 4646 .media_changed = md_media_changed, 4647 .revalidate_disk= md_revalidate, 4648 }; 4649 4650 static int md_thread(void * arg) 4651 { 4652 mdk_thread_t *thread = arg; 4653 4654 /* 4655 * md_thread is a 'system-thread', it's priority should be very 4656 * high. We avoid resource deadlocks individually in each 4657 * raid personality. (RAID5 does preallocation) We also use RR and 4658 * the very same RT priority as kswapd, thus we will never get 4659 * into a priority inversion deadlock. 4660 * 4661 * we definitely have to have equal or higher priority than 4662 * bdflush, otherwise bdflush will deadlock if there are too 4663 * many dirty RAID5 blocks. 4664 */ 4665 4666 allow_signal(SIGKILL); 4667 while (!kthread_should_stop()) { 4668 4669 /* We need to wait INTERRUPTIBLE so that 4670 * we don't add to the load-average. 
4671 * That means we need to be sure no signals are 4672 * pending 4673 */ 4674 if (signal_pending(current)) 4675 flush_signals(current); 4676 4677 wait_event_interruptible_timeout 4678 (thread->wqueue, 4679 test_bit(THREAD_WAKEUP, &thread->flags) 4680 || kthread_should_stop(), 4681 thread->timeout); 4682 4683 clear_bit(THREAD_WAKEUP, &thread->flags); 4684 4685 thread->run(thread->mddev); 4686 } 4687 4688 return 0; 4689 } 4690 4691 void md_wakeup_thread(mdk_thread_t *thread) 4692 { 4693 if (thread) { 4694 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4695 set_bit(THREAD_WAKEUP, &thread->flags); 4696 wake_up(&thread->wqueue); 4697 } 4698 } 4699 4700 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4701 const char *name) 4702 { 4703 mdk_thread_t *thread; 4704 4705 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4706 if (!thread) 4707 return NULL; 4708 4709 init_waitqueue_head(&thread->wqueue); 4710 4711 thread->run = run; 4712 thread->mddev = mddev; 4713 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4714 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4715 if (IS_ERR(thread->tsk)) { 4716 kfree(thread); 4717 return NULL; 4718 } 4719 return thread; 4720 } 4721 4722 void md_unregister_thread(mdk_thread_t *thread) 4723 { 4724 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4725 4726 kthread_stop(thread->tsk); 4727 kfree(thread); 4728 } 4729 4730 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4731 { 4732 if (!mddev) { 4733 MD_BUG(); 4734 return; 4735 } 4736 4737 if (!rdev || test_bit(Faulty, &rdev->flags)) 4738 return; 4739 /* 4740 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4741 mdname(mddev), 4742 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4743 __builtin_return_address(0),__builtin_return_address(1), 4744 __builtin_return_address(2),__builtin_return_address(3)); 4745 */ 4746 if (!mddev->pers) 4747 return; 4748 if (!mddev->pers->error_handler) 4749 return; 4750 mddev->pers->error_handler(mddev,rdev); 4751 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4752 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4753 md_wakeup_thread(mddev->thread); 4754 md_new_event_inintr(mddev); 4755 } 4756 4757 /* seq_file implementation /proc/mdstat */ 4758 4759 static void status_unused(struct seq_file *seq) 4760 { 4761 int i = 0; 4762 mdk_rdev_t *rdev; 4763 struct list_head *tmp; 4764 4765 seq_printf(seq, "unused devices: "); 4766 4767 ITERATE_RDEV_PENDING(rdev,tmp) { 4768 char b[BDEVNAME_SIZE]; 4769 i++; 4770 seq_printf(seq, "%s ", 4771 bdevname(rdev->bdev,b)); 4772 } 4773 if (!i) 4774 seq_printf(seq, "<none>"); 4775 4776 seq_printf(seq, "\n"); 4777 } 4778 4779 4780 static void status_resync(struct seq_file *seq, mddev_t * mddev) 4781 { 4782 sector_t max_blocks, resync, res; 4783 unsigned long dt, db, rt; 4784 int scale; 4785 unsigned int per_milli; 4786 4787 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4788 4789 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4790 max_blocks = mddev->resync_max_sectors >> 1; 4791 else 4792 max_blocks = mddev->size; 4793 4794 /* 4795 * Should not happen. 4796 */ 4797 if (!max_blocks) { 4798 MD_BUG(); 4799 return; 4800 } 4801 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4802 * in a sector_t, and (max_blocks>>scale) will fit in a 4803 * u32, as those are the requirements for sector_div. 
4804 * Thus 'scale' must be at least 10 4805 */ 4806 scale = 10; 4807 if (sizeof(sector_t) > sizeof(unsigned long)) { 4808 while ( max_blocks/2 > (1ULL<<(scale+32))) 4809 scale++; 4810 } 4811 res = (resync>>scale)*1000; 4812 sector_div(res, (u32)((max_blocks>>scale)+1)); 4813 4814 per_milli = res; 4815 { 4816 int i, x = per_milli/50, y = 20-x; 4817 seq_printf(seq, "["); 4818 for (i = 0; i < x; i++) 4819 seq_printf(seq, "="); 4820 seq_printf(seq, ">"); 4821 for (i = 0; i < y; i++) 4822 seq_printf(seq, "."); 4823 seq_printf(seq, "] "); 4824 } 4825 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4826 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4827 "reshape" : 4828 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 4829 "check" : 4830 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4831 "resync" : "recovery"))), 4832 per_milli/10, per_milli % 10, 4833 (unsigned long long) resync, 4834 (unsigned long long) max_blocks); 4835 4836 /* 4837 * We do not want to overflow, so the order of operands and 4838 * the * 100 / 100 trick are important. We do a +1 to be 4839 * safe against division by zero. We only estimate anyway. 4840 * 4841 * dt: time from mark until now 4842 * db: blocks written from mark until now 4843 * rt: remaining time 4844 */ 4845 dt = ((jiffies - mddev->resync_mark) / HZ); 4846 if (!dt) dt++; 4847 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4848 - mddev->resync_mark_cnt; 4849 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4850 4851 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4852 4853 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4854 } 4855 4856 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4857 { 4858 struct list_head *tmp; 4859 loff_t l = *pos; 4860 mddev_t *mddev; 4861 4862 if (l >= 0x10000) 4863 return NULL; 4864 if (!l--) 4865 /* header */ 4866 return (void*)1; 4867 4868 spin_lock(&all_mddevs_lock); 4869 list_for_each(tmp,&all_mddevs) 4870 if (!l--) { 4871 mddev = list_entry(tmp, mddev_t, all_mddevs); 4872 mddev_get(mddev); 4873 spin_unlock(&all_mddevs_lock); 4874 return mddev; 4875 } 4876 spin_unlock(&all_mddevs_lock); 4877 if (!l--) 4878 return (void*)2;/* tail */ 4879 return NULL; 4880 } 4881 4882 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4883 { 4884 struct list_head *tmp; 4885 mddev_t *next_mddev, *mddev = v; 4886 4887 ++*pos; 4888 if (v == (void*)2) 4889 return NULL; 4890 4891 spin_lock(&all_mddevs_lock); 4892 if (v == (void*)1) 4893 tmp = all_mddevs.next; 4894 else 4895 tmp = mddev->all_mddevs.next; 4896 if (tmp != &all_mddevs) 4897 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4898 else { 4899 next_mddev = (void*)2; 4900 *pos = 0x10000; 4901 } 4902 spin_unlock(&all_mddevs_lock); 4903 4904 if (v != (void*)1) 4905 mddev_put(mddev); 4906 return next_mddev; 4907 4908 } 4909 4910 static void md_seq_stop(struct seq_file *seq, void *v) 4911 { 4912 mddev_t *mddev = v; 4913 4914 if (mddev && v != (void*)1 && v != (void*)2) 4915 mddev_put(mddev); 4916 } 4917 4918 struct mdstat_info { 4919 int event; 4920 }; 4921 4922 static int md_seq_show(struct seq_file *seq, void *v) 4923 { 4924 mddev_t *mddev = v; 4925 sector_t size; 4926 struct list_head *tmp2; 4927 mdk_rdev_t *rdev; 4928 struct mdstat_info *mi = seq->private; 4929 struct bitmap *bitmap; 4930 4931 if (v == (void*)1) { 4932 struct mdk_personality *pers; 4933 seq_printf(seq, "Personalities : "); 4934 spin_lock(&pers_lock); 4935 list_for_each_entry(pers, &pers_list, list) 4936 seq_printf(seq, "[%s] ", 
pers->name); 4937 4938 spin_unlock(&pers_lock); 4939 seq_printf(seq, "\n"); 4940 mi->event = atomic_read(&md_event_count); 4941 return 0; 4942 } 4943 if (v == (void*)2) { 4944 status_unused(seq); 4945 return 0; 4946 } 4947 4948 if (mddev_lock(mddev) < 0) 4949 return -EINTR; 4950 4951 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4952 seq_printf(seq, "%s : %sactive", mdname(mddev), 4953 mddev->pers ? "" : "in"); 4954 if (mddev->pers) { 4955 if (mddev->ro==1) 4956 seq_printf(seq, " (read-only)"); 4957 if (mddev->ro==2) 4958 seq_printf(seq, "(auto-read-only)"); 4959 seq_printf(seq, " %s", mddev->pers->name); 4960 } 4961 4962 size = 0; 4963 ITERATE_RDEV(mddev,rdev,tmp2) { 4964 char b[BDEVNAME_SIZE]; 4965 seq_printf(seq, " %s[%d]", 4966 bdevname(rdev->bdev,b), rdev->desc_nr); 4967 if (test_bit(WriteMostly, &rdev->flags)) 4968 seq_printf(seq, "(W)"); 4969 if (test_bit(Faulty, &rdev->flags)) { 4970 seq_printf(seq, "(F)"); 4971 continue; 4972 } else if (rdev->raid_disk < 0) 4973 seq_printf(seq, "(S)"); /* spare */ 4974 size += rdev->size; 4975 } 4976 4977 if (!list_empty(&mddev->disks)) { 4978 if (mddev->pers) 4979 seq_printf(seq, "\n %llu blocks", 4980 (unsigned long long)mddev->array_size); 4981 else 4982 seq_printf(seq, "\n %llu blocks", 4983 (unsigned long long)size); 4984 } 4985 if (mddev->persistent) { 4986 if (mddev->major_version != 0 || 4987 mddev->minor_version != 90) { 4988 seq_printf(seq," super %d.%d", 4989 mddev->major_version, 4990 mddev->minor_version); 4991 } 4992 } else 4993 seq_printf(seq, " super non-persistent"); 4994 4995 if (mddev->pers) { 4996 mddev->pers->status (seq, mddev); 4997 seq_printf(seq, "\n "); 4998 if (mddev->pers->sync_request) { 4999 if (mddev->curr_resync > 2) { 5000 status_resync (seq, mddev); 5001 seq_printf(seq, "\n "); 5002 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5003 seq_printf(seq, "\tresync=DELAYED\n "); 5004 else if (mddev->recovery_cp < MaxSector) 5005 seq_printf(seq, "\tresync=PENDING\n "); 5006 } 5007 } else 5008 seq_printf(seq, "\n "); 5009 5010 if ((bitmap = mddev->bitmap)) { 5011 unsigned long chunk_kb; 5012 unsigned long flags; 5013 spin_lock_irqsave(&bitmap->lock, flags); 5014 chunk_kb = bitmap->chunksize >> 10; 5015 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5016 "%lu%s chunk", 5017 bitmap->pages - bitmap->missing_pages, 5018 bitmap->pages, 5019 (bitmap->pages - bitmap->missing_pages) 5020 << (PAGE_SHIFT - 10), 5021 chunk_kb ? chunk_kb : bitmap->chunksize, 5022 chunk_kb ? 
"KB" : "B"); 5023 if (bitmap->file) { 5024 seq_printf(seq, ", file: "); 5025 seq_path(seq, bitmap->file->f_path.mnt, 5026 bitmap->file->f_path.dentry," \t\n"); 5027 } 5028 5029 seq_printf(seq, "\n"); 5030 spin_unlock_irqrestore(&bitmap->lock, flags); 5031 } 5032 5033 seq_printf(seq, "\n"); 5034 } 5035 mddev_unlock(mddev); 5036 5037 return 0; 5038 } 5039 5040 static struct seq_operations md_seq_ops = { 5041 .start = md_seq_start, 5042 .next = md_seq_next, 5043 .stop = md_seq_stop, 5044 .show = md_seq_show, 5045 }; 5046 5047 static int md_seq_open(struct inode *inode, struct file *file) 5048 { 5049 int error; 5050 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5051 if (mi == NULL) 5052 return -ENOMEM; 5053 5054 error = seq_open(file, &md_seq_ops); 5055 if (error) 5056 kfree(mi); 5057 else { 5058 struct seq_file *p = file->private_data; 5059 p->private = mi; 5060 mi->event = atomic_read(&md_event_count); 5061 } 5062 return error; 5063 } 5064 5065 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5066 { 5067 struct seq_file *m = filp->private_data; 5068 struct mdstat_info *mi = m->private; 5069 int mask; 5070 5071 poll_wait(filp, &md_event_waiters, wait); 5072 5073 /* always allow read */ 5074 mask = POLLIN | POLLRDNORM; 5075 5076 if (mi->event != atomic_read(&md_event_count)) 5077 mask |= POLLERR | POLLPRI; 5078 return mask; 5079 } 5080 5081 static const struct file_operations md_seq_fops = { 5082 .owner = THIS_MODULE, 5083 .open = md_seq_open, 5084 .read = seq_read, 5085 .llseek = seq_lseek, 5086 .release = seq_release_private, 5087 .poll = mdstat_poll, 5088 }; 5089 5090 int register_md_personality(struct mdk_personality *p) 5091 { 5092 spin_lock(&pers_lock); 5093 list_add_tail(&p->list, &pers_list); 5094 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 5095 spin_unlock(&pers_lock); 5096 return 0; 5097 } 5098 5099 int unregister_md_personality(struct mdk_personality *p) 5100 { 5101 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 5102 spin_lock(&pers_lock); 5103 list_del_init(&p->list); 5104 spin_unlock(&pers_lock); 5105 return 0; 5106 } 5107 5108 static int is_mddev_idle(mddev_t *mddev) 5109 { 5110 mdk_rdev_t * rdev; 5111 struct list_head *tmp; 5112 int idle; 5113 long curr_events; 5114 5115 idle = 1; 5116 ITERATE_RDEV(mddev,rdev,tmp) { 5117 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5118 curr_events = disk_stat_read(disk, sectors[0]) + 5119 disk_stat_read(disk, sectors[1]) - 5120 atomic_read(&disk->sync_io); 5121 /* sync IO will cause sync_io to increase before the disk_stats 5122 * as sync_io is counted when a request starts, and 5123 * disk_stats is counted when it completes. 5124 * So resync activity will cause curr_events to be smaller than 5125 * when there was no such activity. 5126 * non-sync IO will cause disk_stat to increase without 5127 * increasing sync_io so curr_events will (eventually) 5128 * be larger than it was before. Once it becomes 5129 * substantially larger, the test below will cause 5130 * the array to appear non-idle, and resync will slow 5131 * down. 5132 * If there is a lot of outstanding resync activity when 5133 * we set last_event to curr_events, then all that activity 5134 * completing might cause the array to appear non-idle 5135 * and resync will be slowed down even though there might 5136 * not have been non-resync activity. This will only 5137 * happen once though. 
'last_events' will soon reflect 5138 * the state where there is little or no outstanding 5139 * resync requests, and further resync activity will 5140 * always make curr_events less than last_events. 5141 * 5142 */ 5143 if (curr_events - rdev->last_events > 4096) { 5144 rdev->last_events = curr_events; 5145 idle = 0; 5146 } 5147 } 5148 return idle; 5149 } 5150 5151 void md_done_sync(mddev_t *mddev, int blocks, int ok) 5152 { 5153 /* another "blocks" (512byte) blocks have been synced */ 5154 atomic_sub(blocks, &mddev->recovery_active); 5155 wake_up(&mddev->recovery_wait); 5156 if (!ok) { 5157 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5158 md_wakeup_thread(mddev->thread); 5159 // stop recovery, signal do_sync .... 5160 } 5161 } 5162 5163 5164 /* md_write_start(mddev, bi) 5165 * If we need to update some array metadata (e.g. 'active' flag 5166 * in superblock) before writing, schedule a superblock update 5167 * and wait for it to complete. 5168 */ 5169 void md_write_start(mddev_t *mddev, struct bio *bi) 5170 { 5171 if (bio_data_dir(bi) != WRITE) 5172 return; 5173 5174 BUG_ON(mddev->ro == 1); 5175 if (mddev->ro == 2) { 5176 /* need to switch to read/write */ 5177 mddev->ro = 0; 5178 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5179 md_wakeup_thread(mddev->thread); 5180 } 5181 atomic_inc(&mddev->writes_pending); 5182 if (mddev->in_sync) { 5183 spin_lock_irq(&mddev->write_lock); 5184 if (mddev->in_sync) { 5185 mddev->in_sync = 0; 5186 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5187 md_wakeup_thread(mddev->thread); 5188 } 5189 spin_unlock_irq(&mddev->write_lock); 5190 } 5191 wait_event(mddev->sb_wait, mddev->flags==0); 5192 } 5193 5194 void md_write_end(mddev_t *mddev) 5195 { 5196 if (atomic_dec_and_test(&mddev->writes_pending)) { 5197 if (mddev->safemode == 2) 5198 md_wakeup_thread(mddev->thread); 5199 else if (mddev->safemode_delay) 5200 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5201 } 5202 } 5203 5204 /* md_allow_write(mddev) 5205 * Calling this ensures that the array is marked 'active' so that writes 5206 * may proceed without blocking. It is important to call this before 5207 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5208 * Must be called with mddev_lock held. 5209 */ 5210 void md_allow_write(mddev_t *mddev) 5211 { 5212 if (!mddev->pers) 5213 return; 5214 if (mddev->ro) 5215 return; 5216 5217 spin_lock_irq(&mddev->write_lock); 5218 if (mddev->in_sync) { 5219 mddev->in_sync = 0; 5220 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5221 if (mddev->safemode_delay && 5222 mddev->safemode == 0) 5223 mddev->safemode = 1; 5224 spin_unlock_irq(&mddev->write_lock); 5225 md_update_sb(mddev, 0); 5226 } else 5227 spin_unlock_irq(&mddev->write_lock); 5228 } 5229 EXPORT_SYMBOL_GPL(md_allow_write); 5230 5231 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5232 5233 #define SYNC_MARKS 10 5234 #define SYNC_MARK_STEP (3*HZ) 5235 void md_do_sync(mddev_t *mddev) 5236 { 5237 mddev_t *mddev2; 5238 unsigned int currspeed = 0, 5239 window; 5240 sector_t max_sectors,j, io_sectors; 5241 unsigned long mark[SYNC_MARKS]; 5242 sector_t mark_cnt[SYNC_MARKS]; 5243 int last_mark,m; 5244 struct list_head *tmp; 5245 sector_t last_check; 5246 int skipped = 0; 5247 struct list_head *rtmp; 5248 mdk_rdev_t *rdev; 5249 char *desc; 5250 5251 /* just incase thread restarts... 
*/ 5252 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5253 return; 5254 if (mddev->ro) /* never try to sync a read-only array */ 5255 return; 5256 5257 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5258 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5259 desc = "data-check"; 5260 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5261 desc = "requested-resync"; 5262 else 5263 desc = "resync"; 5264 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5265 desc = "reshape"; 5266 else 5267 desc = "recovery"; 5268 5269 /* we overload curr_resync somewhat here. 5270 * 0 == not engaged in resync at all 5271 * 2 == checking that there is no conflict with another sync 5272 * 1 == like 2, but have yielded to allow conflicting resync to 5273 * commense 5274 * other == active in resync - this many blocks 5275 * 5276 * Before starting a resync we must have set curr_resync to 5277 * 2, and then checked that every "conflicting" array has curr_resync 5278 * less than ours. When we find one that is the same or higher 5279 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5280 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5281 * This will mean we have to start checking from the beginning again. 5282 * 5283 */ 5284 5285 do { 5286 mddev->curr_resync = 2; 5287 5288 try_again: 5289 if (kthread_should_stop()) { 5290 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5291 goto skip; 5292 } 5293 ITERATE_MDDEV(mddev2,tmp) { 5294 if (mddev2 == mddev) 5295 continue; 5296 if (mddev2->curr_resync && 5297 match_mddev_units(mddev,mddev2)) { 5298 DEFINE_WAIT(wq); 5299 if (mddev < mddev2 && mddev->curr_resync == 2) { 5300 /* arbitrarily yield */ 5301 mddev->curr_resync = 1; 5302 wake_up(&resync_wait); 5303 } 5304 if (mddev > mddev2 && mddev->curr_resync == 1) 5305 /* no need to wait here, we can wait the next 5306 * time 'round when curr_resync == 2 5307 */ 5308 continue; 5309 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5310 if (!kthread_should_stop() && 5311 mddev2->curr_resync >= mddev->curr_resync) { 5312 printk(KERN_INFO "md: delaying %s of %s" 5313 " until %s has finished (they" 5314 " share one or more physical units)\n", 5315 desc, mdname(mddev), mdname(mddev2)); 5316 mddev_put(mddev2); 5317 schedule(); 5318 finish_wait(&resync_wait, &wq); 5319 goto try_again; 5320 } 5321 finish_wait(&resync_wait, &wq); 5322 } 5323 } 5324 } while (mddev->curr_resync < 2); 5325 5326 j = 0; 5327 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5328 /* resync follows the size requested by the personality, 5329 * which defaults to physical size, but can be virtual size 5330 */ 5331 max_sectors = mddev->resync_max_sectors; 5332 mddev->resync_mismatches = 0; 5333 /* we don't use the checkpoint if there's a bitmap */ 5334 if (!mddev->bitmap && 5335 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5336 j = mddev->recovery_cp; 5337 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5338 max_sectors = mddev->size << 1; 5339 else { 5340 /* recovery follows the physical size of devices */ 5341 max_sectors = mddev->size << 1; 5342 j = MaxSector; 5343 ITERATE_RDEV(mddev,rdev,rtmp) 5344 if (rdev->raid_disk >= 0 && 5345 !test_bit(Faulty, &rdev->flags) && 5346 !test_bit(In_sync, &rdev->flags) && 5347 rdev->recovery_offset < j) 5348 j = rdev->recovery_offset; 5349 } 5350 5351 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5352 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5353 " %d KB/sec/disk.\n", speed_min(mddev)); 5354 
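	/*
	 * Worked example of the throttling at the bottom of the main loop
	 * below (the numbers are illustrative only): if io_sectors has
	 * advanced 8192 sectors (4096 KB) past resync_mark_cnt in 2
	 * seconds, then
	 *
	 *	currspeed = 4096 / (2 + 1) + 1 = 1366 KB/sec
	 *
	 * which is above a speed_min() of 1000 KB/sec, so the loop only
	 * inserts an msleep(500) and retries if currspeed also exceeds
	 * speed_max(mddev) or is_mddev_idle() sees non-resync IO.
	 */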
printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5355 "(but not more than %d KB/sec) for %s.\n", 5356 speed_max(mddev), desc); 5357 5358 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5359 5360 io_sectors = 0; 5361 for (m = 0; m < SYNC_MARKS; m++) { 5362 mark[m] = jiffies; 5363 mark_cnt[m] = io_sectors; 5364 } 5365 last_mark = 0; 5366 mddev->resync_mark = mark[last_mark]; 5367 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5368 5369 /* 5370 * Tune reconstruction: 5371 */ 5372 window = 32*(PAGE_SIZE/512); 5373 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5374 window/2,(unsigned long long) max_sectors/2); 5375 5376 atomic_set(&mddev->recovery_active, 0); 5377 init_waitqueue_head(&mddev->recovery_wait); 5378 last_check = 0; 5379 5380 if (j>2) { 5381 printk(KERN_INFO 5382 "md: resuming %s of %s from checkpoint.\n", 5383 desc, mdname(mddev)); 5384 mddev->curr_resync = j; 5385 } 5386 5387 while (j < max_sectors) { 5388 sector_t sectors; 5389 5390 skipped = 0; 5391 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5392 currspeed < speed_min(mddev)); 5393 if (sectors == 0) { 5394 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5395 goto out; 5396 } 5397 5398 if (!skipped) { /* actual IO requested */ 5399 io_sectors += sectors; 5400 atomic_add(sectors, &mddev->recovery_active); 5401 } 5402 5403 j += sectors; 5404 if (j>1) mddev->curr_resync = j; 5405 mddev->curr_mark_cnt = io_sectors; 5406 if (last_check == 0) 5407 /* this is the earliers that rebuilt will be 5408 * visible in /proc/mdstat 5409 */ 5410 md_new_event(mddev); 5411 5412 if (last_check + window > io_sectors || j == max_sectors) 5413 continue; 5414 5415 last_check = io_sectors; 5416 5417 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5418 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5419 break; 5420 5421 repeat: 5422 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5423 /* step marks */ 5424 int next = (last_mark+1) % SYNC_MARKS; 5425 5426 mddev->resync_mark = mark[next]; 5427 mddev->resync_mark_cnt = mark_cnt[next]; 5428 mark[next] = jiffies; 5429 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5430 last_mark = next; 5431 } 5432 5433 5434 if (kthread_should_stop()) { 5435 /* 5436 * got a signal, exit. 5437 */ 5438 printk(KERN_INFO 5439 "md: md_do_sync() got signal ... exiting\n"); 5440 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5441 goto out; 5442 } 5443 5444 /* 5445 * this loop exits only if either when we are slower than 5446 * the 'hard' speed limit, or the system was IO-idle for 5447 * a jiffy. 5448 * the system might be non-idle CPU-wise, but we only care 5449 * about not overloading the IO subsystem. 
(things like an 5450 * e2fsck being done on the RAID array should execute fast) 5451 */ 5452 mddev->queue->unplug_fn(mddev->queue); 5453 cond_resched(); 5454 5455 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5456 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5457 5458 if (currspeed > speed_min(mddev)) { 5459 if ((currspeed > speed_max(mddev)) || 5460 !is_mddev_idle(mddev)) { 5461 msleep(500); 5462 goto repeat; 5463 } 5464 } 5465 } 5466 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5467 /* 5468 * this also signals 'finished resyncing' to md_stop 5469 */ 5470 out: 5471 mddev->queue->unplug_fn(mddev->queue); 5472 5473 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5474 5475 /* tell personality that we are finished */ 5476 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5477 5478 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5479 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5480 mddev->curr_resync > 2) { 5481 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5483 if (mddev->curr_resync >= mddev->recovery_cp) { 5484 printk(KERN_INFO 5485 "md: checkpointing %s of %s.\n", 5486 desc, mdname(mddev)); 5487 mddev->recovery_cp = mddev->curr_resync; 5488 } 5489 } else 5490 mddev->recovery_cp = MaxSector; 5491 } else { 5492 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5493 mddev->curr_resync = MaxSector; 5494 ITERATE_RDEV(mddev,rdev,rtmp) 5495 if (rdev->raid_disk >= 0 && 5496 !test_bit(Faulty, &rdev->flags) && 5497 !test_bit(In_sync, &rdev->flags) && 5498 rdev->recovery_offset < mddev->curr_resync) 5499 rdev->recovery_offset = mddev->curr_resync; 5500 } 5501 } 5502 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5503 5504 skip: 5505 mddev->curr_resync = 0; 5506 wake_up(&resync_wait); 5507 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5508 md_wakeup_thread(mddev->thread); 5509 } 5510 EXPORT_SYMBOL_GPL(md_do_sync); 5511 5512 5513 static int remove_and_add_spares(mddev_t *mddev) 5514 { 5515 mdk_rdev_t *rdev; 5516 struct list_head *rtmp; 5517 int spares = 0; 5518 5519 ITERATE_RDEV(mddev,rdev,rtmp) 5520 if (rdev->raid_disk >= 0 && 5521 (test_bit(Faulty, &rdev->flags) || 5522 ! test_bit(In_sync, &rdev->flags)) && 5523 atomic_read(&rdev->nr_pending)==0) { 5524 if (mddev->pers->hot_remove_disk( 5525 mddev, rdev->raid_disk)==0) { 5526 char nm[20]; 5527 sprintf(nm,"rd%d", rdev->raid_disk); 5528 sysfs_remove_link(&mddev->kobj, nm); 5529 rdev->raid_disk = -1; 5530 } 5531 } 5532 5533 if (mddev->degraded) { 5534 ITERATE_RDEV(mddev,rdev,rtmp) 5535 if (rdev->raid_disk < 0 5536 && !test_bit(Faulty, &rdev->flags)) { 5537 rdev->recovery_offset = 0; 5538 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5539 char nm[20]; 5540 sprintf(nm, "rd%d", rdev->raid_disk); 5541 if (sysfs_create_link(&mddev->kobj, 5542 &rdev->kobj, nm)) 5543 printk(KERN_WARNING 5544 "md: cannot register " 5545 "%s for %s\n", 5546 nm, mdname(mddev)); 5547 spares++; 5548 md_new_event(mddev); 5549 } else 5550 break; 5551 } 5552 } 5553 return spares; 5554 } 5555 /* 5556 * This routine is regularly called by all per-raid-array threads to 5557 * deal with generic issues like resync and super-block update. 5558 * Raid personalities that don't have a thread (linear/raid0) do not 5559 * need this as they never do any recovery or update the superblock. 5560 * 5561 * It does not do any resync itself, but rather "forks" off other threads 5562 * to do that as needed. 
5563 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5564 * "->recovery" and create a thread at ->sync_thread. 5565 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5566 * and wakeups up this thread which will reap the thread and finish up. 5567 * This thread also removes any faulty devices (with nr_pending == 0). 5568 * 5569 * The overall approach is: 5570 * 1/ if the superblock needs updating, update it. 5571 * 2/ If a recovery thread is running, don't do anything else. 5572 * 3/ If recovery has finished, clean up, possibly marking spares active. 5573 * 4/ If there are any faulty devices, remove them. 5574 * 5/ If array is degraded, try to add spares devices 5575 * 6/ If array has spares or is not in-sync, start a resync thread. 5576 */ 5577 void md_check_recovery(mddev_t *mddev) 5578 { 5579 mdk_rdev_t *rdev; 5580 struct list_head *rtmp; 5581 5582 5583 if (mddev->bitmap) 5584 bitmap_daemon_work(mddev->bitmap); 5585 5586 if (mddev->ro) 5587 return; 5588 5589 if (signal_pending(current)) { 5590 if (mddev->pers->sync_request) { 5591 printk(KERN_INFO "md: %s in immediate safe mode\n", 5592 mdname(mddev)); 5593 mddev->safemode = 2; 5594 } 5595 flush_signals(current); 5596 } 5597 5598 if ( ! ( 5599 mddev->flags || 5600 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5601 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5602 (mddev->safemode == 1) || 5603 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 5604 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5605 )) 5606 return; 5607 5608 if (mddev_trylock(mddev)) { 5609 int spares = 0; 5610 5611 spin_lock_irq(&mddev->write_lock); 5612 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5613 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5614 mddev->in_sync = 1; 5615 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5616 } 5617 if (mddev->safemode == 1) 5618 mddev->safemode = 0; 5619 spin_unlock_irq(&mddev->write_lock); 5620 5621 if (mddev->flags) 5622 md_update_sb(mddev, 0); 5623 5624 5625 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5626 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5627 /* resync/recovery still happening */ 5628 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5629 goto unlock; 5630 } 5631 if (mddev->sync_thread) { 5632 /* resync has finished, collect result */ 5633 md_unregister_thread(mddev->sync_thread); 5634 mddev->sync_thread = NULL; 5635 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5636 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5637 /* success...*/ 5638 /* activate any spares */ 5639 mddev->pers->spare_active(mddev); 5640 } 5641 md_update_sb(mddev, 1); 5642 5643 /* if array is no-longer degraded, then any saved_raid_disk 5644 * information must be scrapped 5645 */ 5646 if (!mddev->degraded) 5647 ITERATE_RDEV(mddev,rdev,rtmp) 5648 rdev->saved_raid_disk = -1; 5649 5650 mddev->recovery = 0; 5651 /* flag recovery needed just to double check */ 5652 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5653 md_new_event(mddev); 5654 goto unlock; 5655 } 5656 /* Clear some bits that don't mean anything, but 5657 * might be left set 5658 */ 5659 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5660 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5661 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5662 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5663 5664 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5665 goto unlock; 5666 /* no recovery is running. 
5667 * remove any failed drives, then 5668 * add spares if possible. 5669 * Spare are also removed and re-added, to allow 5670 * the personality to fail the re-add. 5671 */ 5672 5673 if (mddev->reshape_position != MaxSector) { 5674 if (mddev->pers->check_reshape(mddev) != 0) 5675 /* Cannot proceed */ 5676 goto unlock; 5677 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5678 } else if ((spares = remove_and_add_spares(mddev))) { 5679 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5680 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5681 } else if (mddev->recovery_cp < MaxSector) { 5682 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5683 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5684 /* nothing to be done ... */ 5685 goto unlock; 5686 5687 if (mddev->pers->sync_request) { 5688 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5689 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 5690 /* We are adding a device or devices to an array 5691 * which has the bitmap stored on all devices. 5692 * So make sure all bitmap pages get written 5693 */ 5694 bitmap_write_all(mddev->bitmap); 5695 } 5696 mddev->sync_thread = md_register_thread(md_do_sync, 5697 mddev, 5698 "%s_resync"); 5699 if (!mddev->sync_thread) { 5700 printk(KERN_ERR "%s: could not start resync" 5701 " thread...\n", 5702 mdname(mddev)); 5703 /* leave the spares where they are, it shouldn't hurt */ 5704 mddev->recovery = 0; 5705 } else 5706 md_wakeup_thread(mddev->sync_thread); 5707 md_new_event(mddev); 5708 } 5709 unlock: 5710 mddev_unlock(mddev); 5711 } 5712 } 5713 5714 static int md_notify_reboot(struct notifier_block *this, 5715 unsigned long code, void *x) 5716 { 5717 struct list_head *tmp; 5718 mddev_t *mddev; 5719 5720 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5721 5722 printk(KERN_INFO "md: stopping all md devices.\n"); 5723 5724 ITERATE_MDDEV(mddev,tmp) 5725 if (mddev_trylock(mddev)) { 5726 do_md_stop (mddev, 1); 5727 mddev_unlock(mddev); 5728 } 5729 /* 5730 * certain more exotic SCSI devices are known to be 5731 * volatile wrt too early system reboots. While the 5732 * right place to handle this issue is the given 5733 * driver, we do want to have a safe RAID driver ... 5734 */ 5735 mdelay(1000*1); 5736 } 5737 return NOTIFY_DONE; 5738 } 5739 5740 static struct notifier_block md_notifier = { 5741 .notifier_call = md_notify_reboot, 5742 .next = NULL, 5743 .priority = INT_MAX, /* before any real devices */ 5744 }; 5745 5746 static void md_geninit(void) 5747 { 5748 struct proc_dir_entry *p; 5749 5750 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5751 5752 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5753 if (p) 5754 p->proc_fops = &md_seq_fops; 5755 } 5756 5757 static int __init md_init(void) 5758 { 5759 if (register_blkdev(MAJOR_NR, "md")) 5760 return -1; 5761 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5762 unregister_blkdev(MAJOR_NR, "md"); 5763 return -1; 5764 } 5765 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 5766 md_probe, NULL, NULL); 5767 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 5768 md_probe, NULL, NULL); 5769 5770 register_reboot_notifier(&md_notifier); 5771 raid_table_header = register_sysctl_table(raid_root_table); 5772 5773 md_geninit(); 5774 return (0); 5775 } 5776 5777 5778 #ifndef MODULE 5779 5780 /* 5781 * Searches all registered partitions for autorun RAID arrays 5782 * at boot time. 
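 *
 * (The candidate partitions get here through md_autodetect_dev() below,
 * which the partition-table parsing code typically calls for partitions
 * tagged as Linux raid autodetect - type 0xfd in a DOS partition table;
 * autostart_arrays() then imports each one assuming a 0.90 superblock and
 * queues it on pending_raid_disks for autorun_devices().)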
5783 */ 5784 static dev_t detected_devices[128]; 5785 static int dev_cnt; 5786 5787 void md_autodetect_dev(dev_t dev) 5788 { 5789 if (dev_cnt >= 0 && dev_cnt < 127) 5790 detected_devices[dev_cnt++] = dev; 5791 } 5792 5793 5794 static void autostart_arrays(int part) 5795 { 5796 mdk_rdev_t *rdev; 5797 int i; 5798 5799 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5800 5801 for (i = 0; i < dev_cnt; i++) { 5802 dev_t dev = detected_devices[i]; 5803 5804 rdev = md_import_device(dev,0, 90); 5805 if (IS_ERR(rdev)) 5806 continue; 5807 5808 if (test_bit(Faulty, &rdev->flags)) { 5809 MD_BUG(); 5810 continue; 5811 } 5812 list_add(&rdev->same_set, &pending_raid_disks); 5813 } 5814 dev_cnt = 0; 5815 5816 autorun_devices(part); 5817 } 5818 5819 #endif /* !MODULE */ 5820 5821 static __exit void md_exit(void) 5822 { 5823 mddev_t *mddev; 5824 struct list_head *tmp; 5825 5826 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 5827 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 5828 5829 unregister_blkdev(MAJOR_NR,"md"); 5830 unregister_blkdev(mdp_major, "mdp"); 5831 unregister_reboot_notifier(&md_notifier); 5832 unregister_sysctl_table(raid_table_header); 5833 remove_proc_entry("mdstat", NULL); 5834 ITERATE_MDDEV(mddev,tmp) { 5835 struct gendisk *disk = mddev->gendisk; 5836 if (!disk) 5837 continue; 5838 export_array(mddev); 5839 del_gendisk(disk); 5840 put_disk(disk); 5841 mddev->gendisk = NULL; 5842 mddev_put(mddev); 5843 } 5844 } 5845 5846 subsys_initcall(md_init); 5847 module_exit(md_exit) 5848 5849 static int get_ro(char *buffer, struct kernel_param *kp) 5850 { 5851 return sprintf(buffer, "%d", start_readonly); 5852 } 5853 static int set_ro(const char *val, struct kernel_param *kp) 5854 { 5855 char *e; 5856 int num = simple_strtoul(val, &e, 10); 5857 if (*val && (*e == '\0' || *e == '\n')) { 5858 start_readonly = num; 5859 return 0; 5860 } 5861 return -EINVAL; 5862 } 5863 5864 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 5865 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 5866 5867 5868 EXPORT_SYMBOL(register_md_personality); 5869 EXPORT_SYMBOL(unregister_md_personality); 5870 EXPORT_SYMBOL(md_error); 5871 EXPORT_SYMBOL(md_done_sync); 5872 EXPORT_SYMBOL(md_write_start); 5873 EXPORT_SYMBOL(md_write_end); 5874 EXPORT_SYMBOL(md_register_thread); 5875 EXPORT_SYMBOL(md_unregister_thread); 5876 EXPORT_SYMBOL(md_wakeup_thread); 5877 EXPORT_SYMBOL(md_check_recovery); 5878 MODULE_LICENSE("GPL"); 5879 MODULE_ALIAS("md"); 5880 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 5881
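
/*
 * Illustrative userspace sketch (not compiled as part of this driver,
 * hence the #if 0): one way a monitoring tool could use the mdstat_poll()
 * support above - read /proc/mdstat once, then poll() for POLLPRI, which
 * is raised whenever md_new_event() bumps md_event_count, and re-read the
 * file from the start after each wakeup.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>

int main(void)
{
	char buf[4096];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/proc/mdstat", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLPRI;

	for (;;) {
		/* consume the current snapshot, then wait for a change */
		lseek(pfd.fd, 0, SEEK_SET);
		while ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	return 0;
}
#endif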