1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/kthread.h> 36 #include <linux/blkdev.h> 37 #include <linux/sysctl.h> 38 #include <linux/seq_file.h> 39 #include <linux/buffer_head.h> /* for invalidate_bdev */ 40 #include <linux/poll.h> 41 #include <linux/ctype.h> 42 #include <linux/hdreg.h> 43 #include <linux/proc_fs.h> 44 #include <linux/random.h> 45 #include <linux/reboot.h> 46 #include <linux/file.h> 47 #include <linux/delay.h> 48 #include <linux/raid/md_p.h> 49 #include <linux/raid/md_u.h> 50 #include "md.h" 51 #include "bitmap.h" 52 53 #define DEBUG 0 54 #define dprintk(x...) ((void)(DEBUG && printk(x))) 55 56 57 #ifndef MODULE 58 static void autostart_arrays(int part); 59 #endif 60 61 static LIST_HEAD(pers_list); 62 static DEFINE_SPINLOCK(pers_lock); 63 64 static void md_print_devices(void); 65 66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 67 68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 69 70 /* 71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 72 * is 1000 KB/sec, so the extra system load does not show up that much. 73 * Increase it if you want to have more _guaranteed_ speed. Note that 74 * the RAID driver will use the maximum available bandwidth if the IO 75 * subsystem is idle. There is also an 'absolute maximum' reconstruction 76 * speed limit - in case reconstruction slows down your system despite 77 * idle IO detection. 78 * 79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 80 * or /sys/block/mdX/md/sync_speed_{min,max} 81 */ 82 83 static int sysctl_speed_limit_min = 1000; 84 static int sysctl_speed_limit_max = 200000; 85 static inline int speed_min(mddev_t *mddev) 86 { 87 return mddev->sync_speed_min ? 88 mddev->sync_speed_min : sysctl_speed_limit_min; 89 } 90 91 static inline int speed_max(mddev_t *mddev) 92 { 93 return mddev->sync_speed_max ? 
94 mddev->sync_speed_max : sysctl_speed_limit_max; 95 } 96 97 static struct ctl_table_header *raid_table_header; 98 99 static ctl_table raid_table[] = { 100 { 101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 102 .procname = "speed_limit_min", 103 .data = &sysctl_speed_limit_min, 104 .maxlen = sizeof(int), 105 .mode = S_IRUGO|S_IWUSR, 106 .proc_handler = &proc_dointvec, 107 }, 108 { 109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 110 .procname = "speed_limit_max", 111 .data = &sysctl_speed_limit_max, 112 .maxlen = sizeof(int), 113 .mode = S_IRUGO|S_IWUSR, 114 .proc_handler = &proc_dointvec, 115 }, 116 { .ctl_name = 0 } 117 }; 118 119 static ctl_table raid_dir_table[] = { 120 { 121 .ctl_name = DEV_RAID, 122 .procname = "raid", 123 .maxlen = 0, 124 .mode = S_IRUGO|S_IXUGO, 125 .child = raid_table, 126 }, 127 { .ctl_name = 0 } 128 }; 129 130 static ctl_table raid_root_table[] = { 131 { 132 .ctl_name = CTL_DEV, 133 .procname = "dev", 134 .maxlen = 0, 135 .mode = 0555, 136 .child = raid_dir_table, 137 }, 138 { .ctl_name = 0 } 139 }; 140 141 static const struct block_device_operations md_fops; 142 143 static int start_readonly; 144 145 /* 146 * We have a system wide 'event count' that is incremented 147 * on any 'interesting' event, and readers of /proc/mdstat 148 * can use 'poll' or 'select' to find out when the event 149 * count increases. 150 * 151 * Events are: 152 * start array, stop array, error, add device, remove device, 153 * start build, activate spare 154 */ 155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 156 static atomic_t md_event_count; 157 void md_new_event(mddev_t *mddev) 158 { 159 atomic_inc(&md_event_count); 160 wake_up(&md_event_waiters); 161 } 162 EXPORT_SYMBOL_GPL(md_new_event); 163 164 /* Alternate version that can be called from interrupts 165 * when calling sysfs_notify isn't needed. 166 */ 167 static void md_new_event_inintr(mddev_t *mddev) 168 { 169 atomic_inc(&md_event_count); 170 wake_up(&md_event_waiters); 171 } 172 173 /* 174 * Enables to iterate over all existing md arrays 175 * all_mddevs_lock protects this list. 176 */ 177 static LIST_HEAD(all_mddevs); 178 static DEFINE_SPINLOCK(all_mddevs_lock); 179 180 181 /* 182 * iterates through all used mddevs in the system. 183 * We take care to grab the all_mddevs_lock whenever navigating 184 * the list, and to always hold a refcount when unlocked. 185 * Any code which breaks out of this loop while own 186 * a reference to the current mddev and must mddev_put it. 187 */ 188 #define for_each_mddev(mddev,tmp) \ 189 \ 190 for (({ spin_lock(&all_mddevs_lock); \ 191 tmp = all_mddevs.next; \ 192 mddev = NULL;}); \ 193 ({ if (tmp != &all_mddevs) \ 194 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 195 spin_unlock(&all_mddevs_lock); \ 196 if (mddev) mddev_put(mddev); \ 197 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 198 tmp != &all_mddevs;}); \ 199 ({ spin_lock(&all_mddevs_lock); \ 200 tmp = tmp->next;}) \ 201 ) 202 203 204 /* Rather than calling directly into the personality make_request function, 205 * IO requests come here first so that we can check if the device is 206 * being suspended pending a reconfiguration. 207 * We hold a refcount over the call to ->make_request. By the time that 208 * call has finished, the bio has been linked into some internal structure 209 * and so is visible to ->quiesce(), so we don't need the refcount any more. 
210 */ 211 static int md_make_request(struct request_queue *q, struct bio *bio) 212 { 213 mddev_t *mddev = q->queuedata; 214 int rv; 215 if (mddev == NULL || mddev->pers == NULL) { 216 bio_io_error(bio); 217 return 0; 218 } 219 rcu_read_lock(); 220 if (mddev->suspended) { 221 DEFINE_WAIT(__wait); 222 for (;;) { 223 prepare_to_wait(&mddev->sb_wait, &__wait, 224 TASK_UNINTERRUPTIBLE); 225 if (!mddev->suspended) 226 break; 227 rcu_read_unlock(); 228 schedule(); 229 rcu_read_lock(); 230 } 231 finish_wait(&mddev->sb_wait, &__wait); 232 } 233 atomic_inc(&mddev->active_io); 234 rcu_read_unlock(); 235 rv = mddev->pers->make_request(q, bio); 236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 237 wake_up(&mddev->sb_wait); 238 239 return rv; 240 } 241 242 static void mddev_suspend(mddev_t *mddev) 243 { 244 BUG_ON(mddev->suspended); 245 mddev->suspended = 1; 246 synchronize_rcu(); 247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 248 mddev->pers->quiesce(mddev, 1); 249 md_unregister_thread(mddev->thread); 250 mddev->thread = NULL; 251 /* we now know that no code is executing in the personality module, 252 * except possibly the tail end of a ->bi_end_io function, but that 253 * is certain to complete before the module has a chance to get 254 * unloaded 255 */ 256 } 257 258 static void mddev_resume(mddev_t *mddev) 259 { 260 mddev->suspended = 0; 261 wake_up(&mddev->sb_wait); 262 mddev->pers->quiesce(mddev, 0); 263 } 264 265 int mddev_congested(mddev_t *mddev, int bits) 266 { 267 return mddev->suspended; 268 } 269 EXPORT_SYMBOL(mddev_congested); 270 271 272 static inline mddev_t *mddev_get(mddev_t *mddev) 273 { 274 atomic_inc(&mddev->active); 275 return mddev; 276 } 277 278 static void mddev_delayed_delete(struct work_struct *ws); 279 280 static void mddev_put(mddev_t *mddev) 281 { 282 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 283 return; 284 if (!mddev->raid_disks && list_empty(&mddev->disks) && 285 !mddev->hold_active) { 286 list_del(&mddev->all_mddevs); 287 if (mddev->gendisk) { 288 /* we did a probe so need to clean up. 289 * Call schedule_work inside the spinlock 290 * so that flush_scheduled_work() after 291 * mddev_find will succeed in waiting for the 292 * work to be done. 293 */ 294 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 295 schedule_work(&mddev->del_work); 296 } else 297 kfree(mddev); 298 } 299 spin_unlock(&all_mddevs_lock); 300 } 301 302 static mddev_t * mddev_find(dev_t unit) 303 { 304 mddev_t *mddev, *new = NULL; 305 306 retry: 307 spin_lock(&all_mddevs_lock); 308 309 if (unit) { 310 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 311 if (mddev->unit == unit) { 312 mddev_get(mddev); 313 spin_unlock(&all_mddevs_lock); 314 kfree(new); 315 return mddev; 316 } 317 318 if (new) { 319 list_add(&new->all_mddevs, &all_mddevs); 320 spin_unlock(&all_mddevs_lock); 321 new->hold_active = UNTIL_IOCTL; 322 return new; 323 } 324 } else if (new) { 325 /* find an unused unit number */ 326 static int next_minor = 512; 327 int start = next_minor; 328 int is_free = 0; 329 int dev = 0; 330 while (!is_free) { 331 dev = MKDEV(MD_MAJOR, next_minor); 332 next_minor++; 333 if (next_minor > MINORMASK) 334 next_minor = 0; 335 if (next_minor == start) { 336 /* Oh dear, all in use. 
*/ 337 spin_unlock(&all_mddevs_lock); 338 kfree(new); 339 return NULL; 340 } 341 342 is_free = 1; 343 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 344 if (mddev->unit == dev) { 345 is_free = 0; 346 break; 347 } 348 } 349 new->unit = dev; 350 new->md_minor = MINOR(dev); 351 new->hold_active = UNTIL_STOP; 352 list_add(&new->all_mddevs, &all_mddevs); 353 spin_unlock(&all_mddevs_lock); 354 return new; 355 } 356 spin_unlock(&all_mddevs_lock); 357 358 new = kzalloc(sizeof(*new), GFP_KERNEL); 359 if (!new) 360 return NULL; 361 362 new->unit = unit; 363 if (MAJOR(unit) == MD_MAJOR) 364 new->md_minor = MINOR(unit); 365 else 366 new->md_minor = MINOR(unit) >> MdpMinorShift; 367 368 mutex_init(&new->open_mutex); 369 mutex_init(&new->reconfig_mutex); 370 INIT_LIST_HEAD(&new->disks); 371 INIT_LIST_HEAD(&new->all_mddevs); 372 init_timer(&new->safemode_timer); 373 atomic_set(&new->active, 1); 374 atomic_set(&new->openers, 0); 375 atomic_set(&new->active_io, 0); 376 spin_lock_init(&new->write_lock); 377 init_waitqueue_head(&new->sb_wait); 378 init_waitqueue_head(&new->recovery_wait); 379 new->reshape_position = MaxSector; 380 new->resync_min = 0; 381 new->resync_max = MaxSector; 382 new->level = LEVEL_NONE; 383 384 goto retry; 385 } 386 387 static inline int mddev_lock(mddev_t * mddev) 388 { 389 return mutex_lock_interruptible(&mddev->reconfig_mutex); 390 } 391 392 static inline int mddev_is_locked(mddev_t *mddev) 393 { 394 return mutex_is_locked(&mddev->reconfig_mutex); 395 } 396 397 static inline int mddev_trylock(mddev_t * mddev) 398 { 399 return mutex_trylock(&mddev->reconfig_mutex); 400 } 401 402 static inline void mddev_unlock(mddev_t * mddev) 403 { 404 mutex_unlock(&mddev->reconfig_mutex); 405 406 md_wakeup_thread(mddev->thread); 407 } 408 409 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 410 { 411 mdk_rdev_t *rdev; 412 413 list_for_each_entry(rdev, &mddev->disks, same_set) 414 if (rdev->desc_nr == nr) 415 return rdev; 416 417 return NULL; 418 } 419 420 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 421 { 422 mdk_rdev_t *rdev; 423 424 list_for_each_entry(rdev, &mddev->disks, same_set) 425 if (rdev->bdev->bd_dev == dev) 426 return rdev; 427 428 return NULL; 429 } 430 431 static struct mdk_personality *find_pers(int level, char *clevel) 432 { 433 struct mdk_personality *pers; 434 list_for_each_entry(pers, &pers_list, list) { 435 if (level != LEVEL_NONE && pers->level == level) 436 return pers; 437 if (strcmp(pers->name, clevel)==0) 438 return pers; 439 } 440 return NULL; 441 } 442 443 /* return the offset of the super block in 512byte sectors */ 444 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 445 { 446 sector_t num_sectors = bdev->bd_inode->i_size / 512; 447 return MD_NEW_SIZE_SECTORS(num_sectors); 448 } 449 450 static int alloc_disk_sb(mdk_rdev_t * rdev) 451 { 452 if (rdev->sb_page) 453 MD_BUG(); 454 455 rdev->sb_page = alloc_page(GFP_KERNEL); 456 if (!rdev->sb_page) { 457 printk(KERN_ALERT "md: out of memory.\n"); 458 return -ENOMEM; 459 } 460 461 return 0; 462 } 463 464 static void free_disk_sb(mdk_rdev_t * rdev) 465 { 466 if (rdev->sb_page) { 467 put_page(rdev->sb_page); 468 rdev->sb_loaded = 0; 469 rdev->sb_page = NULL; 470 rdev->sb_start = 0; 471 rdev->sectors = 0; 472 } 473 } 474 475 476 static void super_written(struct bio *bio, int error) 477 { 478 mdk_rdev_t *rdev = bio->bi_private; 479 mddev_t *mddev = rdev->mddev; 480 481 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 482 printk("md: super_written gets error=%d, uptodate=%d\n", 
483 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 484 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 485 md_error(mddev, rdev); 486 } 487 488 if (atomic_dec_and_test(&mddev->pending_writes)) 489 wake_up(&mddev->sb_wait); 490 bio_put(bio); 491 } 492 493 static void super_written_barrier(struct bio *bio, int error) 494 { 495 struct bio *bio2 = bio->bi_private; 496 mdk_rdev_t *rdev = bio2->bi_private; 497 mddev_t *mddev = rdev->mddev; 498 499 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 500 error == -EOPNOTSUPP) { 501 unsigned long flags; 502 /* barriers don't appear to be supported :-( */ 503 set_bit(BarriersNotsupp, &rdev->flags); 504 mddev->barriers_work = 0; 505 spin_lock_irqsave(&mddev->write_lock, flags); 506 bio2->bi_next = mddev->biolist; 507 mddev->biolist = bio2; 508 spin_unlock_irqrestore(&mddev->write_lock, flags); 509 wake_up(&mddev->sb_wait); 510 bio_put(bio); 511 } else { 512 bio_put(bio2); 513 bio->bi_private = rdev; 514 super_written(bio, error); 515 } 516 } 517 518 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 519 sector_t sector, int size, struct page *page) 520 { 521 /* write first size bytes of page to sector of rdev 522 * Increment mddev->pending_writes before returning 523 * and decrement it on completion, waking up sb_wait 524 * if zero is reached. 525 * If an error occurred, call md_error 526 * 527 * As we might need to resubmit the request if BIO_RW_BARRIER 528 * causes ENOTSUPP, we allocate a spare bio... 529 */ 530 struct bio *bio = bio_alloc(GFP_NOIO, 1); 531 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 532 533 bio->bi_bdev = rdev->bdev; 534 bio->bi_sector = sector; 535 bio_add_page(bio, page, size, 0); 536 bio->bi_private = rdev; 537 bio->bi_end_io = super_written; 538 bio->bi_rw = rw; 539 540 atomic_inc(&mddev->pending_writes); 541 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 542 struct bio *rbio; 543 rw |= (1<<BIO_RW_BARRIER); 544 rbio = bio_clone(bio, GFP_NOIO); 545 rbio->bi_private = bio; 546 rbio->bi_end_io = super_written_barrier; 547 submit_bio(rw, rbio); 548 } else 549 submit_bio(rw, bio); 550 } 551 552 void md_super_wait(mddev_t *mddev) 553 { 554 /* wait for all superblock writes that were scheduled to complete. 
555 * if any had to be retried (due to BARRIER problems), retry them 556 */ 557 DEFINE_WAIT(wq); 558 for(;;) { 559 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 560 if (atomic_read(&mddev->pending_writes)==0) 561 break; 562 while (mddev->biolist) { 563 struct bio *bio; 564 spin_lock_irq(&mddev->write_lock); 565 bio = mddev->biolist; 566 mddev->biolist = bio->bi_next ; 567 bio->bi_next = NULL; 568 spin_unlock_irq(&mddev->write_lock); 569 submit_bio(bio->bi_rw, bio); 570 } 571 schedule(); 572 } 573 finish_wait(&mddev->sb_wait, &wq); 574 } 575 576 static void bi_complete(struct bio *bio, int error) 577 { 578 complete((struct completion*)bio->bi_private); 579 } 580 581 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 582 struct page *page, int rw) 583 { 584 struct bio *bio = bio_alloc(GFP_NOIO, 1); 585 struct completion event; 586 int ret; 587 588 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 589 590 bio->bi_bdev = bdev; 591 bio->bi_sector = sector; 592 bio_add_page(bio, page, size, 0); 593 init_completion(&event); 594 bio->bi_private = &event; 595 bio->bi_end_io = bi_complete; 596 submit_bio(rw, bio); 597 wait_for_completion(&event); 598 599 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 600 bio_put(bio); 601 return ret; 602 } 603 EXPORT_SYMBOL_GPL(sync_page_io); 604 605 static int read_disk_sb(mdk_rdev_t * rdev, int size) 606 { 607 char b[BDEVNAME_SIZE]; 608 if (!rdev->sb_page) { 609 MD_BUG(); 610 return -EINVAL; 611 } 612 if (rdev->sb_loaded) 613 return 0; 614 615 616 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 617 goto fail; 618 rdev->sb_loaded = 1; 619 return 0; 620 621 fail: 622 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 623 bdevname(rdev->bdev,b)); 624 return -EINVAL; 625 } 626 627 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 628 { 629 return sb1->set_uuid0 == sb2->set_uuid0 && 630 sb1->set_uuid1 == sb2->set_uuid1 && 631 sb1->set_uuid2 == sb2->set_uuid2 && 632 sb1->set_uuid3 == sb2->set_uuid3; 633 } 634 635 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 636 { 637 int ret; 638 mdp_super_t *tmp1, *tmp2; 639 640 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 641 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 642 643 if (!tmp1 || !tmp2) { 644 ret = 0; 645 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 646 goto abort; 647 } 648 649 *tmp1 = *sb1; 650 *tmp2 = *sb2; 651 652 /* 653 * nr_disks is not constant 654 */ 655 tmp1->nr_disks = 0; 656 tmp2->nr_disks = 0; 657 658 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 659 abort: 660 kfree(tmp1); 661 kfree(tmp2); 662 return ret; 663 } 664 665 666 static u32 md_csum_fold(u32 csum) 667 { 668 csum = (csum & 0xffff) + (csum >> 16); 669 return (csum & 0xffff) + (csum >> 16); 670 } 671 672 static unsigned int calc_sb_csum(mdp_super_t * sb) 673 { 674 u64 newcsum = 0; 675 u32 *sb32 = (u32*)sb; 676 int i; 677 unsigned int disk_csum, csum; 678 679 disk_csum = sb->sb_csum; 680 sb->sb_csum = 0; 681 682 for (i = 0; i < MD_SB_BYTES/4 ; i++) 683 newcsum += sb32[i]; 684 csum = (newcsum & 0xffffffff) + (newcsum>>32); 685 686 687 #ifdef CONFIG_ALPHA 688 /* This used to use csum_partial, which was wrong for several 689 * reasons including that different results are returned on 690 * different architectures. It isn't critical that we get exactly 691 * the same return value as before (we always csum_fold before 692 * testing, and that removes any differences). 
However as we 693 * know that csum_partial always returned a 16bit value on 694 * alphas, do a fold to maximise conformity to previous behaviour. 695 */ 696 sb->sb_csum = md_csum_fold(disk_csum); 697 #else 698 sb->sb_csum = disk_csum; 699 #endif 700 return csum; 701 } 702 703 704 /* 705 * Handle superblock details. 706 * We want to be able to handle multiple superblock formats 707 * so we have a common interface to them all, and an array of 708 * different handlers. 709 * We rely on user-space to write the initial superblock, and support 710 * reading and updating of superblocks. 711 * Interface methods are: 712 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 713 * loads and validates a superblock on dev. 714 * if refdev != NULL, compare superblocks on both devices 715 * Return: 716 * 0 - dev has a superblock that is compatible with refdev 717 * 1 - dev has a superblock that is compatible and newer than refdev 718 * so dev should be used as the refdev in future 719 * -EINVAL superblock incompatible or invalid 720 * -othererror e.g. -EIO 721 * 722 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 723 * Verify that dev is acceptable into mddev. 724 * The first time, mddev->raid_disks will be 0, and data from 725 * dev should be merged in. Subsequent calls check that dev 726 * is new enough. Return 0 or -EINVAL 727 * 728 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 729 * Update the superblock for rdev with data in mddev 730 * This does not write to disc. 731 * 732 */ 733 734 struct super_type { 735 char *name; 736 struct module *owner; 737 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 738 int minor_version); 739 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 740 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 741 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 742 sector_t num_sectors); 743 }; 744 745 /* 746 * Check that the given mddev has no bitmap. 747 * 748 * This function is called from the run method of all personalities that do not 749 * support bitmaps. It prints an error message and returns non-zero if mddev 750 * has a bitmap. Otherwise, it returns 0. 751 * 752 */ 753 int md_check_no_bitmap(mddev_t *mddev) 754 { 755 if (!mddev->bitmap_file && !mddev->bitmap_offset) 756 return 0; 757 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 758 mdname(mddev), mddev->pers->name); 759 return 1; 760 } 761 EXPORT_SYMBOL(md_check_no_bitmap); 762 763 /* 764 * load_super for 0.90.0 765 */ 766 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 767 { 768 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 769 mdp_super_t *sb; 770 int ret; 771 772 /* 773 * Calculate the position of the superblock (512byte sectors), 774 * it's at the end of the disk. 775 * 776 * It also happens to be a multiple of 4Kb. 
777 */ 778 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 779 780 ret = read_disk_sb(rdev, MD_SB_BYTES); 781 if (ret) return ret; 782 783 ret = -EINVAL; 784 785 bdevname(rdev->bdev, b); 786 sb = (mdp_super_t*)page_address(rdev->sb_page); 787 788 if (sb->md_magic != MD_SB_MAGIC) { 789 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 790 b); 791 goto abort; 792 } 793 794 if (sb->major_version != 0 || 795 sb->minor_version < 90 || 796 sb->minor_version > 91) { 797 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 798 sb->major_version, sb->minor_version, 799 b); 800 goto abort; 801 } 802 803 if (sb->raid_disks <= 0) 804 goto abort; 805 806 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 807 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 808 b); 809 goto abort; 810 } 811 812 rdev->preferred_minor = sb->md_minor; 813 rdev->data_offset = 0; 814 rdev->sb_size = MD_SB_BYTES; 815 816 if (sb->level == LEVEL_MULTIPATH) 817 rdev->desc_nr = -1; 818 else 819 rdev->desc_nr = sb->this_disk.number; 820 821 if (!refdev) { 822 ret = 1; 823 } else { 824 __u64 ev1, ev2; 825 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 826 if (!uuid_equal(refsb, sb)) { 827 printk(KERN_WARNING "md: %s has different UUID to %s\n", 828 b, bdevname(refdev->bdev,b2)); 829 goto abort; 830 } 831 if (!sb_equal(refsb, sb)) { 832 printk(KERN_WARNING "md: %s has same UUID" 833 " but different superblock to %s\n", 834 b, bdevname(refdev->bdev, b2)); 835 goto abort; 836 } 837 ev1 = md_event(sb); 838 ev2 = md_event(refsb); 839 if (ev1 > ev2) 840 ret = 1; 841 else 842 ret = 0; 843 } 844 rdev->sectors = rdev->sb_start; 845 846 if (rdev->sectors < sb->size * 2 && sb->level > 1) 847 /* "this cannot possibly happen" ... */ 848 ret = -EINVAL; 849 850 abort: 851 return ret; 852 } 853 854 /* 855 * validate_super for 0.90.0 856 */ 857 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 858 { 859 mdp_disk_t *desc; 860 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 861 __u64 ev1 = md_event(sb); 862 863 rdev->raid_disk = -1; 864 clear_bit(Faulty, &rdev->flags); 865 clear_bit(In_sync, &rdev->flags); 866 clear_bit(WriteMostly, &rdev->flags); 867 clear_bit(BarriersNotsupp, &rdev->flags); 868 869 if (mddev->raid_disks == 0) { 870 mddev->major_version = 0; 871 mddev->minor_version = sb->minor_version; 872 mddev->patch_version = sb->patch_version; 873 mddev->external = 0; 874 mddev->chunk_sectors = sb->chunk_size >> 9; 875 mddev->ctime = sb->ctime; 876 mddev->utime = sb->utime; 877 mddev->level = sb->level; 878 mddev->clevel[0] = 0; 879 mddev->layout = sb->layout; 880 mddev->raid_disks = sb->raid_disks; 881 mddev->dev_sectors = sb->size * 2; 882 mddev->events = ev1; 883 mddev->bitmap_offset = 0; 884 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 885 886 if (mddev->minor_version >= 91) { 887 mddev->reshape_position = sb->reshape_position; 888 mddev->delta_disks = sb->delta_disks; 889 mddev->new_level = sb->new_level; 890 mddev->new_layout = sb->new_layout; 891 mddev->new_chunk_sectors = sb->new_chunk >> 9; 892 } else { 893 mddev->reshape_position = MaxSector; 894 mddev->delta_disks = 0; 895 mddev->new_level = mddev->level; 896 mddev->new_layout = mddev->layout; 897 mddev->new_chunk_sectors = mddev->chunk_sectors; 898 } 899 900 if (sb->state & (1<<MD_SB_CLEAN)) 901 mddev->recovery_cp = MaxSector; 902 else { 903 if (sb->events_hi == sb->cp_events_hi && 904 sb->events_lo == sb->cp_events_lo) { 905 mddev->recovery_cp = sb->recovery_cp; 906 } else 907 
mddev->recovery_cp = 0; 908 } 909 910 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 911 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 912 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 913 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 914 915 mddev->max_disks = MD_SB_DISKS; 916 917 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 918 mddev->bitmap_file == NULL) 919 mddev->bitmap_offset = mddev->default_bitmap_offset; 920 921 } else if (mddev->pers == NULL) { 922 /* Insist on good event counter while assembling */ 923 ++ev1; 924 if (ev1 < mddev->events) 925 return -EINVAL; 926 } else if (mddev->bitmap) { 927 /* if adding to array with a bitmap, then we can accept an 928 * older device ... but not too old. 929 */ 930 if (ev1 < mddev->bitmap->events_cleared) 931 return 0; 932 } else { 933 if (ev1 < mddev->events) 934 /* just a hot-add of a new device, leave raid_disk at -1 */ 935 return 0; 936 } 937 938 if (mddev->level != LEVEL_MULTIPATH) { 939 desc = sb->disks + rdev->desc_nr; 940 941 if (desc->state & (1<<MD_DISK_FAULTY)) 942 set_bit(Faulty, &rdev->flags); 943 else if (desc->state & (1<<MD_DISK_SYNC) /* && 944 desc->raid_disk < mddev->raid_disks */) { 945 set_bit(In_sync, &rdev->flags); 946 rdev->raid_disk = desc->raid_disk; 947 } 948 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 949 set_bit(WriteMostly, &rdev->flags); 950 } else /* MULTIPATH are always insync */ 951 set_bit(In_sync, &rdev->flags); 952 return 0; 953 } 954 955 /* 956 * sync_super for 0.90.0 957 */ 958 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 959 { 960 mdp_super_t *sb; 961 mdk_rdev_t *rdev2; 962 int next_spare = mddev->raid_disks; 963 964 965 /* make rdev->sb match mddev data.. 966 * 967 * 1/ zero out disks 968 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 969 * 3/ any empty disks < next_spare become removed 970 * 971 * disks[0] gets initialised to REMOVED because 972 * we cannot be sure from other fields if it has 973 * been initialised or not. 
974 */ 975 int i; 976 int active=0, working=0,failed=0,spare=0,nr_disks=0; 977 978 rdev->sb_size = MD_SB_BYTES; 979 980 sb = (mdp_super_t*)page_address(rdev->sb_page); 981 982 memset(sb, 0, sizeof(*sb)); 983 984 sb->md_magic = MD_SB_MAGIC; 985 sb->major_version = mddev->major_version; 986 sb->patch_version = mddev->patch_version; 987 sb->gvalid_words = 0; /* ignored */ 988 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 989 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 990 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 991 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 992 993 sb->ctime = mddev->ctime; 994 sb->level = mddev->level; 995 sb->size = mddev->dev_sectors / 2; 996 sb->raid_disks = mddev->raid_disks; 997 sb->md_minor = mddev->md_minor; 998 sb->not_persistent = 0; 999 sb->utime = mddev->utime; 1000 sb->state = 0; 1001 sb->events_hi = (mddev->events>>32); 1002 sb->events_lo = (u32)mddev->events; 1003 1004 if (mddev->reshape_position == MaxSector) 1005 sb->minor_version = 90; 1006 else { 1007 sb->minor_version = 91; 1008 sb->reshape_position = mddev->reshape_position; 1009 sb->new_level = mddev->new_level; 1010 sb->delta_disks = mddev->delta_disks; 1011 sb->new_layout = mddev->new_layout; 1012 sb->new_chunk = mddev->new_chunk_sectors << 9; 1013 } 1014 mddev->minor_version = sb->minor_version; 1015 if (mddev->in_sync) 1016 { 1017 sb->recovery_cp = mddev->recovery_cp; 1018 sb->cp_events_hi = (mddev->events>>32); 1019 sb->cp_events_lo = (u32)mddev->events; 1020 if (mddev->recovery_cp == MaxSector) 1021 sb->state = (1<< MD_SB_CLEAN); 1022 } else 1023 sb->recovery_cp = 0; 1024 1025 sb->layout = mddev->layout; 1026 sb->chunk_size = mddev->chunk_sectors << 9; 1027 1028 if (mddev->bitmap && mddev->bitmap_file == NULL) 1029 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1030 1031 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1032 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1033 mdp_disk_t *d; 1034 int desc_nr; 1035 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1036 && !test_bit(Faulty, &rdev2->flags)) 1037 desc_nr = rdev2->raid_disk; 1038 else 1039 desc_nr = next_spare++; 1040 rdev2->desc_nr = desc_nr; 1041 d = &sb->disks[rdev2->desc_nr]; 1042 nr_disks++; 1043 d->number = rdev2->desc_nr; 1044 d->major = MAJOR(rdev2->bdev->bd_dev); 1045 d->minor = MINOR(rdev2->bdev->bd_dev); 1046 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1047 && !test_bit(Faulty, &rdev2->flags)) 1048 d->raid_disk = rdev2->raid_disk; 1049 else 1050 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1051 if (test_bit(Faulty, &rdev2->flags)) 1052 d->state = (1<<MD_DISK_FAULTY); 1053 else if (test_bit(In_sync, &rdev2->flags)) { 1054 d->state = (1<<MD_DISK_ACTIVE); 1055 d->state |= (1<<MD_DISK_SYNC); 1056 active++; 1057 working++; 1058 } else { 1059 d->state = 0; 1060 spare++; 1061 working++; 1062 } 1063 if (test_bit(WriteMostly, &rdev2->flags)) 1064 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1065 } 1066 /* now set the "removed" and "faulty" bits on any missing devices */ 1067 for (i=0 ; i < mddev->raid_disks ; i++) { 1068 mdp_disk_t *d = &sb->disks[i]; 1069 if (d->state == 0 && d->number == 0) { 1070 d->number = i; 1071 d->raid_disk = i; 1072 d->state = (1<<MD_DISK_REMOVED); 1073 d->state |= (1<<MD_DISK_FAULTY); 1074 failed++; 1075 } 1076 } 1077 sb->nr_disks = nr_disks; 1078 sb->active_disks = active; 1079 sb->working_disks = working; 1080 sb->failed_disks = failed; 1081 sb->spare_disks = spare; 1082 1083 sb->this_disk = sb->disks[rdev->desc_nr]; 1084 sb->sb_csum = calc_sb_csum(sb); 1085 } 1086 1087 /* 1088 * 
rdev_size_change for 0.90.0 1089 */ 1090 static unsigned long long 1091 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1092 { 1093 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1094 return 0; /* component must fit device */ 1095 if (rdev->mddev->bitmap_offset) 1096 return 0; /* can't move bitmap */ 1097 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1098 if (!num_sectors || num_sectors > rdev->sb_start) 1099 num_sectors = rdev->sb_start; 1100 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1101 rdev->sb_page); 1102 md_super_wait(rdev->mddev); 1103 return num_sectors / 2; /* kB for sysfs */ 1104 } 1105 1106 1107 /* 1108 * version 1 superblock 1109 */ 1110 1111 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1112 { 1113 __le32 disk_csum; 1114 u32 csum; 1115 unsigned long long newcsum; 1116 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1117 __le32 *isuper = (__le32*)sb; 1118 int i; 1119 1120 disk_csum = sb->sb_csum; 1121 sb->sb_csum = 0; 1122 newcsum = 0; 1123 for (i=0; size>=4; size -= 4 ) 1124 newcsum += le32_to_cpu(*isuper++); 1125 1126 if (size == 2) 1127 newcsum += le16_to_cpu(*(__le16*) isuper); 1128 1129 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1130 sb->sb_csum = disk_csum; 1131 return cpu_to_le32(csum); 1132 } 1133 1134 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1135 { 1136 struct mdp_superblock_1 *sb; 1137 int ret; 1138 sector_t sb_start; 1139 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1140 int bmask; 1141 1142 /* 1143 * Calculate the position of the superblock in 512byte sectors. 1144 * It is always aligned to a 4K boundary and 1145 * depeding on minor_version, it can be: 1146 * 0: At least 8K, but less than 12K, from end of device 1147 * 1: At start of device 1148 * 2: 4K from start of device. 
1149 */ 1150 switch(minor_version) { 1151 case 0: 1152 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1153 sb_start -= 8*2; 1154 sb_start &= ~(sector_t)(4*2-1); 1155 break; 1156 case 1: 1157 sb_start = 0; 1158 break; 1159 case 2: 1160 sb_start = 8; 1161 break; 1162 default: 1163 return -EINVAL; 1164 } 1165 rdev->sb_start = sb_start; 1166 1167 /* superblock is rarely larger than 1K, but it can be larger, 1168 * and it is safe to read 4k, so we do that 1169 */ 1170 ret = read_disk_sb(rdev, 4096); 1171 if (ret) return ret; 1172 1173 1174 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1175 1176 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1177 sb->major_version != cpu_to_le32(1) || 1178 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1179 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1180 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1181 return -EINVAL; 1182 1183 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1184 printk("md: invalid superblock checksum on %s\n", 1185 bdevname(rdev->bdev,b)); 1186 return -EINVAL; 1187 } 1188 if (le64_to_cpu(sb->data_size) < 10) { 1189 printk("md: data_size too small on %s\n", 1190 bdevname(rdev->bdev,b)); 1191 return -EINVAL; 1192 } 1193 1194 rdev->preferred_minor = 0xffff; 1195 rdev->data_offset = le64_to_cpu(sb->data_offset); 1196 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1197 1198 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1199 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1200 if (rdev->sb_size & bmask) 1201 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1202 1203 if (minor_version 1204 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1205 return -EINVAL; 1206 1207 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1208 rdev->desc_nr = -1; 1209 else 1210 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1211 1212 if (!refdev) { 1213 ret = 1; 1214 } else { 1215 __u64 ev1, ev2; 1216 struct mdp_superblock_1 *refsb = 1217 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1218 1219 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1220 sb->level != refsb->level || 1221 sb->layout != refsb->layout || 1222 sb->chunksize != refsb->chunksize) { 1223 printk(KERN_WARNING "md: %s has strangely different" 1224 " superblock to %s\n", 1225 bdevname(rdev->bdev,b), 1226 bdevname(refdev->bdev,b2)); 1227 return -EINVAL; 1228 } 1229 ev1 = le64_to_cpu(sb->events); 1230 ev2 = le64_to_cpu(refsb->events); 1231 1232 if (ev1 > ev2) 1233 ret = 1; 1234 else 1235 ret = 0; 1236 } 1237 if (minor_version) 1238 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1239 le64_to_cpu(sb->data_offset); 1240 else 1241 rdev->sectors = rdev->sb_start; 1242 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1243 return -EINVAL; 1244 rdev->sectors = le64_to_cpu(sb->data_size); 1245 if (le64_to_cpu(sb->size) > rdev->sectors) 1246 return -EINVAL; 1247 return ret; 1248 } 1249 1250 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1251 { 1252 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1253 __u64 ev1 = le64_to_cpu(sb->events); 1254 1255 rdev->raid_disk = -1; 1256 clear_bit(Faulty, &rdev->flags); 1257 clear_bit(In_sync, &rdev->flags); 1258 clear_bit(WriteMostly, &rdev->flags); 1259 clear_bit(BarriersNotsupp, &rdev->flags); 1260 1261 if (mddev->raid_disks == 0) { 1262 mddev->major_version = 1; 1263 mddev->patch_version = 0; 1264 mddev->external = 0; 1265 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1266 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1267 
mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1268 mddev->level = le32_to_cpu(sb->level); 1269 mddev->clevel[0] = 0; 1270 mddev->layout = le32_to_cpu(sb->layout); 1271 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1272 mddev->dev_sectors = le64_to_cpu(sb->size); 1273 mddev->events = ev1; 1274 mddev->bitmap_offset = 0; 1275 mddev->default_bitmap_offset = 1024 >> 9; 1276 1277 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1278 memcpy(mddev->uuid, sb->set_uuid, 16); 1279 1280 mddev->max_disks = (4096-256)/2; 1281 1282 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1283 mddev->bitmap_file == NULL ) 1284 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1285 1286 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1287 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1288 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1289 mddev->new_level = le32_to_cpu(sb->new_level); 1290 mddev->new_layout = le32_to_cpu(sb->new_layout); 1291 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1292 } else { 1293 mddev->reshape_position = MaxSector; 1294 mddev->delta_disks = 0; 1295 mddev->new_level = mddev->level; 1296 mddev->new_layout = mddev->layout; 1297 mddev->new_chunk_sectors = mddev->chunk_sectors; 1298 } 1299 1300 } else if (mddev->pers == NULL) { 1301 /* Insist of good event counter while assembling */ 1302 ++ev1; 1303 if (ev1 < mddev->events) 1304 return -EINVAL; 1305 } else if (mddev->bitmap) { 1306 /* If adding to array with a bitmap, then we can accept an 1307 * older device, but not too old. 1308 */ 1309 if (ev1 < mddev->bitmap->events_cleared) 1310 return 0; 1311 } else { 1312 if (ev1 < mddev->events) 1313 /* just a hot-add of a new device, leave raid_disk at -1 */ 1314 return 0; 1315 } 1316 if (mddev->level != LEVEL_MULTIPATH) { 1317 int role; 1318 if (rdev->desc_nr < 0 || 1319 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1320 role = 0xffff; 1321 rdev->desc_nr = -1; 1322 } else 1323 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1324 switch(role) { 1325 case 0xffff: /* spare */ 1326 break; 1327 case 0xfffe: /* faulty */ 1328 set_bit(Faulty, &rdev->flags); 1329 break; 1330 default: 1331 if ((le32_to_cpu(sb->feature_map) & 1332 MD_FEATURE_RECOVERY_OFFSET)) 1333 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1334 else 1335 set_bit(In_sync, &rdev->flags); 1336 rdev->raid_disk = role; 1337 break; 1338 } 1339 if (sb->devflags & WriteMostly1) 1340 set_bit(WriteMostly, &rdev->flags); 1341 } else /* MULTIPATH are always insync */ 1342 set_bit(In_sync, &rdev->flags); 1343 1344 return 0; 1345 } 1346 1347 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1348 { 1349 struct mdp_superblock_1 *sb; 1350 mdk_rdev_t *rdev2; 1351 int max_dev, i; 1352 /* make rdev->sb match mddev and rdev data. 
*/ 1353 1354 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1355 1356 sb->feature_map = 0; 1357 sb->pad0 = 0; 1358 sb->recovery_offset = cpu_to_le64(0); 1359 memset(sb->pad1, 0, sizeof(sb->pad1)); 1360 memset(sb->pad2, 0, sizeof(sb->pad2)); 1361 memset(sb->pad3, 0, sizeof(sb->pad3)); 1362 1363 sb->utime = cpu_to_le64((__u64)mddev->utime); 1364 sb->events = cpu_to_le64(mddev->events); 1365 if (mddev->in_sync) 1366 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1367 else 1368 sb->resync_offset = cpu_to_le64(0); 1369 1370 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1371 1372 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1373 sb->size = cpu_to_le64(mddev->dev_sectors); 1374 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1375 sb->level = cpu_to_le32(mddev->level); 1376 sb->layout = cpu_to_le32(mddev->layout); 1377 1378 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1379 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1380 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1381 } 1382 1383 if (rdev->raid_disk >= 0 && 1384 !test_bit(In_sync, &rdev->flags)) { 1385 if (mddev->curr_resync_completed > rdev->recovery_offset) 1386 rdev->recovery_offset = mddev->curr_resync_completed; 1387 if (rdev->recovery_offset > 0) { 1388 sb->feature_map |= 1389 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1390 sb->recovery_offset = 1391 cpu_to_le64(rdev->recovery_offset); 1392 } 1393 } 1394 1395 if (mddev->reshape_position != MaxSector) { 1396 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1397 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1398 sb->new_layout = cpu_to_le32(mddev->new_layout); 1399 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1400 sb->new_level = cpu_to_le32(mddev->new_level); 1401 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1402 } 1403 1404 max_dev = 0; 1405 list_for_each_entry(rdev2, &mddev->disks, same_set) 1406 if (rdev2->desc_nr+1 > max_dev) 1407 max_dev = rdev2->desc_nr+1; 1408 1409 if (max_dev > le32_to_cpu(sb->max_dev)) { 1410 int bmask; 1411 sb->max_dev = cpu_to_le32(max_dev); 1412 rdev->sb_size = max_dev * 2 + 256; 1413 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1414 if (rdev->sb_size & bmask) 1415 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1416 } 1417 for (i=0; i<max_dev;i++) 1418 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1419 1420 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1421 i = rdev2->desc_nr; 1422 if (test_bit(Faulty, &rdev2->flags)) 1423 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1424 else if (test_bit(In_sync, &rdev2->flags)) 1425 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1426 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1427 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1428 else 1429 sb->dev_roles[i] = cpu_to_le16(0xffff); 1430 } 1431 1432 sb->sb_csum = calc_sb_1_csum(sb); 1433 } 1434 1435 static unsigned long long 1436 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1437 { 1438 struct mdp_superblock_1 *sb; 1439 sector_t max_sectors; 1440 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1441 return 0; /* component must fit device */ 1442 if (rdev->sb_start < rdev->data_offset) { 1443 /* minor versions 1 and 2; superblock before data */ 1444 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1445 max_sectors -= rdev->data_offset; 1446 if (!num_sectors || num_sectors > max_sectors) 1447 num_sectors = max_sectors; 1448 } else if (rdev->mddev->bitmap_offset) { 1449 /* minor 
version 0 with bitmap we can't move */ 1450 return 0; 1451 } else { 1452 /* minor version 0; superblock after data */ 1453 sector_t sb_start; 1454 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1455 sb_start &= ~(sector_t)(4*2 - 1); 1456 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1457 if (!num_sectors || num_sectors > max_sectors) 1458 num_sectors = max_sectors; 1459 rdev->sb_start = sb_start; 1460 } 1461 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1462 sb->data_size = cpu_to_le64(num_sectors); 1463 sb->super_offset = rdev->sb_start; 1464 sb->sb_csum = calc_sb_1_csum(sb); 1465 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1466 rdev->sb_page); 1467 md_super_wait(rdev->mddev); 1468 return num_sectors / 2; /* kB for sysfs */ 1469 } 1470 1471 static struct super_type super_types[] = { 1472 [0] = { 1473 .name = "0.90.0", 1474 .owner = THIS_MODULE, 1475 .load_super = super_90_load, 1476 .validate_super = super_90_validate, 1477 .sync_super = super_90_sync, 1478 .rdev_size_change = super_90_rdev_size_change, 1479 }, 1480 [1] = { 1481 .name = "md-1", 1482 .owner = THIS_MODULE, 1483 .load_super = super_1_load, 1484 .validate_super = super_1_validate, 1485 .sync_super = super_1_sync, 1486 .rdev_size_change = super_1_rdev_size_change, 1487 }, 1488 }; 1489 1490 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1491 { 1492 mdk_rdev_t *rdev, *rdev2; 1493 1494 rcu_read_lock(); 1495 rdev_for_each_rcu(rdev, mddev1) 1496 rdev_for_each_rcu(rdev2, mddev2) 1497 if (rdev->bdev->bd_contains == 1498 rdev2->bdev->bd_contains) { 1499 rcu_read_unlock(); 1500 return 1; 1501 } 1502 rcu_read_unlock(); 1503 return 0; 1504 } 1505 1506 static LIST_HEAD(pending_raid_disks); 1507 1508 /* 1509 * Try to register data integrity profile for an mddev 1510 * 1511 * This is called when an array is started and after a disk has been kicked 1512 * from the array. It only succeeds if all working and active component devices 1513 * are integrity capable with matching profiles. 1514 */ 1515 int md_integrity_register(mddev_t *mddev) 1516 { 1517 mdk_rdev_t *rdev, *reference = NULL; 1518 1519 if (list_empty(&mddev->disks)) 1520 return 0; /* nothing to do */ 1521 if (blk_get_integrity(mddev->gendisk)) 1522 return 0; /* already registered */ 1523 list_for_each_entry(rdev, &mddev->disks, same_set) { 1524 /* skip spares and non-functional disks */ 1525 if (test_bit(Faulty, &rdev->flags)) 1526 continue; 1527 if (rdev->raid_disk < 0) 1528 continue; 1529 /* 1530 * If at least one rdev is not integrity capable, we can not 1531 * enable data integrity for the md device. 1532 */ 1533 if (!bdev_get_integrity(rdev->bdev)) 1534 return -EINVAL; 1535 if (!reference) { 1536 /* Use the first rdev as the reference */ 1537 reference = rdev; 1538 continue; 1539 } 1540 /* does this rdev's profile match the reference profile? */ 1541 if (blk_integrity_compare(reference->bdev->bd_disk, 1542 rdev->bdev->bd_disk) < 0) 1543 return -EINVAL; 1544 } 1545 /* 1546 * All component devices are integrity capable and have matching 1547 * profiles, register the common profile for the md device. 
1548 */ 1549 if (blk_integrity_register(mddev->gendisk, 1550 bdev_get_integrity(reference->bdev)) != 0) { 1551 printk(KERN_ERR "md: failed to register integrity for %s\n", 1552 mdname(mddev)); 1553 return -EINVAL; 1554 } 1555 printk(KERN_NOTICE "md: data integrity on %s enabled\n", 1556 mdname(mddev)); 1557 return 0; 1558 } 1559 EXPORT_SYMBOL(md_integrity_register); 1560 1561 /* Disable data integrity if non-capable/non-matching disk is being added */ 1562 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 1563 { 1564 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1565 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); 1566 1567 if (!bi_mddev) /* nothing to do */ 1568 return; 1569 if (rdev->raid_disk < 0) /* skip spares */ 1570 return; 1571 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 1572 rdev->bdev->bd_disk) >= 0) 1573 return; 1574 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 1575 blk_integrity_unregister(mddev->gendisk); 1576 } 1577 EXPORT_SYMBOL(md_integrity_add_rdev); 1578 1579 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1580 { 1581 char b[BDEVNAME_SIZE]; 1582 struct kobject *ko; 1583 char *s; 1584 int err; 1585 1586 if (rdev->mddev) { 1587 MD_BUG(); 1588 return -EINVAL; 1589 } 1590 1591 /* prevent duplicates */ 1592 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1593 return -EEXIST; 1594 1595 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 1596 if (rdev->sectors && (mddev->dev_sectors == 0 || 1597 rdev->sectors < mddev->dev_sectors)) { 1598 if (mddev->pers) { 1599 /* Cannot change size, so fail 1600 * If mddev->level <= 0, then we don't care 1601 * about aligning sizes (e.g. linear) 1602 */ 1603 if (mddev->level > 0) 1604 return -ENOSPC; 1605 } else 1606 mddev->dev_sectors = rdev->sectors; 1607 } 1608 1609 /* Verify rdev->desc_nr is unique. 
1610 * If it is -1, assign a free number, else 1611 * check number is not in use 1612 */ 1613 if (rdev->desc_nr < 0) { 1614 int choice = 0; 1615 if (mddev->pers) choice = mddev->raid_disks; 1616 while (find_rdev_nr(mddev, choice)) 1617 choice++; 1618 rdev->desc_nr = choice; 1619 } else { 1620 if (find_rdev_nr(mddev, rdev->desc_nr)) 1621 return -EBUSY; 1622 } 1623 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 1624 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 1625 mdname(mddev), mddev->max_disks); 1626 return -EBUSY; 1627 } 1628 bdevname(rdev->bdev,b); 1629 while ( (s=strchr(b, '/')) != NULL) 1630 *s = '!'; 1631 1632 rdev->mddev = mddev; 1633 printk(KERN_INFO "md: bind<%s>\n", b); 1634 1635 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1636 goto fail; 1637 1638 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1639 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1640 kobject_del(&rdev->kobj); 1641 goto fail; 1642 } 1643 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1644 1645 list_add_rcu(&rdev->same_set, &mddev->disks); 1646 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1647 1648 /* May as well allow recovery to be retried once */ 1649 mddev->recovery_disabled = 0; 1650 1651 return 0; 1652 1653 fail: 1654 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1655 b, mdname(mddev)); 1656 return err; 1657 } 1658 1659 static void md_delayed_delete(struct work_struct *ws) 1660 { 1661 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1662 kobject_del(&rdev->kobj); 1663 kobject_put(&rdev->kobj); 1664 } 1665 1666 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1667 { 1668 char b[BDEVNAME_SIZE]; 1669 if (!rdev->mddev) { 1670 MD_BUG(); 1671 return; 1672 } 1673 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1674 list_del_rcu(&rdev->same_set); 1675 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1676 rdev->mddev = NULL; 1677 sysfs_remove_link(&rdev->kobj, "block"); 1678 sysfs_put(rdev->sysfs_state); 1679 rdev->sysfs_state = NULL; 1680 /* We need to delay this, otherwise we can deadlock when 1681 * writing to 'remove' to "dev/state". We also need 1682 * to delay it due to rcu usage. 1683 */ 1684 synchronize_rcu(); 1685 INIT_WORK(&rdev->del_work, md_delayed_delete); 1686 kobject_get(&rdev->kobj); 1687 schedule_work(&rdev->del_work); 1688 } 1689 1690 /* 1691 * prevent the device from being mounted, repartitioned or 1692 * otherwise reused by a RAID array (or any other kernel 1693 * subsystem), by bd_claiming the device. 1694 */ 1695 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1696 { 1697 int err = 0; 1698 struct block_device *bdev; 1699 char b[BDEVNAME_SIZE]; 1700 1701 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1702 if (IS_ERR(bdev)) { 1703 printk(KERN_ERR "md: could not open %s.\n", 1704 __bdevname(dev, b)); 1705 return PTR_ERR(bdev); 1706 } 1707 err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); 1708 if (err) { 1709 printk(KERN_ERR "md: could not bd_claim %s.\n", 1710 bdevname(bdev, b)); 1711 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1712 return err; 1713 } 1714 if (!shared) 1715 set_bit(AllReserved, &rdev->flags); 1716 rdev->bdev = bdev; 1717 return err; 1718 } 1719 1720 static void unlock_rdev(mdk_rdev_t *rdev) 1721 { 1722 struct block_device *bdev = rdev->bdev; 1723 rdev->bdev = NULL; 1724 if (!bdev) 1725 MD_BUG(); 1726 bd_release(bdev); 1727 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1728 } 1729 1730 void md_autodetect_dev(dev_t dev); 1731 1732 static void export_rdev(mdk_rdev_t * rdev) 1733 { 1734 char b[BDEVNAME_SIZE]; 1735 printk(KERN_INFO "md: export_rdev(%s)\n", 1736 bdevname(rdev->bdev,b)); 1737 if (rdev->mddev) 1738 MD_BUG(); 1739 free_disk_sb(rdev); 1740 #ifndef MODULE 1741 if (test_bit(AutoDetected, &rdev->flags)) 1742 md_autodetect_dev(rdev->bdev->bd_dev); 1743 #endif 1744 unlock_rdev(rdev); 1745 kobject_put(&rdev->kobj); 1746 } 1747 1748 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1749 { 1750 unbind_rdev_from_array(rdev); 1751 export_rdev(rdev); 1752 } 1753 1754 static void export_array(mddev_t *mddev) 1755 { 1756 mdk_rdev_t *rdev, *tmp; 1757 1758 rdev_for_each(rdev, tmp, mddev) { 1759 if (!rdev->mddev) { 1760 MD_BUG(); 1761 continue; 1762 } 1763 kick_rdev_from_array(rdev); 1764 } 1765 if (!list_empty(&mddev->disks)) 1766 MD_BUG(); 1767 mddev->raid_disks = 0; 1768 mddev->major_version = 0; 1769 } 1770 1771 static void print_desc(mdp_disk_t *desc) 1772 { 1773 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1774 desc->major,desc->minor,desc->raid_disk,desc->state); 1775 } 1776 1777 static void print_sb_90(mdp_super_t *sb) 1778 { 1779 int i; 1780 1781 printk(KERN_INFO 1782 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1783 sb->major_version, sb->minor_version, sb->patch_version, 1784 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1785 sb->ctime); 1786 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1787 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1788 sb->md_minor, sb->layout, sb->chunk_size); 1789 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1790 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1791 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1792 sb->failed_disks, sb->spare_disks, 1793 sb->sb_csum, (unsigned long)sb->events_lo); 1794 1795 printk(KERN_INFO); 1796 for (i = 0; i < MD_SB_DISKS; i++) { 1797 mdp_disk_t *desc; 1798 1799 desc = sb->disks + i; 1800 if (desc->number || desc->major || desc->minor || 1801 desc->raid_disk || (desc->state && (desc->state != 4))) { 1802 printk(" D %2d: ", i); 1803 print_desc(desc); 1804 } 1805 } 1806 printk(KERN_INFO "md: THIS: "); 1807 print_desc(&sb->this_disk); 1808 } 1809 1810 static void print_sb_1(struct mdp_superblock_1 *sb) 1811 { 1812 __u8 *uuid; 1813 1814 uuid = sb->set_uuid; 1815 printk(KERN_INFO 1816 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1817 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1818 "md: Name: \"%s\" CT:%llu\n", 1819 le32_to_cpu(sb->major_version), 1820 le32_to_cpu(sb->feature_map), 1821 uuid[0], uuid[1], uuid[2], uuid[3], 1822 uuid[4], uuid[5], uuid[6], uuid[7], 1823 uuid[8], uuid[9], uuid[10], uuid[11], 1824 uuid[12], uuid[13], uuid[14], uuid[15], 1825 sb->set_name, 1826 (unsigned long long)le64_to_cpu(sb->ctime) 1827 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1828 1829 uuid = sb->device_uuid; 1830 printk(KERN_INFO 1831 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1832 " 
RO:%llu\n" 1833 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1834 ":%02x%02x%02x%02x%02x%02x\n" 1835 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1836 "md: (MaxDev:%u) \n", 1837 le32_to_cpu(sb->level), 1838 (unsigned long long)le64_to_cpu(sb->size), 1839 le32_to_cpu(sb->raid_disks), 1840 le32_to_cpu(sb->layout), 1841 le32_to_cpu(sb->chunksize), 1842 (unsigned long long)le64_to_cpu(sb->data_offset), 1843 (unsigned long long)le64_to_cpu(sb->data_size), 1844 (unsigned long long)le64_to_cpu(sb->super_offset), 1845 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1846 le32_to_cpu(sb->dev_number), 1847 uuid[0], uuid[1], uuid[2], uuid[3], 1848 uuid[4], uuid[5], uuid[6], uuid[7], 1849 uuid[8], uuid[9], uuid[10], uuid[11], 1850 uuid[12], uuid[13], uuid[14], uuid[15], 1851 sb->devflags, 1852 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1853 (unsigned long long)le64_to_cpu(sb->events), 1854 (unsigned long long)le64_to_cpu(sb->resync_offset), 1855 le32_to_cpu(sb->sb_csum), 1856 le32_to_cpu(sb->max_dev) 1857 ); 1858 } 1859 1860 static void print_rdev(mdk_rdev_t *rdev, int major_version) 1861 { 1862 char b[BDEVNAME_SIZE]; 1863 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 1864 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 1865 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1866 rdev->desc_nr); 1867 if (rdev->sb_loaded) { 1868 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 1869 switch (major_version) { 1870 case 0: 1871 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 1872 break; 1873 case 1: 1874 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 1875 break; 1876 } 1877 } else 1878 printk(KERN_INFO "md: no rdev superblock!\n"); 1879 } 1880 1881 static void md_print_devices(void) 1882 { 1883 struct list_head *tmp; 1884 mdk_rdev_t *rdev; 1885 mddev_t *mddev; 1886 char b[BDEVNAME_SIZE]; 1887 1888 printk("\n"); 1889 printk("md: **********************************\n"); 1890 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1891 printk("md: **********************************\n"); 1892 for_each_mddev(mddev, tmp) { 1893 1894 if (mddev->bitmap) 1895 bitmap_print_sb(mddev->bitmap); 1896 else 1897 printk("%s: ", mdname(mddev)); 1898 list_for_each_entry(rdev, &mddev->disks, same_set) 1899 printk("<%s>", bdevname(rdev->bdev,b)); 1900 printk("\n"); 1901 1902 list_for_each_entry(rdev, &mddev->disks, same_set) 1903 print_rdev(rdev, mddev->major_version); 1904 } 1905 printk("md: **********************************\n"); 1906 printk("\n"); 1907 } 1908 1909 1910 static void sync_sbs(mddev_t * mddev, int nospares) 1911 { 1912 /* Update each superblock (in-memory image), but 1913 * if we are allowed to, skip spares which already 1914 * have the right event counter, or have one earlier 1915 * (which would mean they aren't being marked as dirty 1916 * with the rest of the array) 1917 */ 1918 mdk_rdev_t *rdev; 1919 1920 list_for_each_entry(rdev, &mddev->disks, same_set) { 1921 if (rdev->sb_events == mddev->events || 1922 (nospares && 1923 rdev->raid_disk < 0 && 1924 (rdev->sb_events&1)==0 && 1925 rdev->sb_events+1 == mddev->events)) { 1926 /* Don't update this superblock */ 1927 rdev->sb_loaded = 2; 1928 } else { 1929 super_types[mddev->major_version]. 
1930 sync_super(mddev, rdev); 1931 rdev->sb_loaded = 1; 1932 } 1933 } 1934 } 1935 1936 static void md_update_sb(mddev_t * mddev, int force_change) 1937 { 1938 mdk_rdev_t *rdev; 1939 int sync_req; 1940 int nospares = 0; 1941 1942 mddev->utime = get_seconds(); 1943 if (mddev->external) 1944 return; 1945 repeat: 1946 spin_lock_irq(&mddev->write_lock); 1947 1948 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1949 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1950 force_change = 1; 1951 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1952 /* just a clean<-> dirty transition, possibly leave spares alone, 1953 * though if events isn't the right even/odd, we will have to do 1954 * spares after all 1955 */ 1956 nospares = 1; 1957 if (force_change) 1958 nospares = 0; 1959 if (mddev->degraded) 1960 /* If the array is degraded, then skipping spares is both 1961 * dangerous and fairly pointless. 1962 * Dangerous because a device that was removed from the array 1963 * might have a event_count that still looks up-to-date, 1964 * so it can be re-added without a resync. 1965 * Pointless because if there are any spares to skip, 1966 * then a recovery will happen and soon that array won't 1967 * be degraded any more and the spare can go back to sleep then. 1968 */ 1969 nospares = 0; 1970 1971 sync_req = mddev->in_sync; 1972 1973 /* If this is just a dirty<->clean transition, and the array is clean 1974 * and 'events' is odd, we can roll back to the previous clean state */ 1975 if (nospares 1976 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1977 && (mddev->events & 1) 1978 && mddev->events != 1) 1979 mddev->events--; 1980 else { 1981 /* otherwise we have to go forward and ... */ 1982 mddev->events ++; 1983 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1984 /* .. if the array isn't clean, an 'even' event must also go 1985 * to spares. */ 1986 if ((mddev->events&1)==0) 1987 nospares = 0; 1988 } else { 1989 /* otherwise an 'odd' event must go to spares */ 1990 if ((mddev->events&1)) 1991 nospares = 0; 1992 } 1993 } 1994 1995 if (!mddev->events) { 1996 /* 1997 * oops, this 64-bit counter should never wrap. 
1998 * Either we are in around ~1 trillion A.C., assuming
1999 * 1 reboot per second, or we have a bug:
2000 */
2001 MD_BUG();
2002 mddev->events --;
2003 }
2004
2005 /*
2006 * do not write anything to disk if using
2007 * nonpersistent superblocks
2008 */
2009 if (!mddev->persistent) {
2010 if (!mddev->external)
2011 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2012
2013 spin_unlock_irq(&mddev->write_lock);
2014 wake_up(&mddev->sb_wait);
2015 return;
2016 }
2017 sync_sbs(mddev, nospares);
2018 spin_unlock_irq(&mddev->write_lock);
2019
2020 dprintk(KERN_INFO
2021 "md: updating %s RAID superblock on device (in sync %d)\n",
2022 mdname(mddev),mddev->in_sync);
2023
2024 bitmap_update_sb(mddev->bitmap);
2025 list_for_each_entry(rdev, &mddev->disks, same_set) {
2026 char b[BDEVNAME_SIZE];
2027 dprintk(KERN_INFO "md: ");
2028 if (rdev->sb_loaded != 1)
2029 continue; /* no noise on spare devices */
2030 if (test_bit(Faulty, &rdev->flags))
2031 dprintk("(skipping faulty ");
2032
2033 dprintk("%s ", bdevname(rdev->bdev,b));
2034 if (!test_bit(Faulty, &rdev->flags)) {
2035 md_super_write(mddev,rdev,
2036 rdev->sb_start, rdev->sb_size,
2037 rdev->sb_page);
2038 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2039 bdevname(rdev->bdev,b),
2040 (unsigned long long)rdev->sb_start);
2041 rdev->sb_events = mddev->events;
2042
2043 } else
2044 dprintk(")\n");
2045 if (mddev->level == LEVEL_MULTIPATH)
2046 /* only need to write one superblock... */
2047 break;
2048 }
2049 md_super_wait(mddev);
2050 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2051
2052 spin_lock_irq(&mddev->write_lock);
2053 if (mddev->in_sync != sync_req ||
2054 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2055 /* have to write it out again */
2056 spin_unlock_irq(&mddev->write_lock);
2057 goto repeat;
2058 }
2059 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2060 spin_unlock_irq(&mddev->write_lock);
2061 wake_up(&mddev->sb_wait);
2062 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2063 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2064
2065 }
2066
2067 /* words written to sysfs files may, or may not, be \n terminated.
2068 * We want to accept either case. For this we use cmd_match.
2069 */
2070 static int cmd_match(const char *cmd, const char *str)
2071 {
2072 /* See if cmd, written into a sysfs file, matches
2073 * str. They must either be the same, or cmd can
2074 * have a trailing newline
2075 */
2076 while (*cmd && *str && *cmd == *str) {
2077 cmd++;
2078 str++;
2079 }
2080 if (*cmd == '\n')
2081 cmd++;
2082 if (*str || *cmd)
2083 return 0;
2084 return 1;
2085 }
2086
2087 struct rdev_sysfs_entry {
2088 struct attribute attr;
2089 ssize_t (*show)(mdk_rdev_t *, char *);
2090 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2091 };
2092
2093 static ssize_t
2094 state_show(mdk_rdev_t *rdev, char *page)
2095 {
2096 char *sep = "";
2097 size_t len = 0;
2098
2099 if (test_bit(Faulty, &rdev->flags)) {
2100 len+= sprintf(page+len, "%sfaulty",sep);
2101 sep = ",";
2102 }
2103 if (test_bit(In_sync, &rdev->flags)) {
2104 len += sprintf(page+len, "%sin_sync",sep);
2105 sep = ",";
2106 }
2107 if (test_bit(WriteMostly, &rdev->flags)) {
2108 len += sprintf(page+len, "%swrite_mostly",sep);
2109 sep = ",";
2110 }
2111 if (test_bit(Blocked, &rdev->flags)) {
2112 len += sprintf(page+len, "%sblocked", sep);
2113 sep = ",";
2114 }
2115 if (!test_bit(Faulty, &rdev->flags) &&
2116 !test_bit(In_sync, &rdev->flags)) {
2117 len += sprintf(page+len, "%sspare", sep);
2118 sep = ",";
2119 }
2120 return len+sprintf(page+len, "\n");
2121 }
2122
2123 static ssize_t
2124 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2125 {
2126 /* can write
2127 * faulty - simulates an error
2128 * remove - disconnects the device
2129 * writemostly - sets write_mostly
2130 * -writemostly - clears write_mostly
2131 * blocked - sets the Blocked flag
2132 * -blocked - clears the Blocked flag
2133 * insync - sets In_sync provided the device isn't active
2134 */
2135 int err = -EINVAL;
2136 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2137 md_error(rdev->mddev, rdev);
2138 err = 0;
2139 } else if (cmd_match(buf, "remove")) {
2140 if (rdev->raid_disk >= 0)
2141 err = -EBUSY;
2142 else {
2143 mddev_t *mddev = rdev->mddev;
2144 kick_rdev_from_array(rdev);
2145 if (mddev->pers)
2146 md_update_sb(mddev, 1);
2147 md_new_event(mddev);
2148 err = 0;
2149 }
2150 } else if (cmd_match(buf, "writemostly")) {
2151 set_bit(WriteMostly, &rdev->flags);
2152 err = 0;
2153 } else if (cmd_match(buf, "-writemostly")) {
2154 clear_bit(WriteMostly, &rdev->flags);
2155 err = 0;
2156 } else if (cmd_match(buf, "blocked")) {
2157 set_bit(Blocked, &rdev->flags);
2158 err = 0;
2159 } else if (cmd_match(buf, "-blocked")) {
2160 clear_bit(Blocked, &rdev->flags);
2161 wake_up(&rdev->blocked_wait);
2162 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2163 md_wakeup_thread(rdev->mddev->thread);
2164
2165 err = 0;
2166 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2167 set_bit(In_sync, &rdev->flags);
2168 err = 0;
2169 }
2170 if (!err && rdev->sysfs_state)
2171 sysfs_notify_dirent(rdev->sysfs_state);
2172 return err ?
err : len; 2173 } 2174 static struct rdev_sysfs_entry rdev_state = 2175 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2176 2177 static ssize_t 2178 errors_show(mdk_rdev_t *rdev, char *page) 2179 { 2180 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2181 } 2182 2183 static ssize_t 2184 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2185 { 2186 char *e; 2187 unsigned long n = simple_strtoul(buf, &e, 10); 2188 if (*buf && (*e == 0 || *e == '\n')) { 2189 atomic_set(&rdev->corrected_errors, n); 2190 return len; 2191 } 2192 return -EINVAL; 2193 } 2194 static struct rdev_sysfs_entry rdev_errors = 2195 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2196 2197 static ssize_t 2198 slot_show(mdk_rdev_t *rdev, char *page) 2199 { 2200 if (rdev->raid_disk < 0) 2201 return sprintf(page, "none\n"); 2202 else 2203 return sprintf(page, "%d\n", rdev->raid_disk); 2204 } 2205 2206 static ssize_t 2207 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2208 { 2209 char *e; 2210 int err; 2211 char nm[20]; 2212 int slot = simple_strtoul(buf, &e, 10); 2213 if (strncmp(buf, "none", 4)==0) 2214 slot = -1; 2215 else if (e==buf || (*e && *e!= '\n')) 2216 return -EINVAL; 2217 if (rdev->mddev->pers && slot == -1) { 2218 /* Setting 'slot' on an active array requires also 2219 * updating the 'rd%d' link, and communicating 2220 * with the personality with ->hot_*_disk. 2221 * For now we only support removing 2222 * failed/spare devices. This normally happens automatically, 2223 * but not when the metadata is externally managed. 2224 */ 2225 if (rdev->raid_disk == -1) 2226 return -EEXIST; 2227 /* personality does all needed checks */ 2228 if (rdev->mddev->pers->hot_add_disk == NULL) 2229 return -EINVAL; 2230 err = rdev->mddev->pers-> 2231 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2232 if (err) 2233 return err; 2234 sprintf(nm, "rd%d", rdev->raid_disk); 2235 sysfs_remove_link(&rdev->mddev->kobj, nm); 2236 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2237 md_wakeup_thread(rdev->mddev->thread); 2238 } else if (rdev->mddev->pers) { 2239 mdk_rdev_t *rdev2; 2240 /* Activating a spare .. or possibly reactivating 2241 * if we ever get bitmaps working here. 2242 */ 2243 2244 if (rdev->raid_disk != -1) 2245 return -EBUSY; 2246 2247 if (rdev->mddev->pers->hot_add_disk == NULL) 2248 return -EINVAL; 2249 2250 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) 2251 if (rdev2->raid_disk == slot) 2252 return -EEXIST; 2253 2254 rdev->raid_disk = slot; 2255 if (test_bit(In_sync, &rdev->flags)) 2256 rdev->saved_raid_disk = slot; 2257 else 2258 rdev->saved_raid_disk = -1; 2259 err = rdev->mddev->pers-> 2260 hot_add_disk(rdev->mddev, rdev); 2261 if (err) { 2262 rdev->raid_disk = -1; 2263 return err; 2264 } else 2265 sysfs_notify_dirent(rdev->sysfs_state); 2266 sprintf(nm, "rd%d", rdev->raid_disk); 2267 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2268 printk(KERN_WARNING 2269 "md: cannot register " 2270 "%s for %s\n", 2271 nm, mdname(rdev->mddev)); 2272 2273 /* don't wakeup anyone, leave that to userspace. 
*/ 2274 } else { 2275 if (slot >= rdev->mddev->raid_disks) 2276 return -ENOSPC; 2277 rdev->raid_disk = slot; 2278 /* assume it is working */ 2279 clear_bit(Faulty, &rdev->flags); 2280 clear_bit(WriteMostly, &rdev->flags); 2281 set_bit(In_sync, &rdev->flags); 2282 sysfs_notify_dirent(rdev->sysfs_state); 2283 } 2284 return len; 2285 } 2286 2287 2288 static struct rdev_sysfs_entry rdev_slot = 2289 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2290 2291 static ssize_t 2292 offset_show(mdk_rdev_t *rdev, char *page) 2293 { 2294 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2295 } 2296 2297 static ssize_t 2298 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2299 { 2300 char *e; 2301 unsigned long long offset = simple_strtoull(buf, &e, 10); 2302 if (e==buf || (*e && *e != '\n')) 2303 return -EINVAL; 2304 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2305 return -EBUSY; 2306 if (rdev->sectors && rdev->mddev->external) 2307 /* Must set offset before size, so overlap checks 2308 * can be sane */ 2309 return -EBUSY; 2310 rdev->data_offset = offset; 2311 return len; 2312 } 2313 2314 static struct rdev_sysfs_entry rdev_offset = 2315 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2316 2317 static ssize_t 2318 rdev_size_show(mdk_rdev_t *rdev, char *page) 2319 { 2320 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2321 } 2322 2323 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2324 { 2325 /* check if two start/length pairs overlap */ 2326 if (s1+l1 <= s2) 2327 return 0; 2328 if (s2+l2 <= s1) 2329 return 0; 2330 return 1; 2331 } 2332 2333 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2334 { 2335 unsigned long long blocks; 2336 sector_t new; 2337 2338 if (strict_strtoull(buf, 10, &blocks) < 0) 2339 return -EINVAL; 2340 2341 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2342 return -EINVAL; /* sector conversion overflow */ 2343 2344 new = blocks * 2; 2345 if (new != blocks * 2) 2346 return -EINVAL; /* unsigned long long to sector_t overflow */ 2347 2348 *sectors = new; 2349 return 0; 2350 } 2351 2352 static ssize_t 2353 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2354 { 2355 mddev_t *my_mddev = rdev->mddev; 2356 sector_t oldsectors = rdev->sectors; 2357 sector_t sectors; 2358 2359 if (strict_blocks_to_sectors(buf, §ors) < 0) 2360 return -EINVAL; 2361 if (my_mddev->pers && rdev->raid_disk >= 0) { 2362 if (my_mddev->persistent) { 2363 sectors = super_types[my_mddev->major_version]. 2364 rdev_size_change(rdev, sectors); 2365 if (!sectors) 2366 return -EBUSY; 2367 } else if (!sectors) 2368 sectors = (rdev->bdev->bd_inode->i_size >> 9) - 2369 rdev->data_offset; 2370 } 2371 if (sectors < my_mddev->dev_sectors) 2372 return -EINVAL; /* component must fit device */ 2373 2374 rdev->sectors = sectors; 2375 if (sectors > oldsectors && my_mddev->external) { 2376 /* need to check that all other rdevs with the same ->bdev 2377 * do not overlap. We need to unlock the mddev to avoid 2378 * a deadlock. We have already changed rdev->sectors, and if 2379 * we have to change it back, we will have the lock again. 
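* An overlap here means another rdev living on the same underlying
* block device (or one flagged AllReserved) whose data range intersects
* ours; if one is found, the old size is restored and -EBUSY returned.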
2380 */ 2381 mddev_t *mddev; 2382 int overlap = 0; 2383 struct list_head *tmp; 2384 2385 mddev_unlock(my_mddev); 2386 for_each_mddev(mddev, tmp) { 2387 mdk_rdev_t *rdev2; 2388 2389 mddev_lock(mddev); 2390 list_for_each_entry(rdev2, &mddev->disks, same_set) 2391 if (test_bit(AllReserved, &rdev2->flags) || 2392 (rdev->bdev == rdev2->bdev && 2393 rdev != rdev2 && 2394 overlaps(rdev->data_offset, rdev->sectors, 2395 rdev2->data_offset, 2396 rdev2->sectors))) { 2397 overlap = 1; 2398 break; 2399 } 2400 mddev_unlock(mddev); 2401 if (overlap) { 2402 mddev_put(mddev); 2403 break; 2404 } 2405 } 2406 mddev_lock(my_mddev); 2407 if (overlap) { 2408 /* Someone else could have slipped in a size 2409 * change here, but doing so is just silly. 2410 * We put oldsectors back because we *know* it is 2411 * safe, and trust userspace not to race with 2412 * itself 2413 */ 2414 rdev->sectors = oldsectors; 2415 return -EBUSY; 2416 } 2417 } 2418 return len; 2419 } 2420 2421 static struct rdev_sysfs_entry rdev_size = 2422 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2423 2424 static struct attribute *rdev_default_attrs[] = { 2425 &rdev_state.attr, 2426 &rdev_errors.attr, 2427 &rdev_slot.attr, 2428 &rdev_offset.attr, 2429 &rdev_size.attr, 2430 NULL, 2431 }; 2432 static ssize_t 2433 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2434 { 2435 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2436 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2437 mddev_t *mddev = rdev->mddev; 2438 ssize_t rv; 2439 2440 if (!entry->show) 2441 return -EIO; 2442 2443 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2444 if (!rv) { 2445 if (rdev->mddev == NULL) 2446 rv = -EBUSY; 2447 else 2448 rv = entry->show(rdev, page); 2449 mddev_unlock(mddev); 2450 } 2451 return rv; 2452 } 2453 2454 static ssize_t 2455 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2456 const char *page, size_t length) 2457 { 2458 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2459 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2460 ssize_t rv; 2461 mddev_t *mddev = rdev->mddev; 2462 2463 if (!entry->store) 2464 return -EIO; 2465 if (!capable(CAP_SYS_ADMIN)) 2466 return -EACCES; 2467 rv = mddev ? mddev_lock(mddev): -EBUSY; 2468 if (!rv) { 2469 if (rdev->mddev == NULL) 2470 rv = -EBUSY; 2471 else 2472 rv = entry->store(rdev, page, length); 2473 mddev_unlock(mddev); 2474 } 2475 return rv; 2476 } 2477 2478 static void rdev_free(struct kobject *ko) 2479 { 2480 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2481 kfree(rdev); 2482 } 2483 static struct sysfs_ops rdev_sysfs_ops = { 2484 .show = rdev_attr_show, 2485 .store = rdev_attr_store, 2486 }; 2487 static struct kobj_type rdev_ktype = { 2488 .release = rdev_free, 2489 .sysfs_ops = &rdev_sysfs_ops, 2490 .default_attrs = rdev_default_attrs, 2491 }; 2492 2493 /* 2494 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2495 * 2496 * mark the device faulty if: 2497 * 2498 * - the device is nonexistent (zero size) 2499 * - the device has no valid superblock 2500 * 2501 * a faulty rdev _never_ has rdev->sb set. 
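*
* A super_format of -1 or -2 skips the superblock read entirely; -2 is
* what the externally-managed-metadata path passes, and it makes
* lock_rdev claim the device as shared.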
2502 */ 2503 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2504 { 2505 char b[BDEVNAME_SIZE]; 2506 int err; 2507 mdk_rdev_t *rdev; 2508 sector_t size; 2509 2510 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2511 if (!rdev) { 2512 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2513 return ERR_PTR(-ENOMEM); 2514 } 2515 2516 if ((err = alloc_disk_sb(rdev))) 2517 goto abort_free; 2518 2519 err = lock_rdev(rdev, newdev, super_format == -2); 2520 if (err) 2521 goto abort_free; 2522 2523 kobject_init(&rdev->kobj, &rdev_ktype); 2524 2525 rdev->desc_nr = -1; 2526 rdev->saved_raid_disk = -1; 2527 rdev->raid_disk = -1; 2528 rdev->flags = 0; 2529 rdev->data_offset = 0; 2530 rdev->sb_events = 0; 2531 atomic_set(&rdev->nr_pending, 0); 2532 atomic_set(&rdev->read_errors, 0); 2533 atomic_set(&rdev->corrected_errors, 0); 2534 2535 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2536 if (!size) { 2537 printk(KERN_WARNING 2538 "md: %s has zero or unknown size, marking faulty!\n", 2539 bdevname(rdev->bdev,b)); 2540 err = -EINVAL; 2541 goto abort_free; 2542 } 2543 2544 if (super_format >= 0) { 2545 err = super_types[super_format]. 2546 load_super(rdev, NULL, super_minor); 2547 if (err == -EINVAL) { 2548 printk(KERN_WARNING 2549 "md: %s does not have a valid v%d.%d " 2550 "superblock, not importing!\n", 2551 bdevname(rdev->bdev,b), 2552 super_format, super_minor); 2553 goto abort_free; 2554 } 2555 if (err < 0) { 2556 printk(KERN_WARNING 2557 "md: could not read %s's sb, not importing!\n", 2558 bdevname(rdev->bdev,b)); 2559 goto abort_free; 2560 } 2561 } 2562 2563 INIT_LIST_HEAD(&rdev->same_set); 2564 init_waitqueue_head(&rdev->blocked_wait); 2565 2566 return rdev; 2567 2568 abort_free: 2569 if (rdev->sb_page) { 2570 if (rdev->bdev) 2571 unlock_rdev(rdev); 2572 free_disk_sb(rdev); 2573 } 2574 kfree(rdev); 2575 return ERR_PTR(err); 2576 } 2577 2578 /* 2579 * Check a full RAID array for plausibility 2580 */ 2581 2582 2583 static void analyze_sbs(mddev_t * mddev) 2584 { 2585 int i; 2586 mdk_rdev_t *rdev, *freshest, *tmp; 2587 char b[BDEVNAME_SIZE]; 2588 2589 freshest = NULL; 2590 rdev_for_each(rdev, tmp, mddev) 2591 switch (super_types[mddev->major_version]. 2592 load_super(rdev, freshest, mddev->minor_version)) { 2593 case 1: 2594 freshest = rdev; 2595 break; 2596 case 0: 2597 break; 2598 default: 2599 printk( KERN_ERR \ 2600 "md: fatal superblock inconsistency in %s" 2601 " -- removing from array\n", 2602 bdevname(rdev->bdev,b)); 2603 kick_rdev_from_array(rdev); 2604 } 2605 2606 2607 super_types[mddev->major_version]. 2608 validate_super(mddev, freshest); 2609 2610 i = 0; 2611 rdev_for_each(rdev, tmp, mddev) { 2612 if (rdev->desc_nr >= mddev->max_disks || 2613 i > mddev->max_disks) { 2614 printk(KERN_WARNING 2615 "md: %s: %s: only %d devices permitted\n", 2616 mdname(mddev), bdevname(rdev->bdev, b), 2617 mddev->max_disks); 2618 kick_rdev_from_array(rdev); 2619 continue; 2620 } 2621 if (rdev != freshest) 2622 if (super_types[mddev->major_version]. 
2623 validate_super(mddev, rdev)) { 2624 printk(KERN_WARNING "md: kicking non-fresh %s" 2625 " from array!\n", 2626 bdevname(rdev->bdev,b)); 2627 kick_rdev_from_array(rdev); 2628 continue; 2629 } 2630 if (mddev->level == LEVEL_MULTIPATH) { 2631 rdev->desc_nr = i++; 2632 rdev->raid_disk = rdev->desc_nr; 2633 set_bit(In_sync, &rdev->flags); 2634 } else if (rdev->raid_disk >= mddev->raid_disks) { 2635 rdev->raid_disk = -1; 2636 clear_bit(In_sync, &rdev->flags); 2637 } 2638 } 2639 } 2640 2641 static void md_safemode_timeout(unsigned long data); 2642 2643 static ssize_t 2644 safe_delay_show(mddev_t *mddev, char *page) 2645 { 2646 int msec = (mddev->safemode_delay*1000)/HZ; 2647 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2648 } 2649 static ssize_t 2650 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2651 { 2652 int scale=1; 2653 int dot=0; 2654 int i; 2655 unsigned long msec; 2656 char buf[30]; 2657 2658 /* remove a period, and count digits after it */ 2659 if (len >= sizeof(buf)) 2660 return -EINVAL; 2661 strlcpy(buf, cbuf, sizeof(buf)); 2662 for (i=0; i<len; i++) { 2663 if (dot) { 2664 if (isdigit(buf[i])) { 2665 buf[i-1] = buf[i]; 2666 scale *= 10; 2667 } 2668 buf[i] = 0; 2669 } else if (buf[i] == '.') { 2670 dot=1; 2671 buf[i] = 0; 2672 } 2673 } 2674 if (strict_strtoul(buf, 10, &msec) < 0) 2675 return -EINVAL; 2676 msec = (msec * 1000) / scale; 2677 if (msec == 0) 2678 mddev->safemode_delay = 0; 2679 else { 2680 unsigned long old_delay = mddev->safemode_delay; 2681 mddev->safemode_delay = (msec*HZ)/1000; 2682 if (mddev->safemode_delay == 0) 2683 mddev->safemode_delay = 1; 2684 if (mddev->safemode_delay < old_delay) 2685 md_safemode_timeout((unsigned long)mddev); 2686 } 2687 return len; 2688 } 2689 static struct md_sysfs_entry md_safe_delay = 2690 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2691 2692 static ssize_t 2693 level_show(mddev_t *mddev, char *page) 2694 { 2695 struct mdk_personality *p = mddev->pers; 2696 if (p) 2697 return sprintf(page, "%s\n", p->name); 2698 else if (mddev->clevel[0]) 2699 return sprintf(page, "%s\n", mddev->clevel); 2700 else if (mddev->level != LEVEL_NONE) 2701 return sprintf(page, "%d\n", mddev->level); 2702 else 2703 return 0; 2704 } 2705 2706 static ssize_t 2707 level_store(mddev_t *mddev, const char *buf, size_t len) 2708 { 2709 char level[16]; 2710 ssize_t rv = len; 2711 struct mdk_personality *pers; 2712 void *priv; 2713 mdk_rdev_t *rdev; 2714 2715 if (mddev->pers == NULL) { 2716 if (len == 0) 2717 return 0; 2718 if (len >= sizeof(mddev->clevel)) 2719 return -ENOSPC; 2720 strncpy(mddev->clevel, buf, len); 2721 if (mddev->clevel[len-1] == '\n') 2722 len--; 2723 mddev->clevel[len] = 0; 2724 mddev->level = LEVEL_NONE; 2725 return rv; 2726 } 2727 2728 /* request to change the personality. Need to ensure: 2729 * - array is not engaged in resync/recovery/reshape 2730 * - old personality can be suspended 2731 * - new personality will access other array. 
2732 */ 2733 2734 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 2735 return -EBUSY; 2736 2737 if (!mddev->pers->quiesce) { 2738 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 2739 mdname(mddev), mddev->pers->name); 2740 return -EINVAL; 2741 } 2742 2743 /* Now find the new personality */ 2744 if (len == 0 || len >= sizeof(level)) 2745 return -EINVAL; 2746 strncpy(level, buf, len); 2747 if (level[len-1] == '\n') 2748 len--; 2749 level[len] = 0; 2750 2751 request_module("md-%s", level); 2752 spin_lock(&pers_lock); 2753 pers = find_pers(LEVEL_NONE, level); 2754 if (!pers || !try_module_get(pers->owner)) { 2755 spin_unlock(&pers_lock); 2756 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2757 return -EINVAL; 2758 } 2759 spin_unlock(&pers_lock); 2760 2761 if (pers == mddev->pers) { 2762 /* Nothing to do! */ 2763 module_put(pers->owner); 2764 return rv; 2765 } 2766 if (!pers->takeover) { 2767 module_put(pers->owner); 2768 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2769 mdname(mddev), level); 2770 return -EINVAL; 2771 } 2772 2773 /* ->takeover must set new_* and/or delta_disks 2774 * if it succeeds, and may set them when it fails. 2775 */ 2776 priv = pers->takeover(mddev); 2777 if (IS_ERR(priv)) { 2778 mddev->new_level = mddev->level; 2779 mddev->new_layout = mddev->layout; 2780 mddev->new_chunk_sectors = mddev->chunk_sectors; 2781 mddev->raid_disks -= mddev->delta_disks; 2782 mddev->delta_disks = 0; 2783 module_put(pers->owner); 2784 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2785 mdname(mddev), level); 2786 return PTR_ERR(priv); 2787 } 2788 2789 /* Looks like we have a winner */ 2790 mddev_suspend(mddev); 2791 mddev->pers->stop(mddev); 2792 module_put(mddev->pers->owner); 2793 /* Invalidate devices that are now superfluous */ 2794 list_for_each_entry(rdev, &mddev->disks, same_set) 2795 if (rdev->raid_disk >= mddev->raid_disks) { 2796 rdev->raid_disk = -1; 2797 clear_bit(In_sync, &rdev->flags); 2798 } 2799 mddev->pers = pers; 2800 mddev->private = priv; 2801 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2802 mddev->level = mddev->new_level; 2803 mddev->layout = mddev->new_layout; 2804 mddev->chunk_sectors = mddev->new_chunk_sectors; 2805 mddev->delta_disks = 0; 2806 pers->run(mddev); 2807 mddev_resume(mddev); 2808 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2809 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2810 md_wakeup_thread(mddev->thread); 2811 return rv; 2812 } 2813 2814 static struct md_sysfs_entry md_level = 2815 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2816 2817 2818 static ssize_t 2819 layout_show(mddev_t *mddev, char *page) 2820 { 2821 /* just a number, not meaningful for all levels */ 2822 if (mddev->reshape_position != MaxSector && 2823 mddev->layout != mddev->new_layout) 2824 return sprintf(page, "%d (%d)\n", 2825 mddev->new_layout, mddev->layout); 2826 return sprintf(page, "%d\n", mddev->layout); 2827 } 2828 2829 static ssize_t 2830 layout_store(mddev_t *mddev, const char *buf, size_t len) 2831 { 2832 char *e; 2833 unsigned long n = simple_strtoul(buf, &e, 10); 2834 2835 if (!*buf || (*e && *e != '\n')) 2836 return -EINVAL; 2837 2838 if (mddev->pers) { 2839 int err; 2840 if (mddev->pers->check_reshape == NULL) 2841 return -EBUSY; 2842 mddev->new_layout = n; 2843 err = mddev->pers->check_reshape(mddev); 2844 if (err) { 2845 mddev->new_layout = mddev->layout; 2846 return err; 2847 } 2848 } else { 2849 mddev->new_layout = n; 2850 if 
(mddev->reshape_position == MaxSector) 2851 mddev->layout = n; 2852 } 2853 return len; 2854 } 2855 static struct md_sysfs_entry md_layout = 2856 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2857 2858 2859 static ssize_t 2860 raid_disks_show(mddev_t *mddev, char *page) 2861 { 2862 if (mddev->raid_disks == 0) 2863 return 0; 2864 if (mddev->reshape_position != MaxSector && 2865 mddev->delta_disks != 0) 2866 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2867 mddev->raid_disks - mddev->delta_disks); 2868 return sprintf(page, "%d\n", mddev->raid_disks); 2869 } 2870 2871 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2872 2873 static ssize_t 2874 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2875 { 2876 char *e; 2877 int rv = 0; 2878 unsigned long n = simple_strtoul(buf, &e, 10); 2879 2880 if (!*buf || (*e && *e != '\n')) 2881 return -EINVAL; 2882 2883 if (mddev->pers) 2884 rv = update_raid_disks(mddev, n); 2885 else if (mddev->reshape_position != MaxSector) { 2886 int olddisks = mddev->raid_disks - mddev->delta_disks; 2887 mddev->delta_disks = n - olddisks; 2888 mddev->raid_disks = n; 2889 } else 2890 mddev->raid_disks = n; 2891 return rv ? rv : len; 2892 } 2893 static struct md_sysfs_entry md_raid_disks = 2894 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2895 2896 static ssize_t 2897 chunk_size_show(mddev_t *mddev, char *page) 2898 { 2899 if (mddev->reshape_position != MaxSector && 2900 mddev->chunk_sectors != mddev->new_chunk_sectors) 2901 return sprintf(page, "%d (%d)\n", 2902 mddev->new_chunk_sectors << 9, 2903 mddev->chunk_sectors << 9); 2904 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 2905 } 2906 2907 static ssize_t 2908 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2909 { 2910 char *e; 2911 unsigned long n = simple_strtoul(buf, &e, 10); 2912 2913 if (!*buf || (*e && *e != '\n')) 2914 return -EINVAL; 2915 2916 if (mddev->pers) { 2917 int err; 2918 if (mddev->pers->check_reshape == NULL) 2919 return -EBUSY; 2920 mddev->new_chunk_sectors = n >> 9; 2921 err = mddev->pers->check_reshape(mddev); 2922 if (err) { 2923 mddev->new_chunk_sectors = mddev->chunk_sectors; 2924 return err; 2925 } 2926 } else { 2927 mddev->new_chunk_sectors = n >> 9; 2928 if (mddev->reshape_position == MaxSector) 2929 mddev->chunk_sectors = n >> 9; 2930 } 2931 return len; 2932 } 2933 static struct md_sysfs_entry md_chunk_size = 2934 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2935 2936 static ssize_t 2937 resync_start_show(mddev_t *mddev, char *page) 2938 { 2939 if (mddev->recovery_cp == MaxSector) 2940 return sprintf(page, "none\n"); 2941 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2942 } 2943 2944 static ssize_t 2945 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2946 { 2947 char *e; 2948 unsigned long long n = simple_strtoull(buf, &e, 10); 2949 2950 if (mddev->pers) 2951 return -EBUSY; 2952 if (!*buf || (*e && *e != '\n')) 2953 return -EINVAL; 2954 2955 mddev->recovery_cp = n; 2956 return len; 2957 } 2958 static struct md_sysfs_entry md_resync_start = 2959 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2960 2961 /* 2962 * The array state can be: 2963 * 2964 * clear 2965 * No devices, no size, no level 2966 * Equivalent to STOP_ARRAY ioctl 2967 * inactive 2968 * May have some settings, but array is not active 2969 * all IO results in error 2970 * When written, doesn't tear down array, but just stops it 2971 * 
suspended (not supported yet) 2972 * All IO requests will block. The array can be reconfigured. 2973 * Writing this, if accepted, will block until array is quiescent 2974 * readonly 2975 * no resync can happen. no superblocks get written. 2976 * write requests fail 2977 * read-auto 2978 * like readonly, but behaves like 'clean' on a write request. 2979 * 2980 * clean - no pending writes, but otherwise active. 2981 * When written to inactive array, starts without resync 2982 * If a write request arrives then 2983 * if metadata is known, mark 'dirty' and switch to 'active'. 2984 * if not known, block and switch to write-pending 2985 * If written to an active array that has pending writes, then fails. 2986 * active 2987 * fully active: IO and resync can be happening. 2988 * When written to inactive array, starts with resync 2989 * 2990 * write-pending 2991 * clean, but writes are blocked waiting for 'active' to be written. 2992 * 2993 * active-idle 2994 * like active, but no writes have been seen for a while (100msec). 2995 * 2996 */ 2997 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2998 write_pending, active_idle, bad_word}; 2999 static char *array_states[] = { 3000 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3001 "write-pending", "active-idle", NULL }; 3002 3003 static int match_word(const char *word, char **list) 3004 { 3005 int n; 3006 for (n=0; list[n]; n++) 3007 if (cmd_match(word, list[n])) 3008 break; 3009 return n; 3010 } 3011 3012 static ssize_t 3013 array_state_show(mddev_t *mddev, char *page) 3014 { 3015 enum array_state st = inactive; 3016 3017 if (mddev->pers) 3018 switch(mddev->ro) { 3019 case 1: 3020 st = readonly; 3021 break; 3022 case 2: 3023 st = read_auto; 3024 break; 3025 case 0: 3026 if (mddev->in_sync) 3027 st = clean; 3028 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 3029 st = write_pending; 3030 else if (mddev->safemode) 3031 st = active_idle; 3032 else 3033 st = active; 3034 } 3035 else { 3036 if (list_empty(&mddev->disks) && 3037 mddev->raid_disks == 0 && 3038 mddev->dev_sectors == 0) 3039 st = clear; 3040 else 3041 st = inactive; 3042 } 3043 return sprintf(page, "%s\n", array_states[st]); 3044 } 3045 3046 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3047 static int do_md_run(mddev_t * mddev); 3048 static int restart_array(mddev_t *mddev); 3049 3050 static ssize_t 3051 array_state_store(mddev_t *mddev, const char *buf, size_t len) 3052 { 3053 int err = -EINVAL; 3054 enum array_state st = match_word(buf, array_states); 3055 switch(st) { 3056 case bad_word: 3057 break; 3058 case clear: 3059 /* stopping an active array */ 3060 if (atomic_read(&mddev->openers) > 0) 3061 return -EBUSY; 3062 err = do_md_stop(mddev, 0, 0); 3063 break; 3064 case inactive: 3065 /* stopping an active array */ 3066 if (mddev->pers) { 3067 if (atomic_read(&mddev->openers) > 0) 3068 return -EBUSY; 3069 err = do_md_stop(mddev, 2, 0); 3070 } else 3071 err = 0; /* already inactive */ 3072 break; 3073 case suspended: 3074 break; /* not supported yet */ 3075 case readonly: 3076 if (mddev->pers) 3077 err = do_md_stop(mddev, 1, 0); 3078 else { 3079 mddev->ro = 1; 3080 set_disk_ro(mddev->gendisk, 1); 3081 err = do_md_run(mddev); 3082 } 3083 break; 3084 case read_auto: 3085 if (mddev->pers) { 3086 if (mddev->ro == 0) 3087 err = do_md_stop(mddev, 1, 0); 3088 else if (mddev->ro == 1) 3089 err = restart_array(mddev); 3090 if (err == 0) { 3091 mddev->ro = 2; 3092 set_disk_ro(mddev->gendisk, 0); 3093 } 3094 } else { 
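/* array is not active yet: mark it auto-read-only and start it */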
3095 mddev->ro = 2; 3096 err = do_md_run(mddev); 3097 } 3098 break; 3099 case clean: 3100 if (mddev->pers) { 3101 restart_array(mddev); 3102 spin_lock_irq(&mddev->write_lock); 3103 if (atomic_read(&mddev->writes_pending) == 0) { 3104 if (mddev->in_sync == 0) { 3105 mddev->in_sync = 1; 3106 if (mddev->safemode == 1) 3107 mddev->safemode = 0; 3108 if (mddev->persistent) 3109 set_bit(MD_CHANGE_CLEAN, 3110 &mddev->flags); 3111 } 3112 err = 0; 3113 } else 3114 err = -EBUSY; 3115 spin_unlock_irq(&mddev->write_lock); 3116 } else 3117 err = -EINVAL; 3118 break; 3119 case active: 3120 if (mddev->pers) { 3121 restart_array(mddev); 3122 if (mddev->external) 3123 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3124 wake_up(&mddev->sb_wait); 3125 err = 0; 3126 } else { 3127 mddev->ro = 0; 3128 set_disk_ro(mddev->gendisk, 0); 3129 err = do_md_run(mddev); 3130 } 3131 break; 3132 case write_pending: 3133 case active_idle: 3134 /* these cannot be set */ 3135 break; 3136 } 3137 if (err) 3138 return err; 3139 else { 3140 sysfs_notify_dirent(mddev->sysfs_state); 3141 return len; 3142 } 3143 } 3144 static struct md_sysfs_entry md_array_state = 3145 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3146 3147 static ssize_t 3148 null_show(mddev_t *mddev, char *page) 3149 { 3150 return -EINVAL; 3151 } 3152 3153 static ssize_t 3154 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 3155 { 3156 /* buf must be %d:%d\n? giving major and minor numbers */ 3157 /* The new device is added to the array. 3158 * If the array has a persistent superblock, we read the 3159 * superblock to initialise info and check validity. 3160 * Otherwise, only checking done is that in bind_rdev_to_array, 3161 * which mainly checks size. 3162 */ 3163 char *e; 3164 int major = simple_strtoul(buf, &e, 10); 3165 int minor; 3166 dev_t dev; 3167 mdk_rdev_t *rdev; 3168 int err; 3169 3170 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3171 return -EINVAL; 3172 minor = simple_strtoul(e+1, &e, 10); 3173 if (*e && *e != '\n') 3174 return -EINVAL; 3175 dev = MKDEV(major, minor); 3176 if (major != MAJOR(dev) || 3177 minor != MINOR(dev)) 3178 return -EOVERFLOW; 3179 3180 3181 if (mddev->persistent) { 3182 rdev = md_import_device(dev, mddev->major_version, 3183 mddev->minor_version); 3184 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3185 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3186 mdk_rdev_t, same_set); 3187 err = super_types[mddev->major_version] 3188 .load_super(rdev, rdev0, mddev->minor_version); 3189 if (err < 0) 3190 goto out; 3191 } 3192 } else if (mddev->external) 3193 rdev = md_import_device(dev, -2, -1); 3194 else 3195 rdev = md_import_device(dev, -1, -1); 3196 3197 if (IS_ERR(rdev)) 3198 return PTR_ERR(rdev); 3199 err = bind_rdev_to_array(rdev, mddev); 3200 out: 3201 if (err) 3202 export_rdev(rdev); 3203 return err ? err : len; 3204 } 3205 3206 static struct md_sysfs_entry md_new_device = 3207 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3208 3209 static ssize_t 3210 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 3211 { 3212 char *end; 3213 unsigned long chunk, end_chunk; 3214 3215 if (!mddev->bitmap) 3216 goto out; 3217 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 3218 while (*buf) { 3219 chunk = end_chunk = simple_strtoul(buf, &end, 0); 3220 if (buf == end) break; 3221 if (*end == '-') { /* range */ 3222 buf = end + 1; 3223 end_chunk = simple_strtoul(buf, &end, 0); 3224 if (buf == end) break; 3225 } 3226 if (*end && !isspace(*end)) break; 3227 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3228 buf = end; 3229 while (isspace(*buf)) buf++; 3230 } 3231 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3232 out: 3233 return len; 3234 } 3235 3236 static struct md_sysfs_entry md_bitmap = 3237 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3238 3239 static ssize_t 3240 size_show(mddev_t *mddev, char *page) 3241 { 3242 return sprintf(page, "%llu\n", 3243 (unsigned long long)mddev->dev_sectors / 2); 3244 } 3245 3246 static int update_size(mddev_t *mddev, sector_t num_sectors); 3247 3248 static ssize_t 3249 size_store(mddev_t *mddev, const char *buf, size_t len) 3250 { 3251 /* If array is inactive, we can reduce the component size, but 3252 * not increase it (except from 0). 3253 * If array is active, we can try an on-line resize 3254 */ 3255 sector_t sectors; 3256 int err = strict_blocks_to_sectors(buf, §ors); 3257 3258 if (err < 0) 3259 return err; 3260 if (mddev->pers) { 3261 err = update_size(mddev, sectors); 3262 md_update_sb(mddev, 1); 3263 } else { 3264 if (mddev->dev_sectors == 0 || 3265 mddev->dev_sectors > sectors) 3266 mddev->dev_sectors = sectors; 3267 else 3268 err = -ENOSPC; 3269 } 3270 return err ? err : len; 3271 } 3272 3273 static struct md_sysfs_entry md_size = 3274 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3275 3276 3277 /* Metdata version. 3278 * This is one of 3279 * 'none' for arrays with no metadata (good luck...) 3280 * 'external' for arrays with externally managed metadata, 3281 * or N.M for internally known formats 3282 */ 3283 static ssize_t 3284 metadata_show(mddev_t *mddev, char *page) 3285 { 3286 if (mddev->persistent) 3287 return sprintf(page, "%d.%d\n", 3288 mddev->major_version, mddev->minor_version); 3289 else if (mddev->external) 3290 return sprintf(page, "external:%s\n", mddev->metadata_type); 3291 else 3292 return sprintf(page, "none\n"); 3293 } 3294 3295 static ssize_t 3296 metadata_store(mddev_t *mddev, const char *buf, size_t len) 3297 { 3298 int major, minor; 3299 char *e; 3300 /* Changing the details of 'external' metadata is 3301 * always permitted. Otherwise there must be 3302 * no devices attached to the array. 
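* (Writing "none" or a versioned N.M format while member devices are
* still attached is rejected with -EBUSY.)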
3303 */ 3304 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3305 ; 3306 else if (!list_empty(&mddev->disks)) 3307 return -EBUSY; 3308 3309 if (cmd_match(buf, "none")) { 3310 mddev->persistent = 0; 3311 mddev->external = 0; 3312 mddev->major_version = 0; 3313 mddev->minor_version = 90; 3314 return len; 3315 } 3316 if (strncmp(buf, "external:", 9) == 0) { 3317 size_t namelen = len-9; 3318 if (namelen >= sizeof(mddev->metadata_type)) 3319 namelen = sizeof(mddev->metadata_type)-1; 3320 strncpy(mddev->metadata_type, buf+9, namelen); 3321 mddev->metadata_type[namelen] = 0; 3322 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3323 mddev->metadata_type[--namelen] = 0; 3324 mddev->persistent = 0; 3325 mddev->external = 1; 3326 mddev->major_version = 0; 3327 mddev->minor_version = 90; 3328 return len; 3329 } 3330 major = simple_strtoul(buf, &e, 10); 3331 if (e==buf || *e != '.') 3332 return -EINVAL; 3333 buf = e+1; 3334 minor = simple_strtoul(buf, &e, 10); 3335 if (e==buf || (*e && *e != '\n') ) 3336 return -EINVAL; 3337 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3338 return -ENOENT; 3339 mddev->major_version = major; 3340 mddev->minor_version = minor; 3341 mddev->persistent = 1; 3342 mddev->external = 0; 3343 return len; 3344 } 3345 3346 static struct md_sysfs_entry md_metadata = 3347 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3348 3349 static ssize_t 3350 action_show(mddev_t *mddev, char *page) 3351 { 3352 char *type = "idle"; 3353 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3354 type = "frozen"; 3355 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3356 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3357 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3358 type = "reshape"; 3359 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3360 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3361 type = "resync"; 3362 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3363 type = "check"; 3364 else 3365 type = "repair"; 3366 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3367 type = "recover"; 3368 } 3369 return sprintf(page, "%s\n", type); 3370 } 3371 3372 static ssize_t 3373 action_store(mddev_t *mddev, const char *page, size_t len) 3374 { 3375 if (!mddev->pers || !mddev->pers->sync_request) 3376 return -EINVAL; 3377 3378 if (cmd_match(page, "frozen")) 3379 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3380 else 3381 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3382 3383 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3384 if (mddev->sync_thread) { 3385 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3386 md_unregister_thread(mddev->sync_thread); 3387 mddev->sync_thread = NULL; 3388 mddev->recovery = 0; 3389 } 3390 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3391 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3392 return -EBUSY; 3393 else if (cmd_match(page, "resync")) 3394 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3395 else if (cmd_match(page, "recover")) { 3396 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3397 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3398 } else if (cmd_match(page, "reshape")) { 3399 int err; 3400 if (mddev->pers->start_reshape == NULL) 3401 return -EINVAL; 3402 err = mddev->pers->start_reshape(mddev); 3403 if (err) 3404 return err; 3405 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3406 } else { 3407 if (cmd_match(page, "check")) 3408 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3409 
else if (!cmd_match(page, "repair")) 3410 return -EINVAL; 3411 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3412 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3413 } 3414 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3415 md_wakeup_thread(mddev->thread); 3416 sysfs_notify_dirent(mddev->sysfs_action); 3417 return len; 3418 } 3419 3420 static ssize_t 3421 mismatch_cnt_show(mddev_t *mddev, char *page) 3422 { 3423 return sprintf(page, "%llu\n", 3424 (unsigned long long) mddev->resync_mismatches); 3425 } 3426 3427 static struct md_sysfs_entry md_scan_mode = 3428 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3429 3430 3431 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3432 3433 static ssize_t 3434 sync_min_show(mddev_t *mddev, char *page) 3435 { 3436 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3437 mddev->sync_speed_min ? "local": "system"); 3438 } 3439 3440 static ssize_t 3441 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3442 { 3443 int min; 3444 char *e; 3445 if (strncmp(buf, "system", 6)==0) { 3446 mddev->sync_speed_min = 0; 3447 return len; 3448 } 3449 min = simple_strtoul(buf, &e, 10); 3450 if (buf == e || (*e && *e != '\n') || min <= 0) 3451 return -EINVAL; 3452 mddev->sync_speed_min = min; 3453 return len; 3454 } 3455 3456 static struct md_sysfs_entry md_sync_min = 3457 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3458 3459 static ssize_t 3460 sync_max_show(mddev_t *mddev, char *page) 3461 { 3462 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3463 mddev->sync_speed_max ? "local": "system"); 3464 } 3465 3466 static ssize_t 3467 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3468 { 3469 int max; 3470 char *e; 3471 if (strncmp(buf, "system", 6)==0) { 3472 mddev->sync_speed_max = 0; 3473 return len; 3474 } 3475 max = simple_strtoul(buf, &e, 10); 3476 if (buf == e || (*e && *e != '\n') || max <= 0) 3477 return -EINVAL; 3478 mddev->sync_speed_max = max; 3479 return len; 3480 } 3481 3482 static struct md_sysfs_entry md_sync_max = 3483 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3484 3485 static ssize_t 3486 degraded_show(mddev_t *mddev, char *page) 3487 { 3488 return sprintf(page, "%d\n", mddev->degraded); 3489 } 3490 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3491 3492 static ssize_t 3493 sync_force_parallel_show(mddev_t *mddev, char *page) 3494 { 3495 return sprintf(page, "%d\n", mddev->parallel_resync); 3496 } 3497 3498 static ssize_t 3499 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3500 { 3501 long n; 3502 3503 if (strict_strtol(buf, 10, &n)) 3504 return -EINVAL; 3505 3506 if (n != 0 && n != 1) 3507 return -EINVAL; 3508 3509 mddev->parallel_resync = n; 3510 3511 if (mddev->sync_thread) 3512 wake_up(&resync_wait); 3513 3514 return len; 3515 } 3516 3517 /* force parallel resync, even with shared block devices */ 3518 static struct md_sysfs_entry md_sync_force_parallel = 3519 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3520 sync_force_parallel_show, sync_force_parallel_store); 3521 3522 static ssize_t 3523 sync_speed_show(mddev_t *mddev, char *page) 3524 { 3525 unsigned long resync, dt, db; 3526 if (mddev->curr_resync == 0) 3527 return sprintf(page, "none\n"); 3528 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3529 dt = (jiffies - mddev->resync_mark) / HZ; 3530 if (!dt) dt++; 3531 db = resync - mddev->resync_mark_cnt; 3532 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3533 } 
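/*
* A worked example of the value reported by sync_speed: the code above
* computes, in KiB/sec,
*
*	(curr_mark_cnt - recovery_active - resync_mark_cnt)
*		/ ((jiffies - resync_mark) / HZ) / 2
*
* i.e. sectors completed since the last resync mark, divided by the
* elapsed seconds, halved to convert 512-byte sectors to KiB. From
* userspace the same figure can be read, and the throttle adjusted,
* through sysfs; the device name below is only illustrative:
*
*	cat /sys/block/md0/md/sync_speed
*	echo 50000 > /sys/block/md0/md/sync_speed_max
*/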
3534 3535 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3536 3537 static ssize_t 3538 sync_completed_show(mddev_t *mddev, char *page) 3539 { 3540 unsigned long max_sectors, resync; 3541 3542 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3543 return sprintf(page, "none\n"); 3544 3545 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3546 max_sectors = mddev->resync_max_sectors; 3547 else 3548 max_sectors = mddev->dev_sectors; 3549 3550 resync = mddev->curr_resync_completed; 3551 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3552 } 3553 3554 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3555 3556 static ssize_t 3557 min_sync_show(mddev_t *mddev, char *page) 3558 { 3559 return sprintf(page, "%llu\n", 3560 (unsigned long long)mddev->resync_min); 3561 } 3562 static ssize_t 3563 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3564 { 3565 unsigned long long min; 3566 if (strict_strtoull(buf, 10, &min)) 3567 return -EINVAL; 3568 if (min > mddev->resync_max) 3569 return -EINVAL; 3570 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3571 return -EBUSY; 3572 3573 /* Must be a multiple of chunk_size */ 3574 if (mddev->chunk_sectors) { 3575 sector_t temp = min; 3576 if (sector_div(temp, mddev->chunk_sectors)) 3577 return -EINVAL; 3578 } 3579 mddev->resync_min = min; 3580 3581 return len; 3582 } 3583 3584 static struct md_sysfs_entry md_min_sync = 3585 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3586 3587 static ssize_t 3588 max_sync_show(mddev_t *mddev, char *page) 3589 { 3590 if (mddev->resync_max == MaxSector) 3591 return sprintf(page, "max\n"); 3592 else 3593 return sprintf(page, "%llu\n", 3594 (unsigned long long)mddev->resync_max); 3595 } 3596 static ssize_t 3597 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3598 { 3599 if (strncmp(buf, "max", 3) == 0) 3600 mddev->resync_max = MaxSector; 3601 else { 3602 unsigned long long max; 3603 if (strict_strtoull(buf, 10, &max)) 3604 return -EINVAL; 3605 if (max < mddev->resync_min) 3606 return -EINVAL; 3607 if (max < mddev->resync_max && 3608 mddev->ro == 0 && 3609 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3610 return -EBUSY; 3611 3612 /* Must be a multiple of chunk_size */ 3613 if (mddev->chunk_sectors) { 3614 sector_t temp = max; 3615 if (sector_div(temp, mddev->chunk_sectors)) 3616 return -EINVAL; 3617 } 3618 mddev->resync_max = max; 3619 } 3620 wake_up(&mddev->recovery_wait); 3621 return len; 3622 } 3623 3624 static struct md_sysfs_entry md_max_sync = 3625 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3626 3627 static ssize_t 3628 suspend_lo_show(mddev_t *mddev, char *page) 3629 { 3630 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3631 } 3632 3633 static ssize_t 3634 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3635 { 3636 char *e; 3637 unsigned long long new = simple_strtoull(buf, &e, 10); 3638 3639 if (mddev->pers == NULL || 3640 mddev->pers->quiesce == NULL) 3641 return -EINVAL; 3642 if (buf == e || (*e && *e != '\n')) 3643 return -EINVAL; 3644 if (new >= mddev->suspend_hi || 3645 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3646 mddev->suspend_lo = new; 3647 mddev->pers->quiesce(mddev, 2); 3648 return len; 3649 } else 3650 return -EINVAL; 3651 } 3652 static struct md_sysfs_entry md_suspend_lo = 3653 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3654 3655 3656 static ssize_t 3657 suspend_hi_show(mddev_t *mddev, char *page) 
3658 { 3659 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3660 } 3661 3662 static ssize_t 3663 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3664 { 3665 char *e; 3666 unsigned long long new = simple_strtoull(buf, &e, 10); 3667 3668 if (mddev->pers == NULL || 3669 mddev->pers->quiesce == NULL) 3670 return -EINVAL; 3671 if (buf == e || (*e && *e != '\n')) 3672 return -EINVAL; 3673 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3674 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 3675 mddev->suspend_hi = new; 3676 mddev->pers->quiesce(mddev, 1); 3677 mddev->pers->quiesce(mddev, 0); 3678 return len; 3679 } else 3680 return -EINVAL; 3681 } 3682 static struct md_sysfs_entry md_suspend_hi = 3683 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 3684 3685 static ssize_t 3686 reshape_position_show(mddev_t *mddev, char *page) 3687 { 3688 if (mddev->reshape_position != MaxSector) 3689 return sprintf(page, "%llu\n", 3690 (unsigned long long)mddev->reshape_position); 3691 strcpy(page, "none\n"); 3692 return 5; 3693 } 3694 3695 static ssize_t 3696 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 3697 { 3698 char *e; 3699 unsigned long long new = simple_strtoull(buf, &e, 10); 3700 if (mddev->pers) 3701 return -EBUSY; 3702 if (buf == e || (*e && *e != '\n')) 3703 return -EINVAL; 3704 mddev->reshape_position = new; 3705 mddev->delta_disks = 0; 3706 mddev->new_level = mddev->level; 3707 mddev->new_layout = mddev->layout; 3708 mddev->new_chunk_sectors = mddev->chunk_sectors; 3709 return len; 3710 } 3711 3712 static struct md_sysfs_entry md_reshape_position = 3713 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3714 reshape_position_store); 3715 3716 static ssize_t 3717 array_size_show(mddev_t *mddev, char *page) 3718 { 3719 if (mddev->external_size) 3720 return sprintf(page, "%llu\n", 3721 (unsigned long long)mddev->array_sectors/2); 3722 else 3723 return sprintf(page, "default\n"); 3724 } 3725 3726 static ssize_t 3727 array_size_store(mddev_t *mddev, const char *buf, size_t len) 3728 { 3729 sector_t sectors; 3730 3731 if (strncmp(buf, "default", 7) == 0) { 3732 if (mddev->pers) 3733 sectors = mddev->pers->size(mddev, 0, 0); 3734 else 3735 sectors = mddev->array_sectors; 3736 3737 mddev->external_size = 0; 3738 } else { 3739 if (strict_blocks_to_sectors(buf, §ors) < 0) 3740 return -EINVAL; 3741 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 3742 return -E2BIG; 3743 3744 mddev->external_size = 1; 3745 } 3746 3747 mddev->array_sectors = sectors; 3748 set_capacity(mddev->gendisk, mddev->array_sectors); 3749 if (mddev->pers) 3750 revalidate_disk(mddev->gendisk); 3751 3752 return len; 3753 } 3754 3755 static struct md_sysfs_entry md_array_size = 3756 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 3757 array_size_store); 3758 3759 static struct attribute *md_default_attrs[] = { 3760 &md_level.attr, 3761 &md_layout.attr, 3762 &md_raid_disks.attr, 3763 &md_chunk_size.attr, 3764 &md_size.attr, 3765 &md_resync_start.attr, 3766 &md_metadata.attr, 3767 &md_new_device.attr, 3768 &md_safe_delay.attr, 3769 &md_array_state.attr, 3770 &md_reshape_position.attr, 3771 &md_array_size.attr, 3772 NULL, 3773 }; 3774 3775 static struct attribute *md_redundancy_attrs[] = { 3776 &md_scan_mode.attr, 3777 &md_mismatches.attr, 3778 &md_sync_min.attr, 3779 &md_sync_max.attr, 3780 &md_sync_speed.attr, 3781 &md_sync_force_parallel.attr, 3782 &md_sync_completed.attr, 3783 
&md_min_sync.attr, 3784 &md_max_sync.attr, 3785 &md_suspend_lo.attr, 3786 &md_suspend_hi.attr, 3787 &md_bitmap.attr, 3788 &md_degraded.attr, 3789 NULL, 3790 }; 3791 static struct attribute_group md_redundancy_group = { 3792 .name = NULL, 3793 .attrs = md_redundancy_attrs, 3794 }; 3795 3796 3797 static ssize_t 3798 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3799 { 3800 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3801 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3802 ssize_t rv; 3803 3804 if (!entry->show) 3805 return -EIO; 3806 rv = mddev_lock(mddev); 3807 if (!rv) { 3808 rv = entry->show(mddev, page); 3809 mddev_unlock(mddev); 3810 } 3811 return rv; 3812 } 3813 3814 static ssize_t 3815 md_attr_store(struct kobject *kobj, struct attribute *attr, 3816 const char *page, size_t length) 3817 { 3818 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3819 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3820 ssize_t rv; 3821 3822 if (!entry->store) 3823 return -EIO; 3824 if (!capable(CAP_SYS_ADMIN)) 3825 return -EACCES; 3826 rv = mddev_lock(mddev); 3827 if (mddev->hold_active == UNTIL_IOCTL) 3828 mddev->hold_active = 0; 3829 if (!rv) { 3830 rv = entry->store(mddev, page, length); 3831 mddev_unlock(mddev); 3832 } 3833 return rv; 3834 } 3835 3836 static void md_free(struct kobject *ko) 3837 { 3838 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3839 3840 if (mddev->sysfs_state) 3841 sysfs_put(mddev->sysfs_state); 3842 3843 if (mddev->gendisk) { 3844 del_gendisk(mddev->gendisk); 3845 put_disk(mddev->gendisk); 3846 } 3847 if (mddev->queue) 3848 blk_cleanup_queue(mddev->queue); 3849 3850 kfree(mddev); 3851 } 3852 3853 static struct sysfs_ops md_sysfs_ops = { 3854 .show = md_attr_show, 3855 .store = md_attr_store, 3856 }; 3857 static struct kobj_type md_ktype = { 3858 .release = md_free, 3859 .sysfs_ops = &md_sysfs_ops, 3860 .default_attrs = md_default_attrs, 3861 }; 3862 3863 int mdp_major = 0; 3864 3865 static void mddev_delayed_delete(struct work_struct *ws) 3866 { 3867 mddev_t *mddev = container_of(ws, mddev_t, del_work); 3868 3869 if (mddev->private == &md_redundancy_group) { 3870 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3871 if (mddev->sysfs_action) 3872 sysfs_put(mddev->sysfs_action); 3873 mddev->sysfs_action = NULL; 3874 mddev->private = NULL; 3875 } 3876 kobject_del(&mddev->kobj); 3877 kobject_put(&mddev->kobj); 3878 } 3879 3880 static int md_alloc(dev_t dev, char *name) 3881 { 3882 static DEFINE_MUTEX(disks_mutex); 3883 mddev_t *mddev = mddev_find(dev); 3884 struct gendisk *disk; 3885 int partitioned; 3886 int shift; 3887 int unit; 3888 int error; 3889 3890 if (!mddev) 3891 return -ENODEV; 3892 3893 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 3894 shift = partitioned ? MdpMinorShift : 0; 3895 unit = MINOR(mddev->unit) >> shift; 3896 3897 /* wait for any previous instance if this device 3898 * to be completed removed (mddev_delayed_delete). 3899 */ 3900 flush_scheduled_work(); 3901 3902 mutex_lock(&disks_mutex); 3903 error = -EEXIST; 3904 if (mddev->gendisk) 3905 goto abort; 3906 3907 if (name) { 3908 /* Need to ensure that 'name' is not a duplicate. 
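* Every existing array's gendisk name is compared against it under
* all_mddevs_lock before the new disk is created.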
3909 */ 3910 mddev_t *mddev2; 3911 spin_lock(&all_mddevs_lock); 3912 3913 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 3914 if (mddev2->gendisk && 3915 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3916 spin_unlock(&all_mddevs_lock); 3917 goto abort; 3918 } 3919 spin_unlock(&all_mddevs_lock); 3920 } 3921 3922 error = -ENOMEM; 3923 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3924 if (!mddev->queue) 3925 goto abort; 3926 mddev->queue->queuedata = mddev; 3927 3928 /* Can be unlocked because the queue is new: no concurrency */ 3929 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3930 3931 blk_queue_make_request(mddev->queue, md_make_request); 3932 3933 disk = alloc_disk(1 << shift); 3934 if (!disk) { 3935 blk_cleanup_queue(mddev->queue); 3936 mddev->queue = NULL; 3937 goto abort; 3938 } 3939 disk->major = MAJOR(mddev->unit); 3940 disk->first_minor = unit << shift; 3941 if (name) 3942 strcpy(disk->disk_name, name); 3943 else if (partitioned) 3944 sprintf(disk->disk_name, "md_d%d", unit); 3945 else 3946 sprintf(disk->disk_name, "md%d", unit); 3947 disk->fops = &md_fops; 3948 disk->private_data = mddev; 3949 disk->queue = mddev->queue; 3950 /* Allow extended partitions. This makes the 3951 * 'mdp' device redundant, but we can't really 3952 * remove it now. 3953 */ 3954 disk->flags |= GENHD_FL_EXT_DEVT; 3955 add_disk(disk); 3956 mddev->gendisk = disk; 3957 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3958 &disk_to_dev(disk)->kobj, "%s", "md"); 3959 if (error) { 3960 /* This isn't possible, but as kobject_init_and_add is marked 3961 * __must_check, we must do something with the result 3962 */ 3963 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3964 disk->disk_name); 3965 error = 0; 3966 } 3967 abort: 3968 mutex_unlock(&disks_mutex); 3969 if (!error) { 3970 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3971 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3972 } 3973 mddev_put(mddev); 3974 return error; 3975 } 3976 3977 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3978 { 3979 md_alloc(dev, NULL); 3980 return NULL; 3981 } 3982 3983 static int add_named_array(const char *val, struct kernel_param *kp) 3984 { 3985 /* val must be "md_*" where * is not all digits. 3986 * We allocate an array with a large free minor number, and 3987 * set the name to val. val must not already be an active name. 3988 */ 3989 int len = strlen(val); 3990 char buf[DISK_NAME_LEN]; 3991 3992 while (len && val[len-1] == '\n') 3993 len--; 3994 if (len >= DISK_NAME_LEN) 3995 return -E2BIG; 3996 strlcpy(buf, val, len+1); 3997 if (strncmp(buf, "md_", 3) != 0) 3998 return -EINVAL; 3999 return md_alloc(0, buf); 4000 } 4001 4002 static void md_safemode_timeout(unsigned long data) 4003 { 4004 mddev_t *mddev = (mddev_t *) data; 4005 4006 if (!atomic_read(&mddev->writes_pending)) { 4007 mddev->safemode = 1; 4008 if (mddev->external) 4009 sysfs_notify_dirent(mddev->sysfs_state); 4010 } 4011 md_wakeup_thread(mddev->thread); 4012 } 4013 4014 static int start_dirty_degraded; 4015 4016 static int do_md_run(mddev_t * mddev) 4017 { 4018 int err; 4019 mdk_rdev_t *rdev; 4020 struct gendisk *disk; 4021 struct mdk_personality *pers; 4022 4023 if (list_empty(&mddev->disks)) 4024 /* cannot run an array with no devices.. 
*/ 4025 return -EINVAL; 4026 4027 if (mddev->pers) 4028 return -EBUSY; 4029 4030 /* 4031 * Analyze all RAID superblock(s) 4032 */ 4033 if (!mddev->raid_disks) { 4034 if (!mddev->persistent) 4035 return -EINVAL; 4036 analyze_sbs(mddev); 4037 } 4038 4039 if (mddev->level != LEVEL_NONE) 4040 request_module("md-level-%d", mddev->level); 4041 else if (mddev->clevel[0]) 4042 request_module("md-%s", mddev->clevel); 4043 4044 /* 4045 * Drop all container device buffers, from now on 4046 * the only valid external interface is through the md 4047 * device. 4048 */ 4049 list_for_each_entry(rdev, &mddev->disks, same_set) { 4050 if (test_bit(Faulty, &rdev->flags)) 4051 continue; 4052 sync_blockdev(rdev->bdev); 4053 invalidate_bdev(rdev->bdev); 4054 4055 /* perform some consistency tests on the device. 4056 * We don't want the data to overlap the metadata, 4057 * Internal Bitmap issues have been handled elsewhere. 4058 */ 4059 if (rdev->data_offset < rdev->sb_start) { 4060 if (mddev->dev_sectors && 4061 rdev->data_offset + mddev->dev_sectors 4062 > rdev->sb_start) { 4063 printk("md: %s: data overlaps metadata\n", 4064 mdname(mddev)); 4065 return -EINVAL; 4066 } 4067 } else { 4068 if (rdev->sb_start + rdev->sb_size/512 4069 > rdev->data_offset) { 4070 printk("md: %s: metadata overlaps data\n", 4071 mdname(mddev)); 4072 return -EINVAL; 4073 } 4074 } 4075 sysfs_notify_dirent(rdev->sysfs_state); 4076 } 4077 4078 md_probe(mddev->unit, NULL, NULL); 4079 disk = mddev->gendisk; 4080 if (!disk) 4081 return -ENOMEM; 4082 4083 spin_lock(&pers_lock); 4084 pers = find_pers(mddev->level, mddev->clevel); 4085 if (!pers || !try_module_get(pers->owner)) { 4086 spin_unlock(&pers_lock); 4087 if (mddev->level != LEVEL_NONE) 4088 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4089 mddev->level); 4090 else 4091 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4092 mddev->clevel); 4093 return -EINVAL; 4094 } 4095 mddev->pers = pers; 4096 spin_unlock(&pers_lock); 4097 if (mddev->level != pers->level) { 4098 mddev->level = pers->level; 4099 mddev->new_level = pers->level; 4100 } 4101 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4102 4103 if (mddev->reshape_position != MaxSector && 4104 pers->start_reshape == NULL) { 4105 /* This personality cannot handle reshaping... */ 4106 mddev->pers = NULL; 4107 module_put(pers->owner); 4108 return -EINVAL; 4109 } 4110 4111 if (pers->sync_request) { 4112 /* Warn if this is a potentially silly 4113 * configuration. 
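 * e.g. more than one array member living on partitions of the same
 * underlying physical disk (compared via bd_contains in the loop below).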
4114 */ 4115 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4116 mdk_rdev_t *rdev2; 4117 int warned = 0; 4118 4119 list_for_each_entry(rdev, &mddev->disks, same_set) 4120 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4121 if (rdev < rdev2 && 4122 rdev->bdev->bd_contains == 4123 rdev2->bdev->bd_contains) { 4124 printk(KERN_WARNING 4125 "%s: WARNING: %s appears to be" 4126 " on the same physical disk as" 4127 " %s.\n", 4128 mdname(mddev), 4129 bdevname(rdev->bdev,b), 4130 bdevname(rdev2->bdev,b2)); 4131 warned = 1; 4132 } 4133 } 4134 4135 if (warned) 4136 printk(KERN_WARNING 4137 "True protection against single-disk" 4138 " failure might be compromised.\n"); 4139 } 4140 4141 mddev->recovery = 0; 4142 /* may be over-ridden by personality */ 4143 mddev->resync_max_sectors = mddev->dev_sectors; 4144 4145 mddev->barriers_work = 1; 4146 mddev->ok_start_degraded = start_dirty_degraded; 4147 4148 if (start_readonly) 4149 mddev->ro = 2; /* read-only, but switch on first write */ 4150 4151 err = mddev->pers->run(mddev); 4152 if (err) 4153 printk(KERN_ERR "md: pers->run() failed ...\n"); 4154 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4155 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4156 " but 'external_size' not in effect?\n", __func__); 4157 printk(KERN_ERR 4158 "md: invalid array_size %llu > default size %llu\n", 4159 (unsigned long long)mddev->array_sectors / 2, 4160 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4161 err = -EINVAL; 4162 mddev->pers->stop(mddev); 4163 } 4164 if (err == 0 && mddev->pers->sync_request) { 4165 err = bitmap_create(mddev); 4166 if (err) { 4167 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4168 mdname(mddev), err); 4169 mddev->pers->stop(mddev); 4170 } 4171 } 4172 if (err) { 4173 module_put(mddev->pers->owner); 4174 mddev->pers = NULL; 4175 bitmap_destroy(mddev); 4176 return err; 4177 } 4178 if (mddev->pers->sync_request) { 4179 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4180 printk(KERN_WARNING 4181 "md: cannot register extra attributes for %s\n", 4182 mdname(mddev)); 4183 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4184 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4185 mddev->ro = 0; 4186 4187 atomic_set(&mddev->writes_pending,0); 4188 mddev->safemode = 0; 4189 mddev->safemode_timer.function = md_safemode_timeout; 4190 mddev->safemode_timer.data = (unsigned long) mddev; 4191 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4192 mddev->in_sync = 1; 4193 4194 list_for_each_entry(rdev, &mddev->disks, same_set) 4195 if (rdev->raid_disk >= 0) { 4196 char nm[20]; 4197 sprintf(nm, "rd%d", rdev->raid_disk); 4198 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4199 printk("md: cannot register %s for %s\n", 4200 nm, mdname(mddev)); 4201 } 4202 4203 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4204 4205 if (mddev->flags) 4206 md_update_sb(mddev, 0); 4207 4208 set_capacity(disk, mddev->array_sectors); 4209 4210 /* If there is a partially-recovered drive we need to 4211 * start recovery here. 
If we leave it to md_check_recovery, 4212 * it will remove the drives and not do the right thing 4213 */ 4214 if (mddev->degraded && !mddev->sync_thread) { 4215 int spares = 0; 4216 list_for_each_entry(rdev, &mddev->disks, same_set) 4217 if (rdev->raid_disk >= 0 && 4218 !test_bit(In_sync, &rdev->flags) && 4219 !test_bit(Faulty, &rdev->flags)) 4220 /* complete an interrupted recovery */ 4221 spares++; 4222 if (spares && mddev->pers->sync_request) { 4223 mddev->recovery = 0; 4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4225 mddev->sync_thread = md_register_thread(md_do_sync, 4226 mddev, 4227 "resync"); 4228 if (!mddev->sync_thread) { 4229 printk(KERN_ERR "%s: could not start resync" 4230 " thread...\n", 4231 mdname(mddev)); 4232 /* leave the spares where they are, it shouldn't hurt */ 4233 mddev->recovery = 0; 4234 } 4235 } 4236 } 4237 md_wakeup_thread(mddev->thread); 4238 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4239 4240 revalidate_disk(mddev->gendisk); 4241 mddev->changed = 1; 4242 md_new_event(mddev); 4243 sysfs_notify_dirent(mddev->sysfs_state); 4244 if (mddev->sysfs_action) 4245 sysfs_notify_dirent(mddev->sysfs_action); 4246 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4247 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4248 return 0; 4249 } 4250 4251 static int restart_array(mddev_t *mddev) 4252 { 4253 struct gendisk *disk = mddev->gendisk; 4254 4255 /* Complain if it has no devices */ 4256 if (list_empty(&mddev->disks)) 4257 return -ENXIO; 4258 if (!mddev->pers) 4259 return -EINVAL; 4260 if (!mddev->ro) 4261 return -EBUSY; 4262 mddev->safemode = 0; 4263 mddev->ro = 0; 4264 set_disk_ro(disk, 0); 4265 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4266 mdname(mddev)); 4267 /* Kick recovery or resync if necessary */ 4268 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4269 md_wakeup_thread(mddev->thread); 4270 md_wakeup_thread(mddev->sync_thread); 4271 sysfs_notify_dirent(mddev->sysfs_state); 4272 return 0; 4273 } 4274 4275 /* similar to deny_write_access, but accounts for our holding a reference 4276 * to the file ourselves */ 4277 static int deny_bitmap_write_access(struct file * file) 4278 { 4279 struct inode *inode = file->f_mapping->host; 4280 4281 spin_lock(&inode->i_lock); 4282 if (atomic_read(&inode->i_writecount) > 1) { 4283 spin_unlock(&inode->i_lock); 4284 return -ETXTBSY; 4285 } 4286 atomic_set(&inode->i_writecount, -1); 4287 spin_unlock(&inode->i_lock); 4288 4289 return 0; 4290 } 4291 4292 static void restore_bitmap_write_access(struct file *file) 4293 { 4294 struct inode *inode = file->f_mapping->host; 4295 4296 spin_lock(&inode->i_lock); 4297 atomic_set(&inode->i_writecount, 1); 4298 spin_unlock(&inode->i_lock); 4299 } 4300 4301 /* mode: 4302 * 0 - completely stop and dis-assemble array 4303 * 1 - switch to readonly 4304 * 2 - stop but do not disassemble array 4305 */ 4306 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4307 { 4308 int err = 0; 4309 struct gendisk *disk = mddev->gendisk; 4310 mdk_rdev_t *rdev; 4311 4312 mutex_lock(&mddev->open_mutex); 4313 if (atomic_read(&mddev->openers) > is_open) { 4314 printk("md: %s still in use.\n",mdname(mddev)); 4315 err = -EBUSY; 4316 } else if (mddev->pers) { 4317 4318 if (mddev->sync_thread) { 4319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4320 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4321 md_unregister_thread(mddev->sync_thread); 4322 mddev->sync_thread = NULL; 4323 } 4324 4325 del_timer_sync(&mddev->safemode_timer); 4326 4327 
switch(mode) { 4328 case 1: /* readonly */ 4329 err = -ENXIO; 4330 if (mddev->ro==1) 4331 goto out; 4332 mddev->ro = 1; 4333 break; 4334 case 0: /* disassemble */ 4335 case 2: /* stop */ 4336 bitmap_flush(mddev); 4337 md_super_wait(mddev); 4338 if (mddev->ro) 4339 set_disk_ro(disk, 0); 4340 4341 mddev->pers->stop(mddev); 4342 mddev->queue->merge_bvec_fn = NULL; 4343 mddev->queue->unplug_fn = NULL; 4344 mddev->queue->backing_dev_info.congested_fn = NULL; 4345 module_put(mddev->pers->owner); 4346 if (mddev->pers->sync_request) 4347 mddev->private = &md_redundancy_group; 4348 mddev->pers = NULL; 4349 /* tell userspace to handle 'inactive' */ 4350 sysfs_notify_dirent(mddev->sysfs_state); 4351 4352 list_for_each_entry(rdev, &mddev->disks, same_set) 4353 if (rdev->raid_disk >= 0) { 4354 char nm[20]; 4355 sprintf(nm, "rd%d", rdev->raid_disk); 4356 sysfs_remove_link(&mddev->kobj, nm); 4357 } 4358 4359 set_capacity(disk, 0); 4360 mddev->changed = 1; 4361 4362 if (mddev->ro) 4363 mddev->ro = 0; 4364 } 4365 if (!mddev->in_sync || mddev->flags) { 4366 /* mark array as shutdown cleanly */ 4367 mddev->in_sync = 1; 4368 md_update_sb(mddev, 1); 4369 } 4370 if (mode == 1) 4371 set_disk_ro(disk, 1); 4372 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4373 err = 0; 4374 } 4375 out: 4376 mutex_unlock(&mddev->open_mutex); 4377 if (err) 4378 return err; 4379 /* 4380 * Free resources if final stop 4381 */ 4382 if (mode == 0) { 4383 4384 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4385 4386 bitmap_destroy(mddev); 4387 if (mddev->bitmap_file) { 4388 restore_bitmap_write_access(mddev->bitmap_file); 4389 fput(mddev->bitmap_file); 4390 mddev->bitmap_file = NULL; 4391 } 4392 mddev->bitmap_offset = 0; 4393 4394 /* make sure all md_delayed_delete calls have finished */ 4395 flush_scheduled_work(); 4396 4397 export_array(mddev); 4398 4399 mddev->array_sectors = 0; 4400 mddev->external_size = 0; 4401 mddev->dev_sectors = 0; 4402 mddev->raid_disks = 0; 4403 mddev->recovery_cp = 0; 4404 mddev->resync_min = 0; 4405 mddev->resync_max = MaxSector; 4406 mddev->reshape_position = MaxSector; 4407 mddev->external = 0; 4408 mddev->persistent = 0; 4409 mddev->level = LEVEL_NONE; 4410 mddev->clevel[0] = 0; 4411 mddev->flags = 0; 4412 mddev->ro = 0; 4413 mddev->metadata_type[0] = 0; 4414 mddev->chunk_sectors = 0; 4415 mddev->ctime = mddev->utime = 0; 4416 mddev->layout = 0; 4417 mddev->max_disks = 0; 4418 mddev->events = 0; 4419 mddev->delta_disks = 0; 4420 mddev->new_level = LEVEL_NONE; 4421 mddev->new_layout = 0; 4422 mddev->new_chunk_sectors = 0; 4423 mddev->curr_resync = 0; 4424 mddev->resync_mismatches = 0; 4425 mddev->suspend_lo = mddev->suspend_hi = 0; 4426 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4427 mddev->recovery = 0; 4428 mddev->in_sync = 0; 4429 mddev->changed = 0; 4430 mddev->degraded = 0; 4431 mddev->barriers_work = 0; 4432 mddev->safemode = 0; 4433 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4434 if (mddev->hold_active == UNTIL_STOP) 4435 mddev->hold_active = 0; 4436 4437 } else if (mddev->pers) 4438 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4439 mdname(mddev)); 4440 err = 0; 4441 blk_integrity_unregister(disk); 4442 md_new_event(mddev); 4443 sysfs_notify_dirent(mddev->sysfs_state); 4444 return err; 4445 } 4446 4447 #ifndef MODULE 4448 static void autorun_array(mddev_t *mddev) 4449 { 4450 mdk_rdev_t *rdev; 4451 int err; 4452 4453 if (list_empty(&mddev->disks)) 4454 return; 4455 4456 printk(KERN_INFO "md: running: "); 4457 4458 list_for_each_entry(rdev, 
&mddev->disks, same_set) { 4459 char b[BDEVNAME_SIZE]; 4460 printk("<%s>", bdevname(rdev->bdev,b)); 4461 } 4462 printk("\n"); 4463 4464 err = do_md_run(mddev); 4465 if (err) { 4466 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4467 do_md_stop(mddev, 0, 0); 4468 } 4469 } 4470 4471 /* 4472 * lets try to run arrays based on all disks that have arrived 4473 * until now. (those are in pending_raid_disks) 4474 * 4475 * the method: pick the first pending disk, collect all disks with 4476 * the same UUID, remove all from the pending list and put them into 4477 * the 'same_array' list. Then order this list based on superblock 4478 * update time (freshest comes first), kick out 'old' disks and 4479 * compare superblocks. If everything's fine then run it. 4480 * 4481 * If "unit" is allocated, then bump its reference count 4482 */ 4483 static void autorun_devices(int part) 4484 { 4485 mdk_rdev_t *rdev0, *rdev, *tmp; 4486 mddev_t *mddev; 4487 char b[BDEVNAME_SIZE]; 4488 4489 printk(KERN_INFO "md: autorun ...\n"); 4490 while (!list_empty(&pending_raid_disks)) { 4491 int unit; 4492 dev_t dev; 4493 LIST_HEAD(candidates); 4494 rdev0 = list_entry(pending_raid_disks.next, 4495 mdk_rdev_t, same_set); 4496 4497 printk(KERN_INFO "md: considering %s ...\n", 4498 bdevname(rdev0->bdev,b)); 4499 INIT_LIST_HEAD(&candidates); 4500 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4501 if (super_90_load(rdev, rdev0, 0) >= 0) { 4502 printk(KERN_INFO "md: adding %s ...\n", 4503 bdevname(rdev->bdev,b)); 4504 list_move(&rdev->same_set, &candidates); 4505 } 4506 /* 4507 * now we have a set of devices, with all of them having 4508 * mostly sane superblocks. It's time to allocate the 4509 * mddev. 4510 */ 4511 if (part) { 4512 dev = MKDEV(mdp_major, 4513 rdev0->preferred_minor << MdpMinorShift); 4514 unit = MINOR(dev) >> MdpMinorShift; 4515 } else { 4516 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4517 unit = MINOR(dev); 4518 } 4519 if (rdev0->preferred_minor != unit) { 4520 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4521 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4522 break; 4523 } 4524 4525 md_probe(dev, NULL, NULL); 4526 mddev = mddev_find(dev); 4527 if (!mddev || !mddev->gendisk) { 4528 if (mddev) 4529 mddev_put(mddev); 4530 printk(KERN_ERR 4531 "md: cannot allocate memory for md drive.\n"); 4532 break; 4533 } 4534 if (mddev_lock(mddev)) 4535 printk(KERN_WARNING "md: %s locked, cannot run\n", 4536 mdname(mddev)); 4537 else if (mddev->raid_disks || mddev->major_version 4538 || !list_empty(&mddev->disks)) { 4539 printk(KERN_WARNING 4540 "md: %s already running, cannot run %s\n", 4541 mdname(mddev), bdevname(rdev0->bdev,b)); 4542 mddev_unlock(mddev); 4543 } else { 4544 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4545 mddev->persistent = 1; 4546 rdev_for_each_list(rdev, tmp, &candidates) { 4547 list_del_init(&rdev->same_set); 4548 if (bind_rdev_to_array(rdev, mddev)) 4549 export_rdev(rdev); 4550 } 4551 autorun_array(mddev); 4552 mddev_unlock(mddev); 4553 } 4554 /* on success, candidates will be empty, on error 4555 * it won't... 4556 */ 4557 rdev_for_each_list(rdev, tmp, &candidates) { 4558 list_del_init(&rdev->same_set); 4559 export_rdev(rdev); 4560 } 4561 mddev_put(mddev); 4562 } 4563 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4564 } 4565 #endif /* !MODULE */ 4566 4567 static int get_version(void __user * arg) 4568 { 4569 mdu_version_t ver; 4570 4571 ver.major = MD_MAJOR_VERSION; 4572 ver.minor = MD_MINOR_VERSION; 4573 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4574 4575 if (copy_to_user(arg, &ver, sizeof(ver))) 4576 return -EFAULT; 4577 4578 return 0; 4579 } 4580 4581 static int get_array_info(mddev_t * mddev, void __user * arg) 4582 { 4583 mdu_array_info_t info; 4584 int nr,working,insync,failed,spare; 4585 mdk_rdev_t *rdev; 4586 4587 nr=working=insync=failed=spare=0; 4588 list_for_each_entry(rdev, &mddev->disks, same_set) { 4589 nr++; 4590 if (test_bit(Faulty, &rdev->flags)) 4591 failed++; 4592 else { 4593 working++; 4594 if (test_bit(In_sync, &rdev->flags)) 4595 insync++; 4596 else 4597 spare++; 4598 } 4599 } 4600 4601 info.major_version = mddev->major_version; 4602 info.minor_version = mddev->minor_version; 4603 info.patch_version = MD_PATCHLEVEL_VERSION; 4604 info.ctime = mddev->ctime; 4605 info.level = mddev->level; 4606 info.size = mddev->dev_sectors / 2; 4607 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4608 info.size = -1; 4609 info.nr_disks = nr; 4610 info.raid_disks = mddev->raid_disks; 4611 info.md_minor = mddev->md_minor; 4612 info.not_persistent= !mddev->persistent; 4613 4614 info.utime = mddev->utime; 4615 info.state = 0; 4616 if (mddev->in_sync) 4617 info.state = (1<<MD_SB_CLEAN); 4618 if (mddev->bitmap && mddev->bitmap_offset) 4619 info.state = (1<<MD_SB_BITMAP_PRESENT); 4620 info.active_disks = insync; 4621 info.working_disks = working; 4622 info.failed_disks = failed; 4623 info.spare_disks = spare; 4624 4625 info.layout = mddev->layout; 4626 info.chunk_size = mddev->chunk_sectors << 9; 4627 4628 if (copy_to_user(arg, &info, sizeof(info))) 4629 return -EFAULT; 4630 4631 return 0; 4632 } 4633 4634 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4635 { 4636 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4637 char *ptr, *buf = NULL; 4638 int err = -ENOMEM; 4639 4640 if (md_allow_write(mddev)) 4641 file = kmalloc(sizeof(*file), GFP_NOIO); 4642 else 4643 file = kmalloc(sizeof(*file), GFP_KERNEL); 4644 4645 if (!file) 4646 goto out; 4647 4648 /* bitmap disabled, zero the first byte and copy out */ 4649 if (!mddev->bitmap || !mddev->bitmap->file) { 4650 file->pathname[0] = '\0'; 4651 goto copy_out; 4652 } 4653 4654 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4655 if (!buf) 4656 goto out; 4657 4658 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4659 if (IS_ERR(ptr)) 4660 goto out; 4661 4662 strcpy(file->pathname, ptr); 4663 4664 copy_out: 4665 err = 0; 4666 if (copy_to_user(arg, file, sizeof(*file))) 4667 err = -EFAULT; 4668 out: 4669 kfree(buf); 4670 kfree(file); 4671 return err; 4672 } 4673 4674 static int get_disk_info(mddev_t * mddev, void __user * arg) 4675 { 4676 mdu_disk_info_t info; 4677 mdk_rdev_t *rdev; 4678 4679 if (copy_from_user(&info, arg, sizeof(info))) 4680 return -EFAULT; 4681 4682 rdev = find_rdev_nr(mddev, info.number); 4683 if (rdev) { 4684 info.major = MAJOR(rdev->bdev->bd_dev); 4685 info.minor = MINOR(rdev->bdev->bd_dev); 4686 info.raid_disk = rdev->raid_disk; 4687 info.state = 0; 4688 if (test_bit(Faulty, &rdev->flags)) 4689 info.state |= (1<<MD_DISK_FAULTY); 4690 else if (test_bit(In_sync, &rdev->flags)) { 4691 info.state |= (1<<MD_DISK_ACTIVE); 4692 info.state |= (1<<MD_DISK_SYNC); 4693 } 4694 if (test_bit(WriteMostly, &rdev->flags)) 4695 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4696 
} else { 4697 info.major = info.minor = 0; 4698 info.raid_disk = -1; 4699 info.state = (1<<MD_DISK_REMOVED); 4700 } 4701 4702 if (copy_to_user(arg, &info, sizeof(info))) 4703 return -EFAULT; 4704 4705 return 0; 4706 } 4707 4708 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4709 { 4710 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4711 mdk_rdev_t *rdev; 4712 dev_t dev = MKDEV(info->major,info->minor); 4713 4714 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4715 return -EOVERFLOW; 4716 4717 if (!mddev->raid_disks) { 4718 int err; 4719 /* expecting a device which has a superblock */ 4720 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4721 if (IS_ERR(rdev)) { 4722 printk(KERN_WARNING 4723 "md: md_import_device returned %ld\n", 4724 PTR_ERR(rdev)); 4725 return PTR_ERR(rdev); 4726 } 4727 if (!list_empty(&mddev->disks)) { 4728 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4729 mdk_rdev_t, same_set); 4730 err = super_types[mddev->major_version] 4731 .load_super(rdev, rdev0, mddev->minor_version); 4732 if (err < 0) { 4733 printk(KERN_WARNING 4734 "md: %s has different UUID to %s\n", 4735 bdevname(rdev->bdev,b), 4736 bdevname(rdev0->bdev,b2)); 4737 export_rdev(rdev); 4738 return -EINVAL; 4739 } 4740 } 4741 err = bind_rdev_to_array(rdev, mddev); 4742 if (err) 4743 export_rdev(rdev); 4744 return err; 4745 } 4746 4747 /* 4748 * add_new_disk can be used once the array is assembled 4749 * to add "hot spares". They must already have a superblock 4750 * written 4751 */ 4752 if (mddev->pers) { 4753 int err; 4754 if (!mddev->pers->hot_add_disk) { 4755 printk(KERN_WARNING 4756 "%s: personality does not support diskops!\n", 4757 mdname(mddev)); 4758 return -EINVAL; 4759 } 4760 if (mddev->persistent) 4761 rdev = md_import_device(dev, mddev->major_version, 4762 mddev->minor_version); 4763 else 4764 rdev = md_import_device(dev, -1, -1); 4765 if (IS_ERR(rdev)) { 4766 printk(KERN_WARNING 4767 "md: md_import_device returned %ld\n", 4768 PTR_ERR(rdev)); 4769 return PTR_ERR(rdev); 4770 } 4771 /* set save_raid_disk if appropriate */ 4772 if (!mddev->persistent) { 4773 if (info->state & (1<<MD_DISK_SYNC) && 4774 info->raid_disk < mddev->raid_disks) 4775 rdev->raid_disk = info->raid_disk; 4776 else 4777 rdev->raid_disk = -1; 4778 } else 4779 super_types[mddev->major_version]. 4780 validate_super(mddev, rdev); 4781 rdev->saved_raid_disk = rdev->raid_disk; 4782 4783 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4784 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4785 set_bit(WriteMostly, &rdev->flags); 4786 else 4787 clear_bit(WriteMostly, &rdev->flags); 4788 4789 rdev->raid_disk = -1; 4790 err = bind_rdev_to_array(rdev, mddev); 4791 if (!err && !mddev->pers->hot_remove_disk) { 4792 /* If there is hot_add_disk but no hot_remove_disk 4793 * then added disks for geometry changes, 4794 * and should be added immediately. 4795 */ 4796 super_types[mddev->major_version]. 
4797 validate_super(mddev, rdev); 4798 err = mddev->pers->hot_add_disk(mddev, rdev); 4799 if (err) 4800 unbind_rdev_from_array(rdev); 4801 } 4802 if (err) 4803 export_rdev(rdev); 4804 else 4805 sysfs_notify_dirent(rdev->sysfs_state); 4806 4807 md_update_sb(mddev, 1); 4808 if (mddev->degraded) 4809 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4810 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4811 md_wakeup_thread(mddev->thread); 4812 return err; 4813 } 4814 4815 /* otherwise, add_new_disk is only allowed 4816 * for major_version==0 superblocks 4817 */ 4818 if (mddev->major_version != 0) { 4819 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4820 mdname(mddev)); 4821 return -EINVAL; 4822 } 4823 4824 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4825 int err; 4826 rdev = md_import_device(dev, -1, 0); 4827 if (IS_ERR(rdev)) { 4828 printk(KERN_WARNING 4829 "md: error, md_import_device() returned %ld\n", 4830 PTR_ERR(rdev)); 4831 return PTR_ERR(rdev); 4832 } 4833 rdev->desc_nr = info->number; 4834 if (info->raid_disk < mddev->raid_disks) 4835 rdev->raid_disk = info->raid_disk; 4836 else 4837 rdev->raid_disk = -1; 4838 4839 if (rdev->raid_disk < mddev->raid_disks) 4840 if (info->state & (1<<MD_DISK_SYNC)) 4841 set_bit(In_sync, &rdev->flags); 4842 4843 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4844 set_bit(WriteMostly, &rdev->flags); 4845 4846 if (!mddev->persistent) { 4847 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4848 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4849 } else 4850 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4851 rdev->sectors = rdev->sb_start; 4852 4853 err = bind_rdev_to_array(rdev, mddev); 4854 if (err) { 4855 export_rdev(rdev); 4856 return err; 4857 } 4858 } 4859 4860 return 0; 4861 } 4862 4863 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4864 { 4865 char b[BDEVNAME_SIZE]; 4866 mdk_rdev_t *rdev; 4867 4868 rdev = find_rdev(mddev, dev); 4869 if (!rdev) 4870 return -ENXIO; 4871 4872 if (rdev->raid_disk >= 0) 4873 goto busy; 4874 4875 kick_rdev_from_array(rdev); 4876 md_update_sb(mddev, 1); 4877 md_new_event(mddev); 4878 4879 return 0; 4880 busy: 4881 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4882 bdevname(rdev->bdev,b), mdname(mddev)); 4883 return -EBUSY; 4884 } 4885 4886 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4887 { 4888 char b[BDEVNAME_SIZE]; 4889 int err; 4890 mdk_rdev_t *rdev; 4891 4892 if (!mddev->pers) 4893 return -ENODEV; 4894 4895 if (mddev->major_version != 0) { 4896 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4897 " version-0 superblocks.\n", 4898 mdname(mddev)); 4899 return -EINVAL; 4900 } 4901 if (!mddev->pers->hot_add_disk) { 4902 printk(KERN_WARNING 4903 "%s: personality does not support diskops!\n", 4904 mdname(mddev)); 4905 return -EINVAL; 4906 } 4907 4908 rdev = md_import_device(dev, -1, 0); 4909 if (IS_ERR(rdev)) { 4910 printk(KERN_WARNING 4911 "md: error, md_import_device() returned %ld\n", 4912 PTR_ERR(rdev)); 4913 return -EINVAL; 4914 } 4915 4916 if (mddev->persistent) 4917 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4918 else 4919 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4920 4921 rdev->sectors = rdev->sb_start; 4922 4923 if (test_bit(Faulty, &rdev->flags)) { 4924 printk(KERN_WARNING 4925 "md: can not hot-add faulty %s disk to %s!\n", 4926 bdevname(rdev->bdev,b), mdname(mddev)); 4927 err = -EINVAL; 4928 goto abort_export; 4929 } 4930 clear_bit(In_sync, &rdev->flags); 4931 rdev->desc_nr = -1; 4932 rdev->saved_raid_disk = -1; 4933 err = 
bind_rdev_to_array(rdev, mddev); 4934 if (err) 4935 goto abort_export; 4936 4937 /* 4938 * The rest should better be atomic, we can have disk failures 4939 * noticed in interrupt contexts ... 4940 */ 4941 4942 rdev->raid_disk = -1; 4943 4944 md_update_sb(mddev, 1); 4945 4946 /* 4947 * Kick recovery, maybe this spare has to be added to the 4948 * array immediately. 4949 */ 4950 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4951 md_wakeup_thread(mddev->thread); 4952 md_new_event(mddev); 4953 return 0; 4954 4955 abort_export: 4956 export_rdev(rdev); 4957 return err; 4958 } 4959 4960 static int set_bitmap_file(mddev_t *mddev, int fd) 4961 { 4962 int err; 4963 4964 if (mddev->pers) { 4965 if (!mddev->pers->quiesce) 4966 return -EBUSY; 4967 if (mddev->recovery || mddev->sync_thread) 4968 return -EBUSY; 4969 /* we should be able to change the bitmap.. */ 4970 } 4971 4972 4973 if (fd >= 0) { 4974 if (mddev->bitmap) 4975 return -EEXIST; /* cannot add when bitmap is present */ 4976 mddev->bitmap_file = fget(fd); 4977 4978 if (mddev->bitmap_file == NULL) { 4979 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4980 mdname(mddev)); 4981 return -EBADF; 4982 } 4983 4984 err = deny_bitmap_write_access(mddev->bitmap_file); 4985 if (err) { 4986 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4987 mdname(mddev)); 4988 fput(mddev->bitmap_file); 4989 mddev->bitmap_file = NULL; 4990 return err; 4991 } 4992 mddev->bitmap_offset = 0; /* file overrides offset */ 4993 } else if (mddev->bitmap == NULL) 4994 return -ENOENT; /* cannot remove what isn't there */ 4995 err = 0; 4996 if (mddev->pers) { 4997 mddev->pers->quiesce(mddev, 1); 4998 if (fd >= 0) 4999 err = bitmap_create(mddev); 5000 if (fd < 0 || err) { 5001 bitmap_destroy(mddev); 5002 fd = -1; /* make sure to put the file */ 5003 } 5004 mddev->pers->quiesce(mddev, 0); 5005 } 5006 if (fd < 0) { 5007 if (mddev->bitmap_file) { 5008 restore_bitmap_write_access(mddev->bitmap_file); 5009 fput(mddev->bitmap_file); 5010 } 5011 mddev->bitmap_file = NULL; 5012 } 5013 5014 return err; 5015 } 5016 5017 /* 5018 * set_array_info is used two different ways 5019 * The original usage is when creating a new array. 5020 * In this usage, raid_disks is > 0 and it together with 5021 * level, size, not_persistent,layout,chunksize determine the 5022 * shape of the array. 5023 * This will always create an array with a type-0.90.0 superblock. 5024 * The newer usage is when assembling an array. 5025 * In this case raid_disks will be 0, and the major_version field is 5026 * use to determine which style super-blocks are to be found on the devices. 5027 * The minor and patch _version numbers are also kept incase the 5028 * super_block handler wishes to interpret them. 5029 */ 5030 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 5031 { 5032 5033 if (info->raid_disks == 0) { 5034 /* just setting version number for superblock loading */ 5035 if (info->major_version < 0 || 5036 info->major_version >= ARRAY_SIZE(super_types) || 5037 super_types[info->major_version].name == NULL) { 5038 /* maybe try to auto-load a module? 
*/ 5039 printk(KERN_INFO 5040 "md: superblock version %d not known\n", 5041 info->major_version); 5042 return -EINVAL; 5043 } 5044 mddev->major_version = info->major_version; 5045 mddev->minor_version = info->minor_version; 5046 mddev->patch_version = info->patch_version; 5047 mddev->persistent = !info->not_persistent; 5048 return 0; 5049 } 5050 mddev->major_version = MD_MAJOR_VERSION; 5051 mddev->minor_version = MD_MINOR_VERSION; 5052 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5053 mddev->ctime = get_seconds(); 5054 5055 mddev->level = info->level; 5056 mddev->clevel[0] = 0; 5057 mddev->dev_sectors = 2 * (sector_t)info->size; 5058 mddev->raid_disks = info->raid_disks; 5059 /* don't set md_minor, it is determined by which /dev/md* was 5060 * openned 5061 */ 5062 if (info->state & (1<<MD_SB_CLEAN)) 5063 mddev->recovery_cp = MaxSector; 5064 else 5065 mddev->recovery_cp = 0; 5066 mddev->persistent = ! info->not_persistent; 5067 mddev->external = 0; 5068 5069 mddev->layout = info->layout; 5070 mddev->chunk_sectors = info->chunk_size >> 9; 5071 5072 mddev->max_disks = MD_SB_DISKS; 5073 5074 if (mddev->persistent) 5075 mddev->flags = 0; 5076 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5077 5078 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5079 mddev->bitmap_offset = 0; 5080 5081 mddev->reshape_position = MaxSector; 5082 5083 /* 5084 * Generate a 128 bit UUID 5085 */ 5086 get_random_bytes(mddev->uuid, 16); 5087 5088 mddev->new_level = mddev->level; 5089 mddev->new_chunk_sectors = mddev->chunk_sectors; 5090 mddev->new_layout = mddev->layout; 5091 mddev->delta_disks = 0; 5092 5093 return 0; 5094 } 5095 5096 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5097 { 5098 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5099 5100 if (mddev->external_size) 5101 return; 5102 5103 mddev->array_sectors = array_sectors; 5104 } 5105 EXPORT_SYMBOL(md_set_array_sectors); 5106 5107 static int update_size(mddev_t *mddev, sector_t num_sectors) 5108 { 5109 mdk_rdev_t *rdev; 5110 int rv; 5111 int fit = (num_sectors == 0); 5112 5113 if (mddev->pers->resize == NULL) 5114 return -EINVAL; 5115 /* The "num_sectors" is the number of sectors of each device that 5116 * is used. This can only make sense for arrays with redundancy. 5117 * linear and raid0 always use whatever space is available. We can only 5118 * consider changing this number if no resync or reconstruction is 5119 * happening, and if the new size is acceptable. It must fit before the 5120 * sb_start or, if that is <data_offset, it must fit before the size 5121 * of each device. If num_sectors is zero, we find the largest size 5122 * that fits. 5123 5124 */ 5125 if (mddev->sync_thread) 5126 return -EBUSY; 5127 if (mddev->bitmap) 5128 /* Sorry, cannot grow a bitmap yet, just remove it, 5129 * grow, and re-add. 
5130 */ 5131 return -EBUSY; 5132 list_for_each_entry(rdev, &mddev->disks, same_set) { 5133 sector_t avail = rdev->sectors; 5134 5135 if (fit && (num_sectors == 0 || num_sectors > avail)) 5136 num_sectors = avail; 5137 if (avail < num_sectors) 5138 return -ENOSPC; 5139 } 5140 rv = mddev->pers->resize(mddev, num_sectors); 5141 if (!rv) 5142 revalidate_disk(mddev->gendisk); 5143 return rv; 5144 } 5145 5146 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5147 { 5148 int rv; 5149 /* change the number of raid disks */ 5150 if (mddev->pers->check_reshape == NULL) 5151 return -EINVAL; 5152 if (raid_disks <= 0 || 5153 raid_disks >= mddev->max_disks) 5154 return -EINVAL; 5155 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5156 return -EBUSY; 5157 mddev->delta_disks = raid_disks - mddev->raid_disks; 5158 5159 rv = mddev->pers->check_reshape(mddev); 5160 return rv; 5161 } 5162 5163 5164 /* 5165 * update_array_info is used to change the configuration of an 5166 * on-line array. 5167 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5168 * fields in the info are checked against the array. 5169 * Any differences that cannot be handled will cause an error. 5170 * Normally, only one change can be managed at a time. 5171 */ 5172 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5173 { 5174 int rv = 0; 5175 int cnt = 0; 5176 int state = 0; 5177 5178 /* calculate expected state,ignoring low bits */ 5179 if (mddev->bitmap && mddev->bitmap_offset) 5180 state |= (1 << MD_SB_BITMAP_PRESENT); 5181 5182 if (mddev->major_version != info->major_version || 5183 mddev->minor_version != info->minor_version || 5184 /* mddev->patch_version != info->patch_version || */ 5185 mddev->ctime != info->ctime || 5186 mddev->level != info->level || 5187 /* mddev->layout != info->layout || */ 5188 !mddev->persistent != info->not_persistent|| 5189 mddev->chunk_sectors != info->chunk_size >> 9 || 5190 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5191 ((state^info->state) & 0xfffffe00) 5192 ) 5193 return -EINVAL; 5194 /* Check there is only one change */ 5195 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5196 cnt++; 5197 if (mddev->raid_disks != info->raid_disks) 5198 cnt++; 5199 if (mddev->layout != info->layout) 5200 cnt++; 5201 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5202 cnt++; 5203 if (cnt == 0) 5204 return 0; 5205 if (cnt > 1) 5206 return -EINVAL; 5207 5208 if (mddev->layout != info->layout) { 5209 /* Change layout 5210 * we don't need to do anything at the md level, the 5211 * personality will take care of it all. 
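 * (if check_reshape() rejects the request, new_layout is simply reset
 * to the current layout below and the error is returned)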
5212 */ 5213 if (mddev->pers->check_reshape == NULL) 5214 return -EINVAL; 5215 else { 5216 mddev->new_layout = info->layout; 5217 rv = mddev->pers->check_reshape(mddev); 5218 if (rv) 5219 mddev->new_layout = mddev->layout; 5220 return rv; 5221 } 5222 } 5223 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5224 rv = update_size(mddev, (sector_t)info->size * 2); 5225 5226 if (mddev->raid_disks != info->raid_disks) 5227 rv = update_raid_disks(mddev, info->raid_disks); 5228 5229 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5230 if (mddev->pers->quiesce == NULL) 5231 return -EINVAL; 5232 if (mddev->recovery || mddev->sync_thread) 5233 return -EBUSY; 5234 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5235 /* add the bitmap */ 5236 if (mddev->bitmap) 5237 return -EEXIST; 5238 if (mddev->default_bitmap_offset == 0) 5239 return -EINVAL; 5240 mddev->bitmap_offset = mddev->default_bitmap_offset; 5241 mddev->pers->quiesce(mddev, 1); 5242 rv = bitmap_create(mddev); 5243 if (rv) 5244 bitmap_destroy(mddev); 5245 mddev->pers->quiesce(mddev, 0); 5246 } else { 5247 /* remove the bitmap */ 5248 if (!mddev->bitmap) 5249 return -ENOENT; 5250 if (mddev->bitmap->file) 5251 return -EINVAL; 5252 mddev->pers->quiesce(mddev, 1); 5253 bitmap_destroy(mddev); 5254 mddev->pers->quiesce(mddev, 0); 5255 mddev->bitmap_offset = 0; 5256 } 5257 } 5258 md_update_sb(mddev, 1); 5259 return rv; 5260 } 5261 5262 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5263 { 5264 mdk_rdev_t *rdev; 5265 5266 if (mddev->pers == NULL) 5267 return -ENODEV; 5268 5269 rdev = find_rdev(mddev, dev); 5270 if (!rdev) 5271 return -ENODEV; 5272 5273 md_error(mddev, rdev); 5274 return 0; 5275 } 5276 5277 /* 5278 * We have a problem here : there is no easy way to give a CHS 5279 * virtual geometry. We currently pretend that we have a 2 heads 5280 * 4 sectors (with a BIG number of cylinders...). This drives 5281 * dosfs just mad... 
;-) 5282 */ 5283 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5284 { 5285 mddev_t *mddev = bdev->bd_disk->private_data; 5286 5287 geo->heads = 2; 5288 geo->sectors = 4; 5289 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5290 return 0; 5291 } 5292 5293 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5294 unsigned int cmd, unsigned long arg) 5295 { 5296 int err = 0; 5297 void __user *argp = (void __user *)arg; 5298 mddev_t *mddev = NULL; 5299 5300 if (!capable(CAP_SYS_ADMIN)) 5301 return -EACCES; 5302 5303 /* 5304 * Commands dealing with the RAID driver but not any 5305 * particular array: 5306 */ 5307 switch (cmd) 5308 { 5309 case RAID_VERSION: 5310 err = get_version(argp); 5311 goto done; 5312 5313 case PRINT_RAID_DEBUG: 5314 err = 0; 5315 md_print_devices(); 5316 goto done; 5317 5318 #ifndef MODULE 5319 case RAID_AUTORUN: 5320 err = 0; 5321 autostart_arrays(arg); 5322 goto done; 5323 #endif 5324 default:; 5325 } 5326 5327 /* 5328 * Commands creating/starting a new array: 5329 */ 5330 5331 mddev = bdev->bd_disk->private_data; 5332 5333 if (!mddev) { 5334 BUG(); 5335 goto abort; 5336 } 5337 5338 err = mddev_lock(mddev); 5339 if (err) { 5340 printk(KERN_INFO 5341 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5342 err, cmd); 5343 goto abort; 5344 } 5345 5346 switch (cmd) 5347 { 5348 case SET_ARRAY_INFO: 5349 { 5350 mdu_array_info_t info; 5351 if (!arg) 5352 memset(&info, 0, sizeof(info)); 5353 else if (copy_from_user(&info, argp, sizeof(info))) { 5354 err = -EFAULT; 5355 goto abort_unlock; 5356 } 5357 if (mddev->pers) { 5358 err = update_array_info(mddev, &info); 5359 if (err) { 5360 printk(KERN_WARNING "md: couldn't update" 5361 " array info. %d\n", err); 5362 goto abort_unlock; 5363 } 5364 goto done_unlock; 5365 } 5366 if (!list_empty(&mddev->disks)) { 5367 printk(KERN_WARNING 5368 "md: array %s already has disks!\n", 5369 mdname(mddev)); 5370 err = -EBUSY; 5371 goto abort_unlock; 5372 } 5373 if (mddev->raid_disks) { 5374 printk(KERN_WARNING 5375 "md: array %s already initialised!\n", 5376 mdname(mddev)); 5377 err = -EBUSY; 5378 goto abort_unlock; 5379 } 5380 err = set_array_info(mddev, &info); 5381 if (err) { 5382 printk(KERN_WARNING "md: couldn't set" 5383 " array info. 
%d\n", err); 5384 goto abort_unlock; 5385 } 5386 } 5387 goto done_unlock; 5388 5389 default:; 5390 } 5391 5392 /* 5393 * Commands querying/configuring an existing array: 5394 */ 5395 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5396 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5397 if ((!mddev->raid_disks && !mddev->external) 5398 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5399 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5400 && cmd != GET_BITMAP_FILE) { 5401 err = -ENODEV; 5402 goto abort_unlock; 5403 } 5404 5405 /* 5406 * Commands even a read-only array can execute: 5407 */ 5408 switch (cmd) 5409 { 5410 case GET_ARRAY_INFO: 5411 err = get_array_info(mddev, argp); 5412 goto done_unlock; 5413 5414 case GET_BITMAP_FILE: 5415 err = get_bitmap_file(mddev, argp); 5416 goto done_unlock; 5417 5418 case GET_DISK_INFO: 5419 err = get_disk_info(mddev, argp); 5420 goto done_unlock; 5421 5422 case RESTART_ARRAY_RW: 5423 err = restart_array(mddev); 5424 goto done_unlock; 5425 5426 case STOP_ARRAY: 5427 err = do_md_stop(mddev, 0, 1); 5428 goto done_unlock; 5429 5430 case STOP_ARRAY_RO: 5431 err = do_md_stop(mddev, 1, 1); 5432 goto done_unlock; 5433 5434 } 5435 5436 /* 5437 * The remaining ioctls are changing the state of the 5438 * superblock, so we do not allow them on read-only arrays. 5439 * However non-MD ioctls (e.g. get-size) will still come through 5440 * here and hit the 'default' below, so only disallow 5441 * 'md' ioctls, and switch to rw mode if started auto-readonly. 5442 */ 5443 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5444 if (mddev->ro == 2) { 5445 mddev->ro = 0; 5446 sysfs_notify_dirent(mddev->sysfs_state); 5447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5448 md_wakeup_thread(mddev->thread); 5449 } else { 5450 err = -EROFS; 5451 goto abort_unlock; 5452 } 5453 } 5454 5455 switch (cmd) 5456 { 5457 case ADD_NEW_DISK: 5458 { 5459 mdu_disk_info_t info; 5460 if (copy_from_user(&info, argp, sizeof(info))) 5461 err = -EFAULT; 5462 else 5463 err = add_new_disk(mddev, &info); 5464 goto done_unlock; 5465 } 5466 5467 case HOT_REMOVE_DISK: 5468 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5469 goto done_unlock; 5470 5471 case HOT_ADD_DISK: 5472 err = hot_add_disk(mddev, new_decode_dev(arg)); 5473 goto done_unlock; 5474 5475 case SET_DISK_FAULTY: 5476 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5477 goto done_unlock; 5478 5479 case RUN_ARRAY: 5480 err = do_md_run(mddev); 5481 goto done_unlock; 5482 5483 case SET_BITMAP_FILE: 5484 err = set_bitmap_file(mddev, (int)arg); 5485 goto done_unlock; 5486 5487 default: 5488 err = -EINVAL; 5489 goto abort_unlock; 5490 } 5491 5492 done_unlock: 5493 abort_unlock: 5494 if (mddev->hold_active == UNTIL_IOCTL && 5495 err != -EINVAL) 5496 mddev->hold_active = 0; 5497 mddev_unlock(mddev); 5498 5499 return err; 5500 done: 5501 if (err) 5502 MD_BUG(); 5503 abort: 5504 return err; 5505 } 5506 5507 static int md_open(struct block_device *bdev, fmode_t mode) 5508 { 5509 /* 5510 * Succeed if we can lock the mddev, which confirms that 5511 * it isn't being stopped right now. 5512 */ 5513 mddev_t *mddev = mddev_find(bdev->bd_dev); 5514 int err; 5515 5516 if (mddev->gendisk != bdev->bd_disk) { 5517 /* we are racing with mddev_put which is discarding this 5518 * bd_disk. 
5519 */ 5520 mddev_put(mddev); 5521 /* Wait until bdev->bd_disk is definitely gone */ 5522 flush_scheduled_work(); 5523 /* Then retry the open from the top */ 5524 return -ERESTARTSYS; 5525 } 5526 BUG_ON(mddev != bdev->bd_disk->private_data); 5527 5528 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 5529 goto out; 5530 5531 err = 0; 5532 atomic_inc(&mddev->openers); 5533 mutex_unlock(&mddev->open_mutex); 5534 5535 check_disk_change(bdev); 5536 out: 5537 return err; 5538 } 5539 5540 static int md_release(struct gendisk *disk, fmode_t mode) 5541 { 5542 mddev_t *mddev = disk->private_data; 5543 5544 BUG_ON(!mddev); 5545 atomic_dec(&mddev->openers); 5546 mddev_put(mddev); 5547 5548 return 0; 5549 } 5550 5551 static int md_media_changed(struct gendisk *disk) 5552 { 5553 mddev_t *mddev = disk->private_data; 5554 5555 return mddev->changed; 5556 } 5557 5558 static int md_revalidate(struct gendisk *disk) 5559 { 5560 mddev_t *mddev = disk->private_data; 5561 5562 mddev->changed = 0; 5563 return 0; 5564 } 5565 static const struct block_device_operations md_fops = 5566 { 5567 .owner = THIS_MODULE, 5568 .open = md_open, 5569 .release = md_release, 5570 .ioctl = md_ioctl, 5571 .getgeo = md_getgeo, 5572 .media_changed = md_media_changed, 5573 .revalidate_disk = md_revalidate, 5574 }; 5575 5576 static int md_thread(void * arg) 5577 { 5578 mdk_thread_t *thread = arg; 5579 5580 /* 5581 * md_thread is a 'system-thread', its priority should be very 5582 * high. We avoid resource deadlocks individually in each 5583 * raid personality. (RAID5 does preallocation) We also use RR and 5584 * the very same RT priority as kswapd, thus we will never get 5585 * into a priority inversion deadlock. 5586 * 5587 * we definitely have to have equal or higher priority than 5588 * bdflush, otherwise bdflush will deadlock if there are too 5589 * many dirty RAID5 blocks. 5590 */ 5591 5592 allow_signal(SIGKILL); 5593 while (!kthread_should_stop()) { 5594 5595 /* We need to wait INTERRUPTIBLE so that 5596 * we don't add to the load-average.
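 * (an uninterruptible sleep would be counted towards the load average)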
5597 * That means we need to be sure no signals are 5598 * pending 5599 */ 5600 if (signal_pending(current)) 5601 flush_signals(current); 5602 5603 wait_event_interruptible_timeout 5604 (thread->wqueue, 5605 test_bit(THREAD_WAKEUP, &thread->flags) 5606 || kthread_should_stop(), 5607 thread->timeout); 5608 5609 clear_bit(THREAD_WAKEUP, &thread->flags); 5610 5611 thread->run(thread->mddev); 5612 } 5613 5614 return 0; 5615 } 5616 5617 void md_wakeup_thread(mdk_thread_t *thread) 5618 { 5619 if (thread) { 5620 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5621 set_bit(THREAD_WAKEUP, &thread->flags); 5622 wake_up(&thread->wqueue); 5623 } 5624 } 5625 5626 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5627 const char *name) 5628 { 5629 mdk_thread_t *thread; 5630 5631 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5632 if (!thread) 5633 return NULL; 5634 5635 init_waitqueue_head(&thread->wqueue); 5636 5637 thread->run = run; 5638 thread->mddev = mddev; 5639 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5640 thread->tsk = kthread_run(md_thread, thread, 5641 "%s_%s", 5642 mdname(thread->mddev), 5643 name ?: mddev->pers->name); 5644 if (IS_ERR(thread->tsk)) { 5645 kfree(thread); 5646 return NULL; 5647 } 5648 return thread; 5649 } 5650 5651 void md_unregister_thread(mdk_thread_t *thread) 5652 { 5653 if (!thread) 5654 return; 5655 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5656 5657 kthread_stop(thread->tsk); 5658 kfree(thread); 5659 } 5660 5661 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5662 { 5663 if (!mddev) { 5664 MD_BUG(); 5665 return; 5666 } 5667 5668 if (!rdev || test_bit(Faulty, &rdev->flags)) 5669 return; 5670 5671 if (mddev->external) 5672 set_bit(Blocked, &rdev->flags); 5673 /* 5674 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5675 mdname(mddev), 5676 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5677 __builtin_return_address(0),__builtin_return_address(1), 5678 __builtin_return_address(2),__builtin_return_address(3)); 5679 */ 5680 if (!mddev->pers) 5681 return; 5682 if (!mddev->pers->error_handler) 5683 return; 5684 mddev->pers->error_handler(mddev,rdev); 5685 if (mddev->degraded) 5686 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5687 set_bit(StateChanged, &rdev->flags); 5688 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5689 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5690 md_wakeup_thread(mddev->thread); 5691 md_new_event_inintr(mddev); 5692 } 5693 5694 /* seq_file implementation /proc/mdstat */ 5695 5696 static void status_unused(struct seq_file *seq) 5697 { 5698 int i = 0; 5699 mdk_rdev_t *rdev; 5700 5701 seq_printf(seq, "unused devices: "); 5702 5703 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 5704 char b[BDEVNAME_SIZE]; 5705 i++; 5706 seq_printf(seq, "%s ", 5707 bdevname(rdev->bdev,b)); 5708 } 5709 if (!i) 5710 seq_printf(seq, "<none>"); 5711 5712 seq_printf(seq, "\n"); 5713 } 5714 5715 5716 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5717 { 5718 sector_t max_sectors, resync, res; 5719 unsigned long dt, db; 5720 sector_t rt; 5721 int scale; 5722 unsigned int per_milli; 5723 5724 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 5725 5726 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5727 max_sectors = mddev->resync_max_sectors; 5728 else 5729 max_sectors = mddev->dev_sectors; 5730 5731 /* 5732 * Should not happen. 
5733 */ 5734 if (!max_sectors) { 5735 MD_BUG(); 5736 return; 5737 } 5738 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5739 * in a sector_t, and (max_sectors>>scale) will fit in a 5740 * u32, as those are the requirements for sector_div. 5741 * Thus 'scale' must be at least 10 5742 */ 5743 scale = 10; 5744 if (sizeof(sector_t) > sizeof(unsigned long)) { 5745 while ( max_sectors/2 > (1ULL<<(scale+32))) 5746 scale++; 5747 } 5748 res = (resync>>scale)*1000; 5749 sector_div(res, (u32)((max_sectors>>scale)+1)); 5750 5751 per_milli = res; 5752 { 5753 int i, x = per_milli/50, y = 20-x; 5754 seq_printf(seq, "["); 5755 for (i = 0; i < x; i++) 5756 seq_printf(seq, "="); 5757 seq_printf(seq, ">"); 5758 for (i = 0; i < y; i++) 5759 seq_printf(seq, "."); 5760 seq_printf(seq, "] "); 5761 } 5762 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5763 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5764 "reshape" : 5765 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5766 "check" : 5767 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5768 "resync" : "recovery"))), 5769 per_milli/10, per_milli % 10, 5770 (unsigned long long) resync/2, 5771 (unsigned long long) max_sectors/2); 5772 5773 /* 5774 * dt: time from mark until now 5775 * db: blocks written from mark until now 5776 * rt: remaining time 5777 * 5778 * rt is a sector_t, so could be 32bit or 64bit. 5779 * So we divide before multiply in case it is 32bit and close 5780 * to the limit. 5781 * We scale the divisor (db) by 32 to avoid loosing precision 5782 * near the end of resync when the number of remaining sectors 5783 * is close to 'db'. 5784 * We then divide rt by 32 after multiplying by db to compensate. 5785 * The '+1' avoids division by zero if db is very small. 5786 */ 5787 dt = ((jiffies - mddev->resync_mark) / HZ); 5788 if (!dt) dt++; 5789 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5790 - mddev->resync_mark_cnt; 5791 5792 rt = max_sectors - resync; /* number of remaining sectors */ 5793 sector_div(rt, db/32+1); 5794 rt *= dt; 5795 rt >>= 5; 5796 5797 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 5798 ((unsigned long)rt % 60)/6); 5799 5800 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5801 } 5802 5803 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5804 { 5805 struct list_head *tmp; 5806 loff_t l = *pos; 5807 mddev_t *mddev; 5808 5809 if (l >= 0x10000) 5810 return NULL; 5811 if (!l--) 5812 /* header */ 5813 return (void*)1; 5814 5815 spin_lock(&all_mddevs_lock); 5816 list_for_each(tmp,&all_mddevs) 5817 if (!l--) { 5818 mddev = list_entry(tmp, mddev_t, all_mddevs); 5819 mddev_get(mddev); 5820 spin_unlock(&all_mddevs_lock); 5821 return mddev; 5822 } 5823 spin_unlock(&all_mddevs_lock); 5824 if (!l--) 5825 return (void*)2;/* tail */ 5826 return NULL; 5827 } 5828 5829 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5830 { 5831 struct list_head *tmp; 5832 mddev_t *next_mddev, *mddev = v; 5833 5834 ++*pos; 5835 if (v == (void*)2) 5836 return NULL; 5837 5838 spin_lock(&all_mddevs_lock); 5839 if (v == (void*)1) 5840 tmp = all_mddevs.next; 5841 else 5842 tmp = mddev->all_mddevs.next; 5843 if (tmp != &all_mddevs) 5844 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5845 else { 5846 next_mddev = (void*)2; 5847 *pos = 0x10000; 5848 } 5849 spin_unlock(&all_mddevs_lock); 5850 5851 if (v != (void*)1) 5852 mddev_put(mddev); 5853 return next_mddev; 5854 5855 } 5856 5857 static void md_seq_stop(struct seq_file *seq, void *v) 5858 { 5859 mddev_t *mddev = v; 5860 5861 if 
(mddev && v != (void*)1 && v != (void*)2) 5862 mddev_put(mddev); 5863 } 5864 5865 struct mdstat_info { 5866 int event; 5867 }; 5868 5869 static int md_seq_show(struct seq_file *seq, void *v) 5870 { 5871 mddev_t *mddev = v; 5872 sector_t sectors; 5873 mdk_rdev_t *rdev; 5874 struct mdstat_info *mi = seq->private; 5875 struct bitmap *bitmap; 5876 5877 if (v == (void*)1) { 5878 struct mdk_personality *pers; 5879 seq_printf(seq, "Personalities : "); 5880 spin_lock(&pers_lock); 5881 list_for_each_entry(pers, &pers_list, list) 5882 seq_printf(seq, "[%s] ", pers->name); 5883 5884 spin_unlock(&pers_lock); 5885 seq_printf(seq, "\n"); 5886 mi->event = atomic_read(&md_event_count); 5887 return 0; 5888 } 5889 if (v == (void*)2) { 5890 status_unused(seq); 5891 return 0; 5892 } 5893 5894 if (mddev_lock(mddev) < 0) 5895 return -EINTR; 5896 5897 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5898 seq_printf(seq, "%s : %sactive", mdname(mddev), 5899 mddev->pers ? "" : "in"); 5900 if (mddev->pers) { 5901 if (mddev->ro==1) 5902 seq_printf(seq, " (read-only)"); 5903 if (mddev->ro==2) 5904 seq_printf(seq, " (auto-read-only)"); 5905 seq_printf(seq, " %s", mddev->pers->name); 5906 } 5907 5908 sectors = 0; 5909 list_for_each_entry(rdev, &mddev->disks, same_set) { 5910 char b[BDEVNAME_SIZE]; 5911 seq_printf(seq, " %s[%d]", 5912 bdevname(rdev->bdev,b), rdev->desc_nr); 5913 if (test_bit(WriteMostly, &rdev->flags)) 5914 seq_printf(seq, "(W)"); 5915 if (test_bit(Faulty, &rdev->flags)) { 5916 seq_printf(seq, "(F)"); 5917 continue; 5918 } else if (rdev->raid_disk < 0) 5919 seq_printf(seq, "(S)"); /* spare */ 5920 sectors += rdev->sectors; 5921 } 5922 5923 if (!list_empty(&mddev->disks)) { 5924 if (mddev->pers) 5925 seq_printf(seq, "\n %llu blocks", 5926 (unsigned long long) 5927 mddev->array_sectors / 2); 5928 else 5929 seq_printf(seq, "\n %llu blocks", 5930 (unsigned long long)sectors / 2); 5931 } 5932 if (mddev->persistent) { 5933 if (mddev->major_version != 0 || 5934 mddev->minor_version != 90) { 5935 seq_printf(seq," super %d.%d", 5936 mddev->major_version, 5937 mddev->minor_version); 5938 } 5939 } else if (mddev->external) 5940 seq_printf(seq, " super external:%s", 5941 mddev->metadata_type); 5942 else 5943 seq_printf(seq, " super non-persistent"); 5944 5945 if (mddev->pers) { 5946 mddev->pers->status(seq, mddev); 5947 seq_printf(seq, "\n "); 5948 if (mddev->pers->sync_request) { 5949 if (mddev->curr_resync > 2) { 5950 status_resync(seq, mddev); 5951 seq_printf(seq, "\n "); 5952 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5953 seq_printf(seq, "\tresync=DELAYED\n "); 5954 else if (mddev->recovery_cp < MaxSector) 5955 seq_printf(seq, "\tresync=PENDING\n "); 5956 } 5957 } else 5958 seq_printf(seq, "\n "); 5959 5960 if ((bitmap = mddev->bitmap)) { 5961 unsigned long chunk_kb; 5962 unsigned long flags; 5963 spin_lock_irqsave(&bitmap->lock, flags); 5964 chunk_kb = bitmap->chunksize >> 10; 5965 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5966 "%lu%s chunk", 5967 bitmap->pages - bitmap->missing_pages, 5968 bitmap->pages, 5969 (bitmap->pages - bitmap->missing_pages) 5970 << (PAGE_SHIFT - 10), 5971 chunk_kb ? chunk_kb : bitmap->chunksize, 5972 chunk_kb ? 
"KB" : "B"); 5973 if (bitmap->file) { 5974 seq_printf(seq, ", file: "); 5975 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5976 } 5977 5978 seq_printf(seq, "\n"); 5979 spin_unlock_irqrestore(&bitmap->lock, flags); 5980 } 5981 5982 seq_printf(seq, "\n"); 5983 } 5984 mddev_unlock(mddev); 5985 5986 return 0; 5987 } 5988 5989 static const struct seq_operations md_seq_ops = { 5990 .start = md_seq_start, 5991 .next = md_seq_next, 5992 .stop = md_seq_stop, 5993 .show = md_seq_show, 5994 }; 5995 5996 static int md_seq_open(struct inode *inode, struct file *file) 5997 { 5998 int error; 5999 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 6000 if (mi == NULL) 6001 return -ENOMEM; 6002 6003 error = seq_open(file, &md_seq_ops); 6004 if (error) 6005 kfree(mi); 6006 else { 6007 struct seq_file *p = file->private_data; 6008 p->private = mi; 6009 mi->event = atomic_read(&md_event_count); 6010 } 6011 return error; 6012 } 6013 6014 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6015 { 6016 struct seq_file *m = filp->private_data; 6017 struct mdstat_info *mi = m->private; 6018 int mask; 6019 6020 poll_wait(filp, &md_event_waiters, wait); 6021 6022 /* always allow read */ 6023 mask = POLLIN | POLLRDNORM; 6024 6025 if (mi->event != atomic_read(&md_event_count)) 6026 mask |= POLLERR | POLLPRI; 6027 return mask; 6028 } 6029 6030 static const struct file_operations md_seq_fops = { 6031 .owner = THIS_MODULE, 6032 .open = md_seq_open, 6033 .read = seq_read, 6034 .llseek = seq_lseek, 6035 .release = seq_release_private, 6036 .poll = mdstat_poll, 6037 }; 6038 6039 int register_md_personality(struct mdk_personality *p) 6040 { 6041 spin_lock(&pers_lock); 6042 list_add_tail(&p->list, &pers_list); 6043 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6044 spin_unlock(&pers_lock); 6045 return 0; 6046 } 6047 6048 int unregister_md_personality(struct mdk_personality *p) 6049 { 6050 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6051 spin_lock(&pers_lock); 6052 list_del_init(&p->list); 6053 spin_unlock(&pers_lock); 6054 return 0; 6055 } 6056 6057 static int is_mddev_idle(mddev_t *mddev, int init) 6058 { 6059 mdk_rdev_t * rdev; 6060 int idle; 6061 int curr_events; 6062 6063 idle = 1; 6064 rcu_read_lock(); 6065 rdev_for_each_rcu(rdev, mddev) { 6066 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6067 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6068 (int)part_stat_read(&disk->part0, sectors[1]) - 6069 atomic_read(&disk->sync_io); 6070 /* sync IO will cause sync_io to increase before the disk_stats 6071 * as sync_io is counted when a request starts, and 6072 * disk_stats is counted when it completes. 6073 * So resync activity will cause curr_events to be smaller than 6074 * when there was no such activity. 6075 * non-sync IO will cause disk_stat to increase without 6076 * increasing sync_io so curr_events will (eventually) 6077 * be larger than it was before. Once it becomes 6078 * substantially larger, the test below will cause 6079 * the array to appear non-idle, and resync will slow 6080 * down. 6081 * If there is a lot of outstanding resync activity when 6082 * we set last_event to curr_events, then all that activity 6083 * completing might cause the array to appear non-idle 6084 * and resync will be slowed down even though there might 6085 * not have been non-resync activity. This will only 6086 * happen once though. 
		 * 'last_events' will soon reflect the state where there
		 * are few or no outstanding resync requests, and further
		 * resync activity will always make curr_events less than
		 * last_events.
		 *
		 */
		if (init || curr_events - rdev->last_events > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	rcu_read_unlock();
	return idle;
}

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512 byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}


/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
	int did_change = 0;
	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	atomic_inc(&mddev->writes_pending);
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	if (mddev->in_sync) {
		spin_lock_irq(&mddev->write_lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock_irq(&mddev->write_lock);
	}
	if (did_change)
		sysfs_notify_dirent(mddev->sysfs_state);
	wait_event(mddev->sb_wait,
		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
}

void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else if (mddev->safemode_delay)
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}

/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 *
 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev_lock
 * is dropped, so return -EAGAIN after notifying userspace.
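 *
 * A minimal illustrative call pattern from a personality (variable names
 * here are only an example), with mddev_lock already held:
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;	// -EAGAIN: the metadata change is still pending
 *	buf = kmalloc(len, GFP_KERNEL);	// may now block without deadlocking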
 */
int md_allow_write(mddev_t *mddev)
{
	if (!mddev->pers)
		return 0;
	if (mddev->ro)
		return 0;
	if (!mddev->pers->sync_request)
		return 0;

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
		spin_unlock_irq(&mddev->write_lock);
		md_update_sb(mddev, 0);
		sysfs_notify_dirent(mddev->sysfs_state);
	} else
		spin_unlock_irq(&mddev->write_lock);

	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
		return -EAGAIN;
	else
		return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors;
	unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	mdk_rdev_t *rdev;
	char *desc;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if (mddev->ro) /* never try to sync a read-only array */
		return;

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
			desc = "data-check";
		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			desc = "requested-resync";
		else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on the address of
	 * the mddev structure).
	 * This will mean we have to start checking from the beginning again.
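	 *
	 * For example (illustration only): if two arrays sharing a physical
	 * disk both reach this check with curr_resync == 2, the mddev with
	 * the lower address drops to 1 and waits on resync_wait, while the
	 * other keeps 2 and proceeds; the waiter re-runs the scan from
	 * "try_again" when it is woken.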
6250 * 6251 */ 6252 6253 do { 6254 mddev->curr_resync = 2; 6255 6256 try_again: 6257 if (kthread_should_stop()) { 6258 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6259 goto skip; 6260 } 6261 for_each_mddev(mddev2, tmp) { 6262 if (mddev2 == mddev) 6263 continue; 6264 if (!mddev->parallel_resync 6265 && mddev2->curr_resync 6266 && match_mddev_units(mddev, mddev2)) { 6267 DEFINE_WAIT(wq); 6268 if (mddev < mddev2 && mddev->curr_resync == 2) { 6269 /* arbitrarily yield */ 6270 mddev->curr_resync = 1; 6271 wake_up(&resync_wait); 6272 } 6273 if (mddev > mddev2 && mddev->curr_resync == 1) 6274 /* no need to wait here, we can wait the next 6275 * time 'round when curr_resync == 2 6276 */ 6277 continue; 6278 /* We need to wait 'interruptible' so as not to 6279 * contribute to the load average, and not to 6280 * be caught by 'softlockup' 6281 */ 6282 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6283 if (!kthread_should_stop() && 6284 mddev2->curr_resync >= mddev->curr_resync) { 6285 printk(KERN_INFO "md: delaying %s of %s" 6286 " until %s has finished (they" 6287 " share one or more physical units)\n", 6288 desc, mdname(mddev), mdname(mddev2)); 6289 mddev_put(mddev2); 6290 if (signal_pending(current)) 6291 flush_signals(current); 6292 schedule(); 6293 finish_wait(&resync_wait, &wq); 6294 goto try_again; 6295 } 6296 finish_wait(&resync_wait, &wq); 6297 } 6298 } 6299 } while (mddev->curr_resync < 2); 6300 6301 j = 0; 6302 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6303 /* resync follows the size requested by the personality, 6304 * which defaults to physical size, but can be virtual size 6305 */ 6306 max_sectors = mddev->resync_max_sectors; 6307 mddev->resync_mismatches = 0; 6308 /* we don't use the checkpoint if there's a bitmap */ 6309 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6310 j = mddev->resync_min; 6311 else if (!mddev->bitmap) 6312 j = mddev->recovery_cp; 6313 6314 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6315 max_sectors = mddev->dev_sectors; 6316 else { 6317 /* recovery follows the physical size of devices */ 6318 max_sectors = mddev->dev_sectors; 6319 j = MaxSector; 6320 list_for_each_entry(rdev, &mddev->disks, same_set) 6321 if (rdev->raid_disk >= 0 && 6322 !test_bit(Faulty, &rdev->flags) && 6323 !test_bit(In_sync, &rdev->flags) && 6324 rdev->recovery_offset < j) 6325 j = rdev->recovery_offset; 6326 } 6327 6328 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6329 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6330 " %d KB/sec/disk.\n", speed_min(mddev)); 6331 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6332 "(but not more than %d KB/sec) for %s.\n", 6333 speed_max(mddev), desc); 6334 6335 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6336 6337 io_sectors = 0; 6338 for (m = 0; m < SYNC_MARKS; m++) { 6339 mark[m] = jiffies; 6340 mark_cnt[m] = io_sectors; 6341 } 6342 last_mark = 0; 6343 mddev->resync_mark = mark[last_mark]; 6344 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6345 6346 /* 6347 * Tune reconstruction: 6348 */ 6349 window = 32*(PAGE_SIZE/512); 6350 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6351 window/2,(unsigned long long) max_sectors/2); 6352 6353 atomic_set(&mddev->recovery_active, 0); 6354 last_check = 0; 6355 6356 if (j>2) { 6357 printk(KERN_INFO 6358 "md: resuming %s of %s from checkpoint.\n", 6359 desc, mdname(mddev)); 6360 mddev->curr_resync = j; 6361 } 6362 6363 while (j < max_sectors) { 6364 sector_t sectors; 6365 6366 
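		/* One pass of the rebuild loop (a summary, not new logic):
		 * ask the personality for the next stretch via ->sync_request(),
		 * account the IO, honour resync_max and the speed limits, and
		 * step the rate-measurement marks roughly every SYNC_MARK_STEP.
		 */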
		skipped = 0;

		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed
			    )) {
			/* time to update curr_resync_completed */
			blk_unplug(mddev->queue);
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed =
				mddev->curr_resync;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max && !kthread_should_stop()) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || kthread_should_stop());
		}

		if (kthread_should_stop())
			goto interrupted;

		sectors = mddev->pers->sync_request(mddev, j, &skipped,
						    currspeed < speed_min(mddev));
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		j += sectors;
		if (j>1) mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that the rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (kthread_should_stop())
			goto interrupted;


		/*
		 * this loop exits only when we are slower than the 'hard'
		 * speed limit, or when the system was IO-idle for a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem.
(things like an 6451 * e2fsck being done on the RAID array should execute fast) 6452 */ 6453 blk_unplug(mddev->queue); 6454 cond_resched(); 6455 6456 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6457 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6458 6459 if (currspeed > speed_min(mddev)) { 6460 if ((currspeed > speed_max(mddev)) || 6461 !is_mddev_idle(mddev, 0)) { 6462 msleep(500); 6463 goto repeat; 6464 } 6465 } 6466 } 6467 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6468 /* 6469 * this also signals 'finished resyncing' to md_stop 6470 */ 6471 out: 6472 blk_unplug(mddev->queue); 6473 6474 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6475 6476 /* tell personality that we are finished */ 6477 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6478 6479 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6480 mddev->curr_resync > 2) { 6481 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6483 if (mddev->curr_resync >= mddev->recovery_cp) { 6484 printk(KERN_INFO 6485 "md: checkpointing %s of %s.\n", 6486 desc, mdname(mddev)); 6487 mddev->recovery_cp = mddev->curr_resync; 6488 } 6489 } else 6490 mddev->recovery_cp = MaxSector; 6491 } else { 6492 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6493 mddev->curr_resync = MaxSector; 6494 list_for_each_entry(rdev, &mddev->disks, same_set) 6495 if (rdev->raid_disk >= 0 && 6496 !test_bit(Faulty, &rdev->flags) && 6497 !test_bit(In_sync, &rdev->flags) && 6498 rdev->recovery_offset < mddev->curr_resync) 6499 rdev->recovery_offset = mddev->curr_resync; 6500 } 6501 } 6502 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6503 6504 skip: 6505 mddev->curr_resync = 0; 6506 mddev->curr_resync_completed = 0; 6507 mddev->resync_min = 0; 6508 mddev->resync_max = MaxSector; 6509 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6510 wake_up(&resync_wait); 6511 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6512 md_wakeup_thread(mddev->thread); 6513 return; 6514 6515 interrupted: 6516 /* 6517 * got a signal, exit. 6518 */ 6519 printk(KERN_INFO 6520 "md: md_do_sync() got signal ... exiting\n"); 6521 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6522 goto out; 6523 6524 } 6525 EXPORT_SYMBOL_GPL(md_do_sync); 6526 6527 6528 static int remove_and_add_spares(mddev_t *mddev) 6529 { 6530 mdk_rdev_t *rdev; 6531 int spares = 0; 6532 6533 mddev->curr_resync_completed = 0; 6534 6535 list_for_each_entry(rdev, &mddev->disks, same_set) 6536 if (rdev->raid_disk >= 0 && 6537 !test_bit(Blocked, &rdev->flags) && 6538 (test_bit(Faulty, &rdev->flags) || 6539 ! test_bit(In_sync, &rdev->flags)) && 6540 atomic_read(&rdev->nr_pending)==0) { 6541 if (mddev->pers->hot_remove_disk( 6542 mddev, rdev->raid_disk)==0) { 6543 char nm[20]; 6544 sprintf(nm,"rd%d", rdev->raid_disk); 6545 sysfs_remove_link(&mddev->kobj, nm); 6546 rdev->raid_disk = -1; 6547 } 6548 } 6549 6550 if (mddev->degraded && ! 
	    mddev->ro && !mddev->recovery_disabled) {
		list_for_each_entry(rdev, &mddev->disks, same_set) {
			if (rdev->raid_disk >= 0 &&
			    !test_bit(In_sync, &rdev->flags) &&
			    !test_bit(Blocked, &rdev->flags))
				spares++;
			if (rdev->raid_disk < 0
			    && !test_bit(Faulty, &rdev->flags)) {
				rdev->recovery_offset = 0;
				if (mddev->pers->
				    hot_add_disk(mddev, rdev) == 0) {
					char nm[20];
					sprintf(nm, "rd%d", rdev->raid_disk);
					if (sysfs_create_link(&mddev->kobj,
							      &rdev->kobj, nm))
						printk(KERN_WARNING
						       "md: cannot register "
						       "%s for %s\n",
						       nm, mdname(mddev));
					spares++;
					md_new_event(mddev);
				} else
					break;
			}
		}
	}
	return spares;
}
/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;


	if (mddev->bitmap)
		bitmap_daemon_work(mddev->bitmap);

	if (mddev->ro)
		return;

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( ! (
		(mddev->flags && !mddev->external) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
		&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;

		if (mddev->ro) {
			/* Only thing we do on a ro array is remove
			 * failed devices.
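			 * (remove_and_add_spares() itself skips adding spares
			 * while the array is read-only, so this is purely a
			 * cleanup pass.)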
			 */
			remove_and_add_spares(mddev);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}

		if (!mddev->external) {
			int did_change = 0;
			spin_lock_irq(&mddev->write_lock);
			if (mddev->safemode &&
			    !atomic_read(&mddev->writes_pending) &&
			    !mddev->in_sync &&
			    mddev->recovery_cp == MaxSector) {
				mddev->in_sync = 1;
				did_change = 1;
				if (mddev->persistent)
					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			}
			if (mddev->safemode == 1)
				mddev->safemode = 0;
			spin_unlock_irq(&mddev->write_lock);
			if (did_change)
				sysfs_notify_dirent(mddev->sysfs_state);
		}

		if (mddev->flags)
			md_update_sb(mddev, 0);

		list_for_each_entry(rdev, &mddev->disks, same_set)
			if (test_and_clear_bit(StateChanged, &rdev->flags))
				sysfs_notify_dirent(rdev->sysfs_state);


		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				if (mddev->pers->spare_active(mddev))
					sysfs_notify(&mddev->kobj, NULL,
						     "degraded");
			}
			if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    mddev->pers->finish_reshape)
				mddev->pers->finish_reshape(mddev);
			md_update_sb(mddev, 1);

			/* if array is no longer degraded, then any saved_raid_disk
			 * information must be scrapped
			 */
			if (!mddev->degraded)
				list_for_each_entry(rdev, &mddev->disks, same_set)
					rdev->saved_raid_disk = -1;

			mddev->recovery = 0;
			/* flag recovery needed just to double check */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			sysfs_notify_dirent(mddev->sysfs_action);
			md_new_event(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto unlock;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
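		 * If remove_and_add_spares() reports newly added spares we
		 * start a recovery pass below; otherwise, if recovery_cp is
		 * still below MaxSector, a plain resync is started instead.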
		 */

		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto unlock;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto unlock;

		if (mddev->pers->sync_request) {
			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "%s: could not start resync"
					" thread...\n",
					mdname(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else
				md_wakeup_thread(mddev->sync_thread);
			sysfs_notify_dirent(mddev->sysfs_action);
			md_new_event(mddev);
		}
	unlock:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent(mddev->sysfs_action);
		}
		mddev_unlock(mddev);
	}
}

void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
	sysfs_notify_dirent(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		for_each_mddev(mddev, tmp)
			if (mddev_trylock(mddev)) {
				/* Force a switch to readonly even if the
				 * array appears to still be in use.  Hence
				 * the '100'.
				 */
				do_md_stop(mddev, 1, 100);
				mddev_unlock(mddev);
			}
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
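		 * so pause for a moment (the mdelay(1000) below) to give such
		 * devices a chance to settle before the reboot continues.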
6815 */ 6816 mdelay(1000*1); 6817 } 6818 return NOTIFY_DONE; 6819 } 6820 6821 static struct notifier_block md_notifier = { 6822 .notifier_call = md_notify_reboot, 6823 .next = NULL, 6824 .priority = INT_MAX, /* before any real devices */ 6825 }; 6826 6827 static void md_geninit(void) 6828 { 6829 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6830 6831 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 6832 } 6833 6834 static int __init md_init(void) 6835 { 6836 if (register_blkdev(MD_MAJOR, "md")) 6837 return -1; 6838 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6839 unregister_blkdev(MD_MAJOR, "md"); 6840 return -1; 6841 } 6842 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 6843 md_probe, NULL, NULL); 6844 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6845 md_probe, NULL, NULL); 6846 6847 register_reboot_notifier(&md_notifier); 6848 raid_table_header = register_sysctl_table(raid_root_table); 6849 6850 md_geninit(); 6851 return 0; 6852 } 6853 6854 6855 #ifndef MODULE 6856 6857 /* 6858 * Searches all registered partitions for autorun RAID arrays 6859 * at boot time. 6860 */ 6861 6862 static LIST_HEAD(all_detected_devices); 6863 struct detected_devices_node { 6864 struct list_head list; 6865 dev_t dev; 6866 }; 6867 6868 void md_autodetect_dev(dev_t dev) 6869 { 6870 struct detected_devices_node *node_detected_dev; 6871 6872 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 6873 if (node_detected_dev) { 6874 node_detected_dev->dev = dev; 6875 list_add_tail(&node_detected_dev->list, &all_detected_devices); 6876 } else { 6877 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 6878 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 6879 } 6880 } 6881 6882 6883 static void autostart_arrays(int part) 6884 { 6885 mdk_rdev_t *rdev; 6886 struct detected_devices_node *node_detected_dev; 6887 dev_t dev; 6888 int i_scanned, i_passed; 6889 6890 i_scanned = 0; 6891 i_passed = 0; 6892 6893 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 6894 6895 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 6896 i_scanned++; 6897 node_detected_dev = list_entry(all_detected_devices.next, 6898 struct detected_devices_node, list); 6899 list_del(&node_detected_dev->list); 6900 dev = node_detected_dev->dev; 6901 kfree(node_detected_dev); 6902 rdev = md_import_device(dev,0, 90); 6903 if (IS_ERR(rdev)) 6904 continue; 6905 6906 if (test_bit(Faulty, &rdev->flags)) { 6907 MD_BUG(); 6908 continue; 6909 } 6910 set_bit(AutoDetected, &rdev->flags); 6911 list_add(&rdev->same_set, &pending_raid_disks); 6912 i_passed++; 6913 } 6914 6915 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 6916 i_scanned, i_passed); 6917 6918 autorun_devices(part); 6919 } 6920 6921 #endif /* !MODULE */ 6922 6923 static __exit void md_exit(void) 6924 { 6925 mddev_t *mddev; 6926 struct list_head *tmp; 6927 6928 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 6929 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6930 6931 unregister_blkdev(MD_MAJOR,"md"); 6932 unregister_blkdev(mdp_major, "mdp"); 6933 unregister_reboot_notifier(&md_notifier); 6934 unregister_sysctl_table(raid_table_header); 6935 remove_proc_entry("mdstat", NULL); 6936 for_each_mddev(mddev, tmp) { 6937 export_array(mddev); 6938 mddev->hold_active = 0; 6939 } 6940 } 6941 6942 subsys_initcall(md_init); 6943 module_exit(md_exit) 6944 6945 static int get_ro(char *buffer, struct kernel_param *kp) 6946 { 6947 return sprintf(buffer, "%d", 
start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
	char *e;
	int num = simple_strtoul(val, &e, 10);
	if (*val && (*e == '\0' || *e == '\n')) {
		start_readonly = num;
		return 0;
	}
	return -EINVAL;
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);

module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);

EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
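
/*
 * Illustrative userspace knobs for the module parameters above (not taken
 * from this file; paths assume the module is known to sysfs as md_mod):
 *
 *	echo 1 > /sys/module/md_mod/parameters/start_ro
 *	echo md_home > /sys/module/md_mod/parameters/new_array	(example name)
 */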