1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/kthread.h> 36 #include <linux/blkdev.h> 37 #include <linux/sysctl.h> 38 #include <linux/seq_file.h> 39 #include <linux/buffer_head.h> /* for invalidate_bdev */ 40 #include <linux/poll.h> 41 #include <linux/ctype.h> 42 #include <linux/hdreg.h> 43 #include <linux/proc_fs.h> 44 #include <linux/random.h> 45 #include <linux/reboot.h> 46 #include <linux/file.h> 47 #include <linux/delay.h> 48 #include <linux/raid/md_p.h> 49 #include <linux/raid/md_u.h> 50 #include "md.h" 51 #include "bitmap.h" 52 53 #define DEBUG 0 54 #define dprintk(x...) ((void)(DEBUG && printk(x))) 55 56 57 #ifndef MODULE 58 static void autostart_arrays(int part); 59 #endif 60 61 static LIST_HEAD(pers_list); 62 static DEFINE_SPINLOCK(pers_lock); 63 64 static void md_print_devices(void); 65 66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 67 68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 69 70 /* 71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 72 * is 1000 KB/sec, so the extra system load does not show up that much. 73 * Increase it if you want to have more _guaranteed_ speed. Note that 74 * the RAID driver will use the maximum available bandwidth if the IO 75 * subsystem is idle. There is also an 'absolute maximum' reconstruction 76 * speed limit - in case reconstruction slows down your system despite 77 * idle IO detection. 78 * 79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 80 * or /sys/block/mdX/md/sync_speed_{min,max} 81 */ 82 83 static int sysctl_speed_limit_min = 1000; 84 static int sysctl_speed_limit_max = 200000; 85 static inline int speed_min(mddev_t *mddev) 86 { 87 return mddev->sync_speed_min ? 88 mddev->sync_speed_min : sysctl_speed_limit_min; 89 } 90 91 static inline int speed_max(mddev_t *mddev) 92 { 93 return mddev->sync_speed_max ? 
94 mddev->sync_speed_max : sysctl_speed_limit_max; 95 } 96 97 static struct ctl_table_header *raid_table_header; 98 99 static ctl_table raid_table[] = { 100 { 101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 102 .procname = "speed_limit_min", 103 .data = &sysctl_speed_limit_min, 104 .maxlen = sizeof(int), 105 .mode = S_IRUGO|S_IWUSR, 106 .proc_handler = &proc_dointvec, 107 }, 108 { 109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 110 .procname = "speed_limit_max", 111 .data = &sysctl_speed_limit_max, 112 .maxlen = sizeof(int), 113 .mode = S_IRUGO|S_IWUSR, 114 .proc_handler = &proc_dointvec, 115 }, 116 { .ctl_name = 0 } 117 }; 118 119 static ctl_table raid_dir_table[] = { 120 { 121 .ctl_name = DEV_RAID, 122 .procname = "raid", 123 .maxlen = 0, 124 .mode = S_IRUGO|S_IXUGO, 125 .child = raid_table, 126 }, 127 { .ctl_name = 0 } 128 }; 129 130 static ctl_table raid_root_table[] = { 131 { 132 .ctl_name = CTL_DEV, 133 .procname = "dev", 134 .maxlen = 0, 135 .mode = 0555, 136 .child = raid_dir_table, 137 }, 138 { .ctl_name = 0 } 139 }; 140 141 static struct block_device_operations md_fops; 142 143 static int start_readonly; 144 145 /* 146 * We have a system wide 'event count' that is incremented 147 * on any 'interesting' event, and readers of /proc/mdstat 148 * can use 'poll' or 'select' to find out when the event 149 * count increases. 150 * 151 * Events are: 152 * start array, stop array, error, add device, remove device, 153 * start build, activate spare 154 */ 155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 156 static atomic_t md_event_count; 157 void md_new_event(mddev_t *mddev) 158 { 159 atomic_inc(&md_event_count); 160 wake_up(&md_event_waiters); 161 } 162 EXPORT_SYMBOL_GPL(md_new_event); 163 164 /* Alternate version that can be called from interrupts 165 * when calling sysfs_notify isn't needed. 166 */ 167 static void md_new_event_inintr(mddev_t *mddev) 168 { 169 atomic_inc(&md_event_count); 170 wake_up(&md_event_waiters); 171 } 172 173 /* 174 * Enables to iterate over all existing md arrays 175 * all_mddevs_lock protects this list. 176 */ 177 static LIST_HEAD(all_mddevs); 178 static DEFINE_SPINLOCK(all_mddevs_lock); 179 180 181 /* 182 * iterates through all used mddevs in the system. 183 * We take care to grab the all_mddevs_lock whenever navigating 184 * the list, and to always hold a refcount when unlocked. 185 * Any code which breaks out of this loop while own 186 * a reference to the current mddev and must mddev_put it. 187 */ 188 #define for_each_mddev(mddev,tmp) \ 189 \ 190 for (({ spin_lock(&all_mddevs_lock); \ 191 tmp = all_mddevs.next; \ 192 mddev = NULL;}); \ 193 ({ if (tmp != &all_mddevs) \ 194 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 195 spin_unlock(&all_mddevs_lock); \ 196 if (mddev) mddev_put(mddev); \ 197 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 198 tmp != &all_mddevs;}); \ 199 ({ spin_lock(&all_mddevs_lock); \ 200 tmp = tmp->next;}) \ 201 ) 202 203 204 /* Rather than calling directly into the personality make_request function, 205 * IO requests come here first so that we can check if the device is 206 * being suspended pending a reconfiguration. 207 * We hold a refcount over the call to ->make_request. By the time that 208 * call has finished, the bio has been linked into some internal structure 209 * and so is visible to ->quiesce(), so we don't need the refcount any more. 
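 * (mddev_suspend() below is the other half of this handshake: it sets
 * ->suspended, waits for ->active_io to drain to zero and only then calls
 * ->quiesce(), so no new request can be inside the personality by then.)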
210 */ 211 static int md_make_request(struct request_queue *q, struct bio *bio) 212 { 213 mddev_t *mddev = q->queuedata; 214 int rv; 215 if (mddev == NULL || mddev->pers == NULL) { 216 bio_io_error(bio); 217 return 0; 218 } 219 rcu_read_lock(); 220 if (mddev->suspended) { 221 DEFINE_WAIT(__wait); 222 for (;;) { 223 prepare_to_wait(&mddev->sb_wait, &__wait, 224 TASK_UNINTERRUPTIBLE); 225 if (!mddev->suspended) 226 break; 227 rcu_read_unlock(); 228 schedule(); 229 rcu_read_lock(); 230 } 231 finish_wait(&mddev->sb_wait, &__wait); 232 } 233 atomic_inc(&mddev->active_io); 234 rcu_read_unlock(); 235 rv = mddev->pers->make_request(q, bio); 236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 237 wake_up(&mddev->sb_wait); 238 239 return rv; 240 } 241 242 static void mddev_suspend(mddev_t *mddev) 243 { 244 BUG_ON(mddev->suspended); 245 mddev->suspended = 1; 246 synchronize_rcu(); 247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 248 mddev->pers->quiesce(mddev, 1); 249 md_unregister_thread(mddev->thread); 250 mddev->thread = NULL; 251 /* we now know that no code is executing in the personality module, 252 * except possibly the tail end of a ->bi_end_io function, but that 253 * is certain to complete before the module has a chance to get 254 * unloaded 255 */ 256 } 257 258 static void mddev_resume(mddev_t *mddev) 259 { 260 mddev->suspended = 0; 261 wake_up(&mddev->sb_wait); 262 mddev->pers->quiesce(mddev, 0); 263 } 264 265 266 static inline mddev_t *mddev_get(mddev_t *mddev) 267 { 268 atomic_inc(&mddev->active); 269 return mddev; 270 } 271 272 static void mddev_delayed_delete(struct work_struct *ws); 273 274 static void mddev_put(mddev_t *mddev) 275 { 276 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 277 return; 278 if (!mddev->raid_disks && list_empty(&mddev->disks) && 279 !mddev->hold_active) { 280 list_del(&mddev->all_mddevs); 281 if (mddev->gendisk) { 282 /* we did a probe so need to clean up. 283 * Call schedule_work inside the spinlock 284 * so that flush_scheduled_work() after 285 * mddev_find will succeed in waiting for the 286 * work to be done. 287 */ 288 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 289 schedule_work(&mddev->del_work); 290 } else 291 kfree(mddev); 292 } 293 spin_unlock(&all_mddevs_lock); 294 } 295 296 static mddev_t * mddev_find(dev_t unit) 297 { 298 mddev_t *mddev, *new = NULL; 299 300 retry: 301 spin_lock(&all_mddevs_lock); 302 303 if (unit) { 304 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 305 if (mddev->unit == unit) { 306 mddev_get(mddev); 307 spin_unlock(&all_mddevs_lock); 308 kfree(new); 309 return mddev; 310 } 311 312 if (new) { 313 list_add(&new->all_mddevs, &all_mddevs); 314 spin_unlock(&all_mddevs_lock); 315 new->hold_active = UNTIL_IOCTL; 316 return new; 317 } 318 } else if (new) { 319 /* find an unused unit number */ 320 static int next_minor = 512; 321 int start = next_minor; 322 int is_free = 0; 323 int dev = 0; 324 while (!is_free) { 325 dev = MKDEV(MD_MAJOR, next_minor); 326 next_minor++; 327 if (next_minor > MINORMASK) 328 next_minor = 0; 329 if (next_minor == start) { 330 /* Oh dear, all in use. 
*/ 331 spin_unlock(&all_mddevs_lock); 332 kfree(new); 333 return NULL; 334 } 335 336 is_free = 1; 337 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 338 if (mddev->unit == dev) { 339 is_free = 0; 340 break; 341 } 342 } 343 new->unit = dev; 344 new->md_minor = MINOR(dev); 345 new->hold_active = UNTIL_STOP; 346 list_add(&new->all_mddevs, &all_mddevs); 347 spin_unlock(&all_mddevs_lock); 348 return new; 349 } 350 spin_unlock(&all_mddevs_lock); 351 352 new = kzalloc(sizeof(*new), GFP_KERNEL); 353 if (!new) 354 return NULL; 355 356 new->unit = unit; 357 if (MAJOR(unit) == MD_MAJOR) 358 new->md_minor = MINOR(unit); 359 else 360 new->md_minor = MINOR(unit) >> MdpMinorShift; 361 362 mutex_init(&new->reconfig_mutex); 363 INIT_LIST_HEAD(&new->disks); 364 INIT_LIST_HEAD(&new->all_mddevs); 365 init_timer(&new->safemode_timer); 366 atomic_set(&new->active, 1); 367 atomic_set(&new->openers, 0); 368 atomic_set(&new->active_io, 0); 369 spin_lock_init(&new->write_lock); 370 init_waitqueue_head(&new->sb_wait); 371 init_waitqueue_head(&new->recovery_wait); 372 new->reshape_position = MaxSector; 373 new->resync_min = 0; 374 new->resync_max = MaxSector; 375 new->level = LEVEL_NONE; 376 377 goto retry; 378 } 379 380 static inline int mddev_lock(mddev_t * mddev) 381 { 382 return mutex_lock_interruptible(&mddev->reconfig_mutex); 383 } 384 385 static inline int mddev_is_locked(mddev_t *mddev) 386 { 387 return mutex_is_locked(&mddev->reconfig_mutex); 388 } 389 390 static inline int mddev_trylock(mddev_t * mddev) 391 { 392 return mutex_trylock(&mddev->reconfig_mutex); 393 } 394 395 static inline void mddev_unlock(mddev_t * mddev) 396 { 397 mutex_unlock(&mddev->reconfig_mutex); 398 399 md_wakeup_thread(mddev->thread); 400 } 401 402 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 403 { 404 mdk_rdev_t *rdev; 405 406 list_for_each_entry(rdev, &mddev->disks, same_set) 407 if (rdev->desc_nr == nr) 408 return rdev; 409 410 return NULL; 411 } 412 413 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 414 { 415 mdk_rdev_t *rdev; 416 417 list_for_each_entry(rdev, &mddev->disks, same_set) 418 if (rdev->bdev->bd_dev == dev) 419 return rdev; 420 421 return NULL; 422 } 423 424 static struct mdk_personality *find_pers(int level, char *clevel) 425 { 426 struct mdk_personality *pers; 427 list_for_each_entry(pers, &pers_list, list) { 428 if (level != LEVEL_NONE && pers->level == level) 429 return pers; 430 if (strcmp(pers->name, clevel)==0) 431 return pers; 432 } 433 return NULL; 434 } 435 436 /* return the offset of the super block in 512byte sectors */ 437 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 438 { 439 sector_t num_sectors = bdev->bd_inode->i_size / 512; 440 return MD_NEW_SIZE_SECTORS(num_sectors); 441 } 442 443 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) 444 { 445 sector_t num_sectors = rdev->sb_start; 446 447 if (chunk_size) 448 num_sectors &= ~((sector_t)chunk_size/512 - 1); 449 return num_sectors; 450 } 451 452 static int alloc_disk_sb(mdk_rdev_t * rdev) 453 { 454 if (rdev->sb_page) 455 MD_BUG(); 456 457 rdev->sb_page = alloc_page(GFP_KERNEL); 458 if (!rdev->sb_page) { 459 printk(KERN_ALERT "md: out of memory.\n"); 460 return -ENOMEM; 461 } 462 463 return 0; 464 } 465 466 static void free_disk_sb(mdk_rdev_t * rdev) 467 { 468 if (rdev->sb_page) { 469 put_page(rdev->sb_page); 470 rdev->sb_loaded = 0; 471 rdev->sb_page = NULL; 472 rdev->sb_start = 0; 473 rdev->sectors = 0; 474 } 475 } 476 477 478 static void super_written(struct bio *bio, int error) 479 
{ 480 mdk_rdev_t *rdev = bio->bi_private; 481 mddev_t *mddev = rdev->mddev; 482 483 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 484 printk("md: super_written gets error=%d, uptodate=%d\n", 485 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 486 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 487 md_error(mddev, rdev); 488 } 489 490 if (atomic_dec_and_test(&mddev->pending_writes)) 491 wake_up(&mddev->sb_wait); 492 bio_put(bio); 493 } 494 495 static void super_written_barrier(struct bio *bio, int error) 496 { 497 struct bio *bio2 = bio->bi_private; 498 mdk_rdev_t *rdev = bio2->bi_private; 499 mddev_t *mddev = rdev->mddev; 500 501 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 502 error == -EOPNOTSUPP) { 503 unsigned long flags; 504 /* barriers don't appear to be supported :-( */ 505 set_bit(BarriersNotsupp, &rdev->flags); 506 mddev->barriers_work = 0; 507 spin_lock_irqsave(&mddev->write_lock, flags); 508 bio2->bi_next = mddev->biolist; 509 mddev->biolist = bio2; 510 spin_unlock_irqrestore(&mddev->write_lock, flags); 511 wake_up(&mddev->sb_wait); 512 bio_put(bio); 513 } else { 514 bio_put(bio2); 515 bio->bi_private = rdev; 516 super_written(bio, error); 517 } 518 } 519 520 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 521 sector_t sector, int size, struct page *page) 522 { 523 /* write first size bytes of page to sector of rdev 524 * Increment mddev->pending_writes before returning 525 * and decrement it on completion, waking up sb_wait 526 * if zero is reached. 527 * If an error occurred, call md_error 528 * 529 * As we might need to resubmit the request if BIO_RW_BARRIER 530 * causes ENOTSUPP, we allocate a spare bio... 531 */ 532 struct bio *bio = bio_alloc(GFP_NOIO, 1); 533 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 534 535 bio->bi_bdev = rdev->bdev; 536 bio->bi_sector = sector; 537 bio_add_page(bio, page, size, 0); 538 bio->bi_private = rdev; 539 bio->bi_end_io = super_written; 540 bio->bi_rw = rw; 541 542 atomic_inc(&mddev->pending_writes); 543 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 544 struct bio *rbio; 545 rw |= (1<<BIO_RW_BARRIER); 546 rbio = bio_clone(bio, GFP_NOIO); 547 rbio->bi_private = bio; 548 rbio->bi_end_io = super_written_barrier; 549 submit_bio(rw, rbio); 550 } else 551 submit_bio(rw, bio); 552 } 553 554 void md_super_wait(mddev_t *mddev) 555 { 556 /* wait for all superblock writes that were scheduled to complete. 
557 * if any had to be retried (due to BARRIER problems), retry them 558 */ 559 DEFINE_WAIT(wq); 560 for(;;) { 561 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 562 if (atomic_read(&mddev->pending_writes)==0) 563 break; 564 while (mddev->biolist) { 565 struct bio *bio; 566 spin_lock_irq(&mddev->write_lock); 567 bio = mddev->biolist; 568 mddev->biolist = bio->bi_next ; 569 bio->bi_next = NULL; 570 spin_unlock_irq(&mddev->write_lock); 571 submit_bio(bio->bi_rw, bio); 572 } 573 schedule(); 574 } 575 finish_wait(&mddev->sb_wait, &wq); 576 } 577 578 static void bi_complete(struct bio *bio, int error) 579 { 580 complete((struct completion*)bio->bi_private); 581 } 582 583 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 584 struct page *page, int rw) 585 { 586 struct bio *bio = bio_alloc(GFP_NOIO, 1); 587 struct completion event; 588 int ret; 589 590 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 591 592 bio->bi_bdev = bdev; 593 bio->bi_sector = sector; 594 bio_add_page(bio, page, size, 0); 595 init_completion(&event); 596 bio->bi_private = &event; 597 bio->bi_end_io = bi_complete; 598 submit_bio(rw, bio); 599 wait_for_completion(&event); 600 601 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 602 bio_put(bio); 603 return ret; 604 } 605 EXPORT_SYMBOL_GPL(sync_page_io); 606 607 static int read_disk_sb(mdk_rdev_t * rdev, int size) 608 { 609 char b[BDEVNAME_SIZE]; 610 if (!rdev->sb_page) { 611 MD_BUG(); 612 return -EINVAL; 613 } 614 if (rdev->sb_loaded) 615 return 0; 616 617 618 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 619 goto fail; 620 rdev->sb_loaded = 1; 621 return 0; 622 623 fail: 624 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 625 bdevname(rdev->bdev,b)); 626 return -EINVAL; 627 } 628 629 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 630 { 631 return sb1->set_uuid0 == sb2->set_uuid0 && 632 sb1->set_uuid1 == sb2->set_uuid1 && 633 sb1->set_uuid2 == sb2->set_uuid2 && 634 sb1->set_uuid3 == sb2->set_uuid3; 635 } 636 637 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 638 { 639 int ret; 640 mdp_super_t *tmp1, *tmp2; 641 642 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 643 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 644 645 if (!tmp1 || !tmp2) { 646 ret = 0; 647 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 648 goto abort; 649 } 650 651 *tmp1 = *sb1; 652 *tmp2 = *sb2; 653 654 /* 655 * nr_disks is not constant 656 */ 657 tmp1->nr_disks = 0; 658 tmp2->nr_disks = 0; 659 660 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 661 abort: 662 kfree(tmp1); 663 kfree(tmp2); 664 return ret; 665 } 666 667 668 static u32 md_csum_fold(u32 csum) 669 { 670 csum = (csum & 0xffff) + (csum >> 16); 671 return (csum & 0xffff) + (csum >> 16); 672 } 673 674 static unsigned int calc_sb_csum(mdp_super_t * sb) 675 { 676 u64 newcsum = 0; 677 u32 *sb32 = (u32*)sb; 678 int i; 679 unsigned int disk_csum, csum; 680 681 disk_csum = sb->sb_csum; 682 sb->sb_csum = 0; 683 684 for (i = 0; i < MD_SB_BYTES/4 ; i++) 685 newcsum += sb32[i]; 686 csum = (newcsum & 0xffffffff) + (newcsum>>32); 687 688 689 #ifdef CONFIG_ALPHA 690 /* This used to use csum_partial, which was wrong for several 691 * reasons including that different results are returned on 692 * different architectures. It isn't critical that we get exactly 693 * the same return value as before (we always csum_fold before 694 * testing, and that removes any differences). 
However as we 695 * know that csum_partial always returned a 16bit value on 696 * alphas, do a fold to maximise conformity to previous behaviour. 697 */ 698 sb->sb_csum = md_csum_fold(disk_csum); 699 #else 700 sb->sb_csum = disk_csum; 701 #endif 702 return csum; 703 } 704 705 706 /* 707 * Handle superblock details. 708 * We want to be able to handle multiple superblock formats 709 * so we have a common interface to them all, and an array of 710 * different handlers. 711 * We rely on user-space to write the initial superblock, and support 712 * reading and updating of superblocks. 713 * Interface methods are: 714 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 715 * loads and validates a superblock on dev. 716 * if refdev != NULL, compare superblocks on both devices 717 * Return: 718 * 0 - dev has a superblock that is compatible with refdev 719 * 1 - dev has a superblock that is compatible and newer than refdev 720 * so dev should be used as the refdev in future 721 * -EINVAL superblock incompatible or invalid 722 * -othererror e.g. -EIO 723 * 724 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 725 * Verify that dev is acceptable into mddev. 726 * The first time, mddev->raid_disks will be 0, and data from 727 * dev should be merged in. Subsequent calls check that dev 728 * is new enough. Return 0 or -EINVAL 729 * 730 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 731 * Update the superblock for rdev with data in mddev 732 * This does not write to disc. 733 * 734 */ 735 736 struct super_type { 737 char *name; 738 struct module *owner; 739 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 740 int minor_version); 741 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 742 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 743 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 744 sector_t num_sectors); 745 }; 746 747 /* 748 * load_super for 0.90.0 749 */ 750 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 751 { 752 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 753 mdp_super_t *sb; 754 int ret; 755 756 /* 757 * Calculate the position of the superblock (512byte sectors), 758 * it's at the end of the disk. 759 * 760 * It also happens to be a multiple of 4Kb. 
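	 * (calc_dev_sboffset() derives the offset with MD_NEW_SIZE_SECTORS()
	 * from md_p.h which - assuming MD_RESERVED_SECTORS is still 128
	 * sectors - rounds the device size down to a 64KiB boundary and then
	 * steps back one 64KiB block.)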
761 */ 762 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 763 764 ret = read_disk_sb(rdev, MD_SB_BYTES); 765 if (ret) return ret; 766 767 ret = -EINVAL; 768 769 bdevname(rdev->bdev, b); 770 sb = (mdp_super_t*)page_address(rdev->sb_page); 771 772 if (sb->md_magic != MD_SB_MAGIC) { 773 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 774 b); 775 goto abort; 776 } 777 778 if (sb->major_version != 0 || 779 sb->minor_version < 90 || 780 sb->minor_version > 91) { 781 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 782 sb->major_version, sb->minor_version, 783 b); 784 goto abort; 785 } 786 787 if (sb->raid_disks <= 0) 788 goto abort; 789 790 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 791 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 792 b); 793 goto abort; 794 } 795 796 rdev->preferred_minor = sb->md_minor; 797 rdev->data_offset = 0; 798 rdev->sb_size = MD_SB_BYTES; 799 800 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 801 if (sb->level != 1 && sb->level != 4 802 && sb->level != 5 && sb->level != 6 803 && sb->level != 10) { 804 /* FIXME use a better test */ 805 printk(KERN_WARNING 806 "md: bitmaps not supported for this level.\n"); 807 goto abort; 808 } 809 } 810 811 if (sb->level == LEVEL_MULTIPATH) 812 rdev->desc_nr = -1; 813 else 814 rdev->desc_nr = sb->this_disk.number; 815 816 if (!refdev) { 817 ret = 1; 818 } else { 819 __u64 ev1, ev2; 820 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 821 if (!uuid_equal(refsb, sb)) { 822 printk(KERN_WARNING "md: %s has different UUID to %s\n", 823 b, bdevname(refdev->bdev,b2)); 824 goto abort; 825 } 826 if (!sb_equal(refsb, sb)) { 827 printk(KERN_WARNING "md: %s has same UUID" 828 " but different superblock to %s\n", 829 b, bdevname(refdev->bdev, b2)); 830 goto abort; 831 } 832 ev1 = md_event(sb); 833 ev2 = md_event(refsb); 834 if (ev1 > ev2) 835 ret = 1; 836 else 837 ret = 0; 838 } 839 rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); 840 841 if (rdev->sectors < sb->size * 2 && sb->level > 1) 842 /* "this cannot possibly happen" ... 
*/ 843 ret = -EINVAL; 844 845 abort: 846 return ret; 847 } 848 849 /* 850 * validate_super for 0.90.0 851 */ 852 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 853 { 854 mdp_disk_t *desc; 855 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 856 __u64 ev1 = md_event(sb); 857 858 rdev->raid_disk = -1; 859 clear_bit(Faulty, &rdev->flags); 860 clear_bit(In_sync, &rdev->flags); 861 clear_bit(WriteMostly, &rdev->flags); 862 clear_bit(BarriersNotsupp, &rdev->flags); 863 864 if (mddev->raid_disks == 0) { 865 mddev->major_version = 0; 866 mddev->minor_version = sb->minor_version; 867 mddev->patch_version = sb->patch_version; 868 mddev->external = 0; 869 mddev->chunk_size = sb->chunk_size; 870 mddev->ctime = sb->ctime; 871 mddev->utime = sb->utime; 872 mddev->level = sb->level; 873 mddev->clevel[0] = 0; 874 mddev->layout = sb->layout; 875 mddev->raid_disks = sb->raid_disks; 876 mddev->dev_sectors = sb->size * 2; 877 mddev->events = ev1; 878 mddev->bitmap_offset = 0; 879 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 880 881 if (mddev->minor_version >= 91) { 882 mddev->reshape_position = sb->reshape_position; 883 mddev->delta_disks = sb->delta_disks; 884 mddev->new_level = sb->new_level; 885 mddev->new_layout = sb->new_layout; 886 mddev->new_chunk = sb->new_chunk; 887 } else { 888 mddev->reshape_position = MaxSector; 889 mddev->delta_disks = 0; 890 mddev->new_level = mddev->level; 891 mddev->new_layout = mddev->layout; 892 mddev->new_chunk = mddev->chunk_size; 893 } 894 895 if (sb->state & (1<<MD_SB_CLEAN)) 896 mddev->recovery_cp = MaxSector; 897 else { 898 if (sb->events_hi == sb->cp_events_hi && 899 sb->events_lo == sb->cp_events_lo) { 900 mddev->recovery_cp = sb->recovery_cp; 901 } else 902 mddev->recovery_cp = 0; 903 } 904 905 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 906 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 907 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 908 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 909 910 mddev->max_disks = MD_SB_DISKS; 911 912 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 913 mddev->bitmap_file == NULL) 914 mddev->bitmap_offset = mddev->default_bitmap_offset; 915 916 } else if (mddev->pers == NULL) { 917 /* Insist on good event counter while assembling */ 918 ++ev1; 919 if (ev1 < mddev->events) 920 return -EINVAL; 921 } else if (mddev->bitmap) { 922 /* if adding to array with a bitmap, then we can accept an 923 * older device ... but not too old. 924 */ 925 if (ev1 < mddev->bitmap->events_cleared) 926 return 0; 927 } else { 928 if (ev1 < mddev->events) 929 /* just a hot-add of a new device, leave raid_disk at -1 */ 930 return 0; 931 } 932 933 if (mddev->level != LEVEL_MULTIPATH) { 934 desc = sb->disks + rdev->desc_nr; 935 936 if (desc->state & (1<<MD_DISK_FAULTY)) 937 set_bit(Faulty, &rdev->flags); 938 else if (desc->state & (1<<MD_DISK_SYNC) /* && 939 desc->raid_disk < mddev->raid_disks */) { 940 set_bit(In_sync, &rdev->flags); 941 rdev->raid_disk = desc->raid_disk; 942 } 943 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 944 set_bit(WriteMostly, &rdev->flags); 945 } else /* MULTIPATH are always insync */ 946 set_bit(In_sync, &rdev->flags); 947 return 0; 948 } 949 950 /* 951 * sync_super for 0.90.0 952 */ 953 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 954 { 955 mdp_super_t *sb; 956 mdk_rdev_t *rdev2; 957 int next_spare = mddev->raid_disks; 958 959 960 /* make rdev->sb match mddev data.. 
961 * 962 * 1/ zero out disks 963 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 964 * 3/ any empty disks < next_spare become removed 965 * 966 * disks[0] gets initialised to REMOVED because 967 * we cannot be sure from other fields if it has 968 * been initialised or not. 969 */ 970 int i; 971 int active=0, working=0,failed=0,spare=0,nr_disks=0; 972 973 rdev->sb_size = MD_SB_BYTES; 974 975 sb = (mdp_super_t*)page_address(rdev->sb_page); 976 977 memset(sb, 0, sizeof(*sb)); 978 979 sb->md_magic = MD_SB_MAGIC; 980 sb->major_version = mddev->major_version; 981 sb->patch_version = mddev->patch_version; 982 sb->gvalid_words = 0; /* ignored */ 983 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 984 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 985 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 986 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 987 988 sb->ctime = mddev->ctime; 989 sb->level = mddev->level; 990 sb->size = mddev->dev_sectors / 2; 991 sb->raid_disks = mddev->raid_disks; 992 sb->md_minor = mddev->md_minor; 993 sb->not_persistent = 0; 994 sb->utime = mddev->utime; 995 sb->state = 0; 996 sb->events_hi = (mddev->events>>32); 997 sb->events_lo = (u32)mddev->events; 998 999 if (mddev->reshape_position == MaxSector) 1000 sb->minor_version = 90; 1001 else { 1002 sb->minor_version = 91; 1003 sb->reshape_position = mddev->reshape_position; 1004 sb->new_level = mddev->new_level; 1005 sb->delta_disks = mddev->delta_disks; 1006 sb->new_layout = mddev->new_layout; 1007 sb->new_chunk = mddev->new_chunk; 1008 } 1009 mddev->minor_version = sb->minor_version; 1010 if (mddev->in_sync) 1011 { 1012 sb->recovery_cp = mddev->recovery_cp; 1013 sb->cp_events_hi = (mddev->events>>32); 1014 sb->cp_events_lo = (u32)mddev->events; 1015 if (mddev->recovery_cp == MaxSector) 1016 sb->state = (1<< MD_SB_CLEAN); 1017 } else 1018 sb->recovery_cp = 0; 1019 1020 sb->layout = mddev->layout; 1021 sb->chunk_size = mddev->chunk_size; 1022 1023 if (mddev->bitmap && mddev->bitmap_file == NULL) 1024 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1025 1026 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1027 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1028 mdp_disk_t *d; 1029 int desc_nr; 1030 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1031 && !test_bit(Faulty, &rdev2->flags)) 1032 desc_nr = rdev2->raid_disk; 1033 else 1034 desc_nr = next_spare++; 1035 rdev2->desc_nr = desc_nr; 1036 d = &sb->disks[rdev2->desc_nr]; 1037 nr_disks++; 1038 d->number = rdev2->desc_nr; 1039 d->major = MAJOR(rdev2->bdev->bd_dev); 1040 d->minor = MINOR(rdev2->bdev->bd_dev); 1041 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1042 && !test_bit(Faulty, &rdev2->flags)) 1043 d->raid_disk = rdev2->raid_disk; 1044 else 1045 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1046 if (test_bit(Faulty, &rdev2->flags)) 1047 d->state = (1<<MD_DISK_FAULTY); 1048 else if (test_bit(In_sync, &rdev2->flags)) { 1049 d->state = (1<<MD_DISK_ACTIVE); 1050 d->state |= (1<<MD_DISK_SYNC); 1051 active++; 1052 working++; 1053 } else { 1054 d->state = 0; 1055 spare++; 1056 working++; 1057 } 1058 if (test_bit(WriteMostly, &rdev2->flags)) 1059 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1060 } 1061 /* now set the "removed" and "faulty" bits on any missing devices */ 1062 for (i=0 ; i < mddev->raid_disks ; i++) { 1063 mdp_disk_t *d = &sb->disks[i]; 1064 if (d->state == 0 && d->number == 0) { 1065 d->number = i; 1066 d->raid_disk = i; 1067 d->state = (1<<MD_DISK_REMOVED); 1068 d->state |= (1<<MD_DISK_FAULTY); 1069 failed++; 1070 } 1071 } 1072 
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors / 2; /* kB for sysfs */
}


/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
1144 */ 1145 switch(minor_version) { 1146 case 0: 1147 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1148 sb_start -= 8*2; 1149 sb_start &= ~(sector_t)(4*2-1); 1150 break; 1151 case 1: 1152 sb_start = 0; 1153 break; 1154 case 2: 1155 sb_start = 8; 1156 break; 1157 default: 1158 return -EINVAL; 1159 } 1160 rdev->sb_start = sb_start; 1161 1162 /* superblock is rarely larger than 1K, but it can be larger, 1163 * and it is safe to read 4k, so we do that 1164 */ 1165 ret = read_disk_sb(rdev, 4096); 1166 if (ret) return ret; 1167 1168 1169 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1170 1171 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1172 sb->major_version != cpu_to_le32(1) || 1173 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1174 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1175 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1176 return -EINVAL; 1177 1178 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1179 printk("md: invalid superblock checksum on %s\n", 1180 bdevname(rdev->bdev,b)); 1181 return -EINVAL; 1182 } 1183 if (le64_to_cpu(sb->data_size) < 10) { 1184 printk("md: data_size too small on %s\n", 1185 bdevname(rdev->bdev,b)); 1186 return -EINVAL; 1187 } 1188 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1189 if (sb->level != cpu_to_le32(1) && 1190 sb->level != cpu_to_le32(4) && 1191 sb->level != cpu_to_le32(5) && 1192 sb->level != cpu_to_le32(6) && 1193 sb->level != cpu_to_le32(10)) { 1194 printk(KERN_WARNING 1195 "md: bitmaps not supported for this level.\n"); 1196 return -EINVAL; 1197 } 1198 } 1199 1200 rdev->preferred_minor = 0xffff; 1201 rdev->data_offset = le64_to_cpu(sb->data_offset); 1202 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1203 1204 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1205 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1206 if (rdev->sb_size & bmask) 1207 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1208 1209 if (minor_version 1210 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1211 return -EINVAL; 1212 1213 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1214 rdev->desc_nr = -1; 1215 else 1216 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1217 1218 if (!refdev) { 1219 ret = 1; 1220 } else { 1221 __u64 ev1, ev2; 1222 struct mdp_superblock_1 *refsb = 1223 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1224 1225 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1226 sb->level != refsb->level || 1227 sb->layout != refsb->layout || 1228 sb->chunksize != refsb->chunksize) { 1229 printk(KERN_WARNING "md: %s has strangely different" 1230 " superblock to %s\n", 1231 bdevname(rdev->bdev,b), 1232 bdevname(refdev->bdev,b2)); 1233 return -EINVAL; 1234 } 1235 ev1 = le64_to_cpu(sb->events); 1236 ev2 = le64_to_cpu(refsb->events); 1237 1238 if (ev1 > ev2) 1239 ret = 1; 1240 else 1241 ret = 0; 1242 } 1243 if (minor_version) 1244 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1245 le64_to_cpu(sb->data_offset); 1246 else 1247 rdev->sectors = rdev->sb_start; 1248 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1249 return -EINVAL; 1250 rdev->sectors = le64_to_cpu(sb->data_size); 1251 if (le32_to_cpu(sb->chunksize)) 1252 rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); 1253 1254 if (le64_to_cpu(sb->size) > rdev->sectors) 1255 return -EINVAL; 1256 return ret; 1257 } 1258 1259 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1260 { 1261 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1262 __u64 ev1 = 
le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);
	clear_bit(BarriersNotsupp, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = 1024 >> 9;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL )
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk = mddev->chunk_size;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			break;
		case 0xfffe: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		default:
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET))
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
			else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data.
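	 * (dev_roles[] below uses 0xfffe for a faulty device and 0xffff for a
	 * spare; any other value is the device's slot in the array, mirroring
	 * the role decoding in super_1_validate() above.)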
*/ 1357 1358 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1359 1360 sb->feature_map = 0; 1361 sb->pad0 = 0; 1362 sb->recovery_offset = cpu_to_le64(0); 1363 memset(sb->pad1, 0, sizeof(sb->pad1)); 1364 memset(sb->pad2, 0, sizeof(sb->pad2)); 1365 memset(sb->pad3, 0, sizeof(sb->pad3)); 1366 1367 sb->utime = cpu_to_le64((__u64)mddev->utime); 1368 sb->events = cpu_to_le64(mddev->events); 1369 if (mddev->in_sync) 1370 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1371 else 1372 sb->resync_offset = cpu_to_le64(0); 1373 1374 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1375 1376 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1377 sb->size = cpu_to_le64(mddev->dev_sectors); 1378 sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); 1379 sb->level = cpu_to_le32(mddev->level); 1380 sb->layout = cpu_to_le32(mddev->layout); 1381 1382 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1383 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1384 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1385 } 1386 1387 if (rdev->raid_disk >= 0 && 1388 !test_bit(In_sync, &rdev->flags)) { 1389 if (mddev->curr_resync_completed > rdev->recovery_offset) 1390 rdev->recovery_offset = mddev->curr_resync_completed; 1391 if (rdev->recovery_offset > 0) { 1392 sb->feature_map |= 1393 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1394 sb->recovery_offset = 1395 cpu_to_le64(rdev->recovery_offset); 1396 } 1397 } 1398 1399 if (mddev->reshape_position != MaxSector) { 1400 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1401 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1402 sb->new_layout = cpu_to_le32(mddev->new_layout); 1403 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1404 sb->new_level = cpu_to_le32(mddev->new_level); 1405 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1406 } 1407 1408 max_dev = 0; 1409 list_for_each_entry(rdev2, &mddev->disks, same_set) 1410 if (rdev2->desc_nr+1 > max_dev) 1411 max_dev = rdev2->desc_nr+1; 1412 1413 if (max_dev > le32_to_cpu(sb->max_dev)) 1414 sb->max_dev = cpu_to_le32(max_dev); 1415 for (i=0; i<max_dev;i++) 1416 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1417 1418 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1419 i = rdev2->desc_nr; 1420 if (test_bit(Faulty, &rdev2->flags)) 1421 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1422 else if (test_bit(In_sync, &rdev2->flags)) 1423 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1424 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1425 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1426 else 1427 sb->dev_roles[i] = cpu_to_le16(0xffff); 1428 } 1429 1430 sb->sb_csum = calc_sb_1_csum(sb); 1431 } 1432 1433 static unsigned long long 1434 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1435 { 1436 struct mdp_superblock_1 *sb; 1437 sector_t max_sectors; 1438 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1439 return 0; /* component must fit device */ 1440 if (rdev->sb_start < rdev->data_offset) { 1441 /* minor versions 1 and 2; superblock before data */ 1442 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1443 max_sectors -= rdev->data_offset; 1444 if (!num_sectors || num_sectors > max_sectors) 1445 num_sectors = max_sectors; 1446 } else if (rdev->mddev->bitmap_offset) { 1447 /* minor version 0 with bitmap we can't move */ 1448 return 0; 1449 } else { 1450 /* minor version 0; superblock after data */ 1451 sector_t sb_start; 1452 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1453 sb_start &= 
~(sector_t)(4*2 - 1); 1454 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1455 if (!num_sectors || num_sectors > max_sectors) 1456 num_sectors = max_sectors; 1457 rdev->sb_start = sb_start; 1458 } 1459 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1460 sb->data_size = cpu_to_le64(num_sectors); 1461 sb->super_offset = rdev->sb_start; 1462 sb->sb_csum = calc_sb_1_csum(sb); 1463 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1464 rdev->sb_page); 1465 md_super_wait(rdev->mddev); 1466 return num_sectors / 2; /* kB for sysfs */ 1467 } 1468 1469 static struct super_type super_types[] = { 1470 [0] = { 1471 .name = "0.90.0", 1472 .owner = THIS_MODULE, 1473 .load_super = super_90_load, 1474 .validate_super = super_90_validate, 1475 .sync_super = super_90_sync, 1476 .rdev_size_change = super_90_rdev_size_change, 1477 }, 1478 [1] = { 1479 .name = "md-1", 1480 .owner = THIS_MODULE, 1481 .load_super = super_1_load, 1482 .validate_super = super_1_validate, 1483 .sync_super = super_1_sync, 1484 .rdev_size_change = super_1_rdev_size_change, 1485 }, 1486 }; 1487 1488 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1489 { 1490 mdk_rdev_t *rdev, *rdev2; 1491 1492 rcu_read_lock(); 1493 rdev_for_each_rcu(rdev, mddev1) 1494 rdev_for_each_rcu(rdev2, mddev2) 1495 if (rdev->bdev->bd_contains == 1496 rdev2->bdev->bd_contains) { 1497 rcu_read_unlock(); 1498 return 1; 1499 } 1500 rcu_read_unlock(); 1501 return 0; 1502 } 1503 1504 static LIST_HEAD(pending_raid_disks); 1505 1506 static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) 1507 { 1508 struct mdk_personality *pers = mddev->pers; 1509 struct gendisk *disk = mddev->gendisk; 1510 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1511 struct blk_integrity *bi_mddev = blk_get_integrity(disk); 1512 1513 /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ 1514 if (pers && pers->level >= 4 && pers->level <= 6) 1515 return; 1516 1517 /* If rdev is integrity capable, register profile for mddev */ 1518 if (!bi_mddev && bi_rdev) { 1519 if (blk_integrity_register(disk, bi_rdev)) 1520 printk(KERN_ERR "%s: %s Could not register integrity!\n", 1521 __func__, disk->disk_name); 1522 else 1523 printk(KERN_NOTICE "Enabling data integrity on %s\n", 1524 disk->disk_name); 1525 return; 1526 } 1527 1528 /* Check that mddev and rdev have matching profiles */ 1529 if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { 1530 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, 1531 disk->disk_name, rdev->bdev->bd_disk->disk_name); 1532 printk(KERN_NOTICE "Disabling data integrity on %s\n", 1533 disk->disk_name); 1534 blk_integrity_unregister(disk); 1535 } 1536 } 1537 1538 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1539 { 1540 char b[BDEVNAME_SIZE]; 1541 struct kobject *ko; 1542 char *s; 1543 int err; 1544 1545 if (rdev->mddev) { 1546 MD_BUG(); 1547 return -EINVAL; 1548 } 1549 1550 /* prevent duplicates */ 1551 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1552 return -EEXIST; 1553 1554 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 1555 if (rdev->sectors && (mddev->dev_sectors == 0 || 1556 rdev->sectors < mddev->dev_sectors)) { 1557 if (mddev->pers) { 1558 /* Cannot change size, so fail 1559 * If mddev->level <= 0, then we don't care 1560 * about aligning sizes (e.g. linear) 1561 */ 1562 if (mddev->level > 0) 1563 return -ENOSPC; 1564 } else 1565 mddev->dev_sectors = rdev->sectors; 1566 } 1567 1568 /* Verify rdev->desc_nr is unique. 
1569 * If it is -1, assign a free number, else 1570 * check number is not in use 1571 */ 1572 if (rdev->desc_nr < 0) { 1573 int choice = 0; 1574 if (mddev->pers) choice = mddev->raid_disks; 1575 while (find_rdev_nr(mddev, choice)) 1576 choice++; 1577 rdev->desc_nr = choice; 1578 } else { 1579 if (find_rdev_nr(mddev, rdev->desc_nr)) 1580 return -EBUSY; 1581 } 1582 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 1583 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 1584 mdname(mddev), mddev->max_disks); 1585 return -EBUSY; 1586 } 1587 bdevname(rdev->bdev,b); 1588 while ( (s=strchr(b, '/')) != NULL) 1589 *s = '!'; 1590 1591 rdev->mddev = mddev; 1592 printk(KERN_INFO "md: bind<%s>\n", b); 1593 1594 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1595 goto fail; 1596 1597 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1598 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1599 kobject_del(&rdev->kobj); 1600 goto fail; 1601 } 1602 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1603 1604 list_add_rcu(&rdev->same_set, &mddev->disks); 1605 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1606 1607 /* May as well allow recovery to be retried once */ 1608 mddev->recovery_disabled = 0; 1609 1610 md_integrity_check(rdev, mddev); 1611 return 0; 1612 1613 fail: 1614 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1615 b, mdname(mddev)); 1616 return err; 1617 } 1618 1619 static void md_delayed_delete(struct work_struct *ws) 1620 { 1621 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1622 kobject_del(&rdev->kobj); 1623 kobject_put(&rdev->kobj); 1624 } 1625 1626 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1627 { 1628 char b[BDEVNAME_SIZE]; 1629 if (!rdev->mddev) { 1630 MD_BUG(); 1631 return; 1632 } 1633 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1634 list_del_rcu(&rdev->same_set); 1635 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1636 rdev->mddev = NULL; 1637 sysfs_remove_link(&rdev->kobj, "block"); 1638 sysfs_put(rdev->sysfs_state); 1639 rdev->sysfs_state = NULL; 1640 /* We need to delay this, otherwise we can deadlock when 1641 * writing to 'remove' to "dev/state". We also need 1642 * to delay it due to rcu usage. 1643 */ 1644 synchronize_rcu(); 1645 INIT_WORK(&rdev->del_work, md_delayed_delete); 1646 kobject_get(&rdev->kobj); 1647 schedule_work(&rdev->del_work); 1648 } 1649 1650 /* 1651 * prevent the device from being mounted, repartitioned or 1652 * otherwise reused by a RAID array (or any other kernel 1653 * subsystem), by bd_claiming the device. 1654 */ 1655 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1656 { 1657 int err = 0; 1658 struct block_device *bdev; 1659 char b[BDEVNAME_SIZE]; 1660 1661 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1662 if (IS_ERR(bdev)) { 1663 printk(KERN_ERR "md: could not open %s.\n", 1664 __bdevname(dev, b)); 1665 return PTR_ERR(bdev); 1666 } 1667 err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); 1668 if (err) { 1669 printk(KERN_ERR "md: could not bd_claim %s.\n", 1670 bdevname(bdev, b)); 1671 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1672 return err; 1673 } 1674 if (!shared) 1675 set_bit(AllReserved, &rdev->flags); 1676 rdev->bdev = bdev; 1677 return err; 1678 } 1679 1680 static void unlock_rdev(mdk_rdev_t *rdev) 1681 { 1682 struct block_device *bdev = rdev->bdev; 1683 rdev->bdev = NULL; 1684 if (!bdev) 1685 MD_BUG(); 1686 bd_release(bdev); 1687 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1688 } 1689 1690 void md_autodetect_dev(dev_t dev); 1691 1692 static void export_rdev(mdk_rdev_t * rdev) 1693 { 1694 char b[BDEVNAME_SIZE]; 1695 printk(KERN_INFO "md: export_rdev(%s)\n", 1696 bdevname(rdev->bdev,b)); 1697 if (rdev->mddev) 1698 MD_BUG(); 1699 free_disk_sb(rdev); 1700 #ifndef MODULE 1701 if (test_bit(AutoDetected, &rdev->flags)) 1702 md_autodetect_dev(rdev->bdev->bd_dev); 1703 #endif 1704 unlock_rdev(rdev); 1705 kobject_put(&rdev->kobj); 1706 } 1707 1708 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1709 { 1710 unbind_rdev_from_array(rdev); 1711 export_rdev(rdev); 1712 } 1713 1714 static void export_array(mddev_t *mddev) 1715 { 1716 mdk_rdev_t *rdev, *tmp; 1717 1718 rdev_for_each(rdev, tmp, mddev) { 1719 if (!rdev->mddev) { 1720 MD_BUG(); 1721 continue; 1722 } 1723 kick_rdev_from_array(rdev); 1724 } 1725 if (!list_empty(&mddev->disks)) 1726 MD_BUG(); 1727 mddev->raid_disks = 0; 1728 mddev->major_version = 0; 1729 } 1730 1731 static void print_desc(mdp_disk_t *desc) 1732 { 1733 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1734 desc->major,desc->minor,desc->raid_disk,desc->state); 1735 } 1736 1737 static void print_sb_90(mdp_super_t *sb) 1738 { 1739 int i; 1740 1741 printk(KERN_INFO 1742 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1743 sb->major_version, sb->minor_version, sb->patch_version, 1744 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1745 sb->ctime); 1746 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1747 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1748 sb->md_minor, sb->layout, sb->chunk_size); 1749 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1750 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1751 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1752 sb->failed_disks, sb->spare_disks, 1753 sb->sb_csum, (unsigned long)sb->events_lo); 1754 1755 printk(KERN_INFO); 1756 for (i = 0; i < MD_SB_DISKS; i++) { 1757 mdp_disk_t *desc; 1758 1759 desc = sb->disks + i; 1760 if (desc->number || desc->major || desc->minor || 1761 desc->raid_disk || (desc->state && (desc->state != 4))) { 1762 printk(" D %2d: ", i); 1763 print_desc(desc); 1764 } 1765 } 1766 printk(KERN_INFO "md: THIS: "); 1767 print_desc(&sb->this_disk); 1768 } 1769 1770 static void print_sb_1(struct mdp_superblock_1 *sb) 1771 { 1772 __u8 *uuid; 1773 1774 uuid = sb->set_uuid; 1775 printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1776 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1777 KERN_INFO "md: Name: \"%s\" CT:%llu\n", 1778 le32_to_cpu(sb->major_version), 1779 le32_to_cpu(sb->feature_map), 1780 uuid[0], uuid[1], uuid[2], uuid[3], 1781 uuid[4], uuid[5], uuid[6], uuid[7], 1782 uuid[8], uuid[9], uuid[10], uuid[11], 1783 uuid[12], uuid[13], uuid[14], uuid[15], 1784 sb->set_name, 1785 (unsigned long long)le64_to_cpu(sb->ctime) 1786 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1787 1788 uuid = sb->device_uuid; 1789 printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1790 " 
RO:%llu\n" 1791 KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1792 ":%02x%02x%02x%02x%02x%02x\n" 1793 KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1794 KERN_INFO "md: (MaxDev:%u) \n", 1795 le32_to_cpu(sb->level), 1796 (unsigned long long)le64_to_cpu(sb->size), 1797 le32_to_cpu(sb->raid_disks), 1798 le32_to_cpu(sb->layout), 1799 le32_to_cpu(sb->chunksize), 1800 (unsigned long long)le64_to_cpu(sb->data_offset), 1801 (unsigned long long)le64_to_cpu(sb->data_size), 1802 (unsigned long long)le64_to_cpu(sb->super_offset), 1803 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1804 le32_to_cpu(sb->dev_number), 1805 uuid[0], uuid[1], uuid[2], uuid[3], 1806 uuid[4], uuid[5], uuid[6], uuid[7], 1807 uuid[8], uuid[9], uuid[10], uuid[11], 1808 uuid[12], uuid[13], uuid[14], uuid[15], 1809 sb->devflags, 1810 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1811 (unsigned long long)le64_to_cpu(sb->events), 1812 (unsigned long long)le64_to_cpu(sb->resync_offset), 1813 le32_to_cpu(sb->sb_csum), 1814 le32_to_cpu(sb->max_dev) 1815 ); 1816 } 1817 1818 static void print_rdev(mdk_rdev_t *rdev, int major_version) 1819 { 1820 char b[BDEVNAME_SIZE]; 1821 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 1822 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 1823 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1824 rdev->desc_nr); 1825 if (rdev->sb_loaded) { 1826 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 1827 switch (major_version) { 1828 case 0: 1829 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 1830 break; 1831 case 1: 1832 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 1833 break; 1834 } 1835 } else 1836 printk(KERN_INFO "md: no rdev superblock!\n"); 1837 } 1838 1839 static void md_print_devices(void) 1840 { 1841 struct list_head *tmp; 1842 mdk_rdev_t *rdev; 1843 mddev_t *mddev; 1844 char b[BDEVNAME_SIZE]; 1845 1846 printk("\n"); 1847 printk("md: **********************************\n"); 1848 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1849 printk("md: **********************************\n"); 1850 for_each_mddev(mddev, tmp) { 1851 1852 if (mddev->bitmap) 1853 bitmap_print_sb(mddev->bitmap); 1854 else 1855 printk("%s: ", mdname(mddev)); 1856 list_for_each_entry(rdev, &mddev->disks, same_set) 1857 printk("<%s>", bdevname(rdev->bdev,b)); 1858 printk("\n"); 1859 1860 list_for_each_entry(rdev, &mddev->disks, same_set) 1861 print_rdev(rdev, mddev->major_version); 1862 } 1863 printk("md: **********************************\n"); 1864 printk("\n"); 1865 } 1866 1867 1868 static void sync_sbs(mddev_t * mddev, int nospares) 1869 { 1870 /* Update each superblock (in-memory image), but 1871 * if we are allowed to, skip spares which already 1872 * have the right event counter, or have one earlier 1873 * (which would mean they aren't being marked as dirty 1874 * with the rest of the array) 1875 */ 1876 mdk_rdev_t *rdev; 1877 1878 list_for_each_entry(rdev, &mddev->disks, same_set) { 1879 if (rdev->sb_events == mddev->events || 1880 (nospares && 1881 rdev->raid_disk < 0 && 1882 (rdev->sb_events&1)==0 && 1883 rdev->sb_events+1 == mddev->events)) { 1884 /* Don't update this superblock */ 1885 rdev->sb_loaded = 2; 1886 } else { 1887 super_types[mddev->major_version]. 
1888 sync_super(mddev, rdev); 1889 rdev->sb_loaded = 1; 1890 } 1891 } 1892 } 1893 1894 static void md_update_sb(mddev_t * mddev, int force_change) 1895 { 1896 mdk_rdev_t *rdev; 1897 int sync_req; 1898 int nospares = 0; 1899 1900 if (mddev->external) 1901 return; 1902 repeat: 1903 spin_lock_irq(&mddev->write_lock); 1904 1905 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1906 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1907 force_change = 1; 1908 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1909 /* just a clean<-> dirty transition, possibly leave spares alone, 1910 * though if events isn't the right even/odd, we will have to do 1911 * spares after all 1912 */ 1913 nospares = 1; 1914 if (force_change) 1915 nospares = 0; 1916 if (mddev->degraded) 1917 /* If the array is degraded, then skipping spares is both 1918 * dangerous and fairly pointless. 1919 * Dangerous because a device that was removed from the array 1920 * might have a event_count that still looks up-to-date, 1921 * so it can be re-added without a resync. 1922 * Pointless because if there are any spares to skip, 1923 * then a recovery will happen and soon that array won't 1924 * be degraded any more and the spare can go back to sleep then. 1925 */ 1926 nospares = 0; 1927 1928 sync_req = mddev->in_sync; 1929 mddev->utime = get_seconds(); 1930 1931 /* If this is just a dirty<->clean transition, and the array is clean 1932 * and 'events' is odd, we can roll back to the previous clean state */ 1933 if (nospares 1934 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1935 && (mddev->events & 1) 1936 && mddev->events != 1) 1937 mddev->events--; 1938 else { 1939 /* otherwise we have to go forward and ... */ 1940 mddev->events ++; 1941 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1942 /* .. if the array isn't clean, insist on an odd 'events' */ 1943 if ((mddev->events&1)==0) { 1944 mddev->events++; 1945 nospares = 0; 1946 } 1947 } else { 1948 /* otherwise insist on an even 'events' (for clean states) */ 1949 if ((mddev->events&1)) { 1950 mddev->events++; 1951 nospares = 0; 1952 } 1953 } 1954 } 1955 1956 if (!mddev->events) { 1957 /* 1958 * oops, this 64-bit counter should never wrap. 
1959 * Either we are in around ~1 trillion A.C., assuming 1960 * 1 reboot per second, or we have a bug: 1961 */ 1962 MD_BUG(); 1963 mddev->events --; 1964 } 1965 1966 /* 1967 * do not write anything to disk if using 1968 * nonpersistent superblocks 1969 */ 1970 if (!mddev->persistent) { 1971 if (!mddev->external) 1972 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1973 1974 spin_unlock_irq(&mddev->write_lock); 1975 wake_up(&mddev->sb_wait); 1976 return; 1977 } 1978 sync_sbs(mddev, nospares); 1979 spin_unlock_irq(&mddev->write_lock); 1980 1981 dprintk(KERN_INFO 1982 "md: updating %s RAID superblock on device (in sync %d)\n", 1983 mdname(mddev),mddev->in_sync); 1984 1985 bitmap_update_sb(mddev->bitmap); 1986 list_for_each_entry(rdev, &mddev->disks, same_set) { 1987 char b[BDEVNAME_SIZE]; 1988 dprintk(KERN_INFO "md: "); 1989 if (rdev->sb_loaded != 1) 1990 continue; /* no noise on spare devices */ 1991 if (test_bit(Faulty, &rdev->flags)) 1992 dprintk("(skipping faulty "); 1993 1994 dprintk("%s ", bdevname(rdev->bdev,b)); 1995 if (!test_bit(Faulty, &rdev->flags)) { 1996 md_super_write(mddev,rdev, 1997 rdev->sb_start, rdev->sb_size, 1998 rdev->sb_page); 1999 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 2000 bdevname(rdev->bdev,b), 2001 (unsigned long long)rdev->sb_start); 2002 rdev->sb_events = mddev->events; 2003 2004 } else 2005 dprintk(")\n"); 2006 if (mddev->level == LEVEL_MULTIPATH) 2007 /* only need to write one superblock... */ 2008 break; 2009 } 2010 md_super_wait(mddev); 2011 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2012 2013 spin_lock_irq(&mddev->write_lock); 2014 if (mddev->in_sync != sync_req || 2015 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2016 /* have to write it out again */ 2017 spin_unlock_irq(&mddev->write_lock); 2018 goto repeat; 2019 } 2020 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2021 spin_unlock_irq(&mddev->write_lock); 2022 wake_up(&mddev->sb_wait); 2023 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2024 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2025 2026 } 2027 2028 /* words written to sysfs files may, or may not, be \n terminated. 2029 * We want to accept with case. For this we use cmd_match. 2030 */ 2031 static int cmd_match(const char *cmd, const char *str) 2032 { 2033 /* See if cmd, written into a sysfs file, matches 2034 * str. 
They must either be the same, or cmd can 2035 * have a trailing newline 2036 */ 2037 while (*cmd && *str && *cmd == *str) { 2038 cmd++; 2039 str++; 2040 } 2041 if (*cmd == '\n') 2042 cmd++; 2043 if (*str || *cmd) 2044 return 0; 2045 return 1; 2046 } 2047 2048 struct rdev_sysfs_entry { 2049 struct attribute attr; 2050 ssize_t (*show)(mdk_rdev_t *, char *); 2051 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 2052 }; 2053 2054 static ssize_t 2055 state_show(mdk_rdev_t *rdev, char *page) 2056 { 2057 char *sep = ""; 2058 size_t len = 0; 2059 2060 if (test_bit(Faulty, &rdev->flags)) { 2061 len+= sprintf(page+len, "%sfaulty",sep); 2062 sep = ","; 2063 } 2064 if (test_bit(In_sync, &rdev->flags)) { 2065 len += sprintf(page+len, "%sin_sync",sep); 2066 sep = ","; 2067 } 2068 if (test_bit(WriteMostly, &rdev->flags)) { 2069 len += sprintf(page+len, "%swrite_mostly",sep); 2070 sep = ","; 2071 } 2072 if (test_bit(Blocked, &rdev->flags)) { 2073 len += sprintf(page+len, "%sblocked", sep); 2074 sep = ","; 2075 } 2076 if (!test_bit(Faulty, &rdev->flags) && 2077 !test_bit(In_sync, &rdev->flags)) { 2078 len += sprintf(page+len, "%sspare", sep); 2079 sep = ","; 2080 } 2081 return len+sprintf(page+len, "\n"); 2082 } 2083 2084 static ssize_t 2085 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2086 { 2087 /* can write 2088 * faulty - simulates and error 2089 * remove - disconnects the device 2090 * writemostly - sets write_mostly 2091 * -writemostly - clears write_mostly 2092 * blocked - sets the Blocked flag 2093 * -blocked - clears the Blocked flag 2094 * insync - sets Insync providing device isn't active 2095 */ 2096 int err = -EINVAL; 2097 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2098 md_error(rdev->mddev, rdev); 2099 err = 0; 2100 } else if (cmd_match(buf, "remove")) { 2101 if (rdev->raid_disk >= 0) 2102 err = -EBUSY; 2103 else { 2104 mddev_t *mddev = rdev->mddev; 2105 kick_rdev_from_array(rdev); 2106 if (mddev->pers) 2107 md_update_sb(mddev, 1); 2108 md_new_event(mddev); 2109 err = 0; 2110 } 2111 } else if (cmd_match(buf, "writemostly")) { 2112 set_bit(WriteMostly, &rdev->flags); 2113 err = 0; 2114 } else if (cmd_match(buf, "-writemostly")) { 2115 clear_bit(WriteMostly, &rdev->flags); 2116 err = 0; 2117 } else if (cmd_match(buf, "blocked")) { 2118 set_bit(Blocked, &rdev->flags); 2119 err = 0; 2120 } else if (cmd_match(buf, "-blocked")) { 2121 clear_bit(Blocked, &rdev->flags); 2122 wake_up(&rdev->blocked_wait); 2123 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2124 md_wakeup_thread(rdev->mddev->thread); 2125 2126 err = 0; 2127 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2128 set_bit(In_sync, &rdev->flags); 2129 err = 0; 2130 } 2131 if (!err && rdev->sysfs_state) 2132 sysfs_notify_dirent(rdev->sysfs_state); 2133 return err ? 
err : len; 2134 } 2135 static struct rdev_sysfs_entry rdev_state = 2136 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2137 2138 static ssize_t 2139 errors_show(mdk_rdev_t *rdev, char *page) 2140 { 2141 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2142 } 2143 2144 static ssize_t 2145 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2146 { 2147 char *e; 2148 unsigned long n = simple_strtoul(buf, &e, 10); 2149 if (*buf && (*e == 0 || *e == '\n')) { 2150 atomic_set(&rdev->corrected_errors, n); 2151 return len; 2152 } 2153 return -EINVAL; 2154 } 2155 static struct rdev_sysfs_entry rdev_errors = 2156 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2157 2158 static ssize_t 2159 slot_show(mdk_rdev_t *rdev, char *page) 2160 { 2161 if (rdev->raid_disk < 0) 2162 return sprintf(page, "none\n"); 2163 else 2164 return sprintf(page, "%d\n", rdev->raid_disk); 2165 } 2166 2167 static ssize_t 2168 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2169 { 2170 char *e; 2171 int err; 2172 char nm[20]; 2173 int slot = simple_strtoul(buf, &e, 10); 2174 if (strncmp(buf, "none", 4)==0) 2175 slot = -1; 2176 else if (e==buf || (*e && *e!= '\n')) 2177 return -EINVAL; 2178 if (rdev->mddev->pers && slot == -1) { 2179 /* Setting 'slot' on an active array requires also 2180 * updating the 'rd%d' link, and communicating 2181 * with the personality with ->hot_*_disk. 2182 * For now we only support removing 2183 * failed/spare devices. This normally happens automatically, 2184 * but not when the metadata is externally managed. 2185 */ 2186 if (rdev->raid_disk == -1) 2187 return -EEXIST; 2188 /* personality does all needed checks */ 2189 if (rdev->mddev->pers->hot_add_disk == NULL) 2190 return -EINVAL; 2191 err = rdev->mddev->pers-> 2192 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2193 if (err) 2194 return err; 2195 sprintf(nm, "rd%d", rdev->raid_disk); 2196 sysfs_remove_link(&rdev->mddev->kobj, nm); 2197 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2198 md_wakeup_thread(rdev->mddev->thread); 2199 } else if (rdev->mddev->pers) { 2200 mdk_rdev_t *rdev2; 2201 /* Activating a spare .. or possibly reactivating 2202 * if we ever get bitmaps working here. 2203 */ 2204 2205 if (rdev->raid_disk != -1) 2206 return -EBUSY; 2207 2208 if (rdev->mddev->pers->hot_add_disk == NULL) 2209 return -EINVAL; 2210 2211 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) 2212 if (rdev2->raid_disk == slot) 2213 return -EEXIST; 2214 2215 rdev->raid_disk = slot; 2216 if (test_bit(In_sync, &rdev->flags)) 2217 rdev->saved_raid_disk = slot; 2218 else 2219 rdev->saved_raid_disk = -1; 2220 err = rdev->mddev->pers-> 2221 hot_add_disk(rdev->mddev, rdev); 2222 if (err) { 2223 rdev->raid_disk = -1; 2224 return err; 2225 } else 2226 sysfs_notify_dirent(rdev->sysfs_state); 2227 sprintf(nm, "rd%d", rdev->raid_disk); 2228 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2229 printk(KERN_WARNING 2230 "md: cannot register " 2231 "%s for %s\n", 2232 nm, mdname(rdev->mddev)); 2233 2234 /* don't wakeup anyone, leave that to userspace. 
*/
2235 } else {
2236 if (slot >= rdev->mddev->raid_disks)
2237 return -ENOSPC;
2238 rdev->raid_disk = slot;
2239 /* assume it is working */
2240 clear_bit(Faulty, &rdev->flags);
2241 clear_bit(WriteMostly, &rdev->flags);
2242 set_bit(In_sync, &rdev->flags);
2243 sysfs_notify_dirent(rdev->sysfs_state);
2244 }
2245 return len;
2246 }
2247
2248
2249 static struct rdev_sysfs_entry rdev_slot =
2250 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2251
2252 static ssize_t
2253 offset_show(mdk_rdev_t *rdev, char *page)
2254 {
2255 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2256 }
2257
2258 static ssize_t
2259 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2260 {
2261 char *e;
2262 unsigned long long offset = simple_strtoull(buf, &e, 10);
2263 if (e==buf || (*e && *e != '\n'))
2264 return -EINVAL;
2265 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2266 return -EBUSY;
2267 if (rdev->sectors && rdev->mddev->external)
2268 /* Must set offset before size, so overlap checks
2269 * can be sane */
2270 return -EBUSY;
2271 rdev->data_offset = offset;
2272 return len;
2273 }
2274
2275 static struct rdev_sysfs_entry rdev_offset =
2276 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2277
2278 static ssize_t
2279 rdev_size_show(mdk_rdev_t *rdev, char *page)
2280 {
2281 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2282 }
2283
2284 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2285 {
2286 /* check if two start/length pairs overlap */
2287 if (s1+l1 <= s2)
2288 return 0;
2289 if (s2+l2 <= s1)
2290 return 0;
2291 return 1;
2292 }
2293
2294 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2295 {
2296 unsigned long long blocks;
2297 sector_t new;
2298
2299 if (strict_strtoull(buf, 10, &blocks) < 0)
2300 return -EINVAL;
2301
2302 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2303 return -EINVAL; /* sector conversion overflow */
2304
2305 new = blocks * 2;
2306 if (new != blocks * 2)
2307 return -EINVAL; /* unsigned long long to sector_t overflow */
2308
2309 *sectors = new;
2310 return 0;
2311 }
2312
2313 static ssize_t
2314 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2315 {
2316 mddev_t *my_mddev = rdev->mddev;
2317 sector_t oldsectors = rdev->sectors;
2318 sector_t sectors;
2319
2320 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2321 return -EINVAL;
2322 if (my_mddev->pers && rdev->raid_disk >= 0) {
2323 if (my_mddev->persistent) {
2324 sectors = super_types[my_mddev->major_version].
2325 rdev_size_change(rdev, sectors);
2326 if (!sectors)
2327 return -EBUSY;
2328 } else if (!sectors)
2329 sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2330 rdev->data_offset;
2331 }
2332 if (sectors < my_mddev->dev_sectors)
2333 return -EINVAL; /* component must fit device */
2334
2335 rdev->sectors = sectors;
2336 if (sectors > oldsectors && my_mddev->external) {
2337 /* need to check that all other rdevs with the same ->bdev
2338 * do not overlap. We need to unlock the mddev to avoid
2339 * a deadlock. We have already changed rdev->sectors, and if
2340 * we have to change it back, we will have the lock again.
2341 */ 2342 mddev_t *mddev; 2343 int overlap = 0; 2344 struct list_head *tmp; 2345 2346 mddev_unlock(my_mddev); 2347 for_each_mddev(mddev, tmp) { 2348 mdk_rdev_t *rdev2; 2349 2350 mddev_lock(mddev); 2351 list_for_each_entry(rdev2, &mddev->disks, same_set) 2352 if (test_bit(AllReserved, &rdev2->flags) || 2353 (rdev->bdev == rdev2->bdev && 2354 rdev != rdev2 && 2355 overlaps(rdev->data_offset, rdev->sectors, 2356 rdev2->data_offset, 2357 rdev2->sectors))) { 2358 overlap = 1; 2359 break; 2360 } 2361 mddev_unlock(mddev); 2362 if (overlap) { 2363 mddev_put(mddev); 2364 break; 2365 } 2366 } 2367 mddev_lock(my_mddev); 2368 if (overlap) { 2369 /* Someone else could have slipped in a size 2370 * change here, but doing so is just silly. 2371 * We put oldsectors back because we *know* it is 2372 * safe, and trust userspace not to race with 2373 * itself 2374 */ 2375 rdev->sectors = oldsectors; 2376 return -EBUSY; 2377 } 2378 } 2379 return len; 2380 } 2381 2382 static struct rdev_sysfs_entry rdev_size = 2383 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2384 2385 static struct attribute *rdev_default_attrs[] = { 2386 &rdev_state.attr, 2387 &rdev_errors.attr, 2388 &rdev_slot.attr, 2389 &rdev_offset.attr, 2390 &rdev_size.attr, 2391 NULL, 2392 }; 2393 static ssize_t 2394 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2395 { 2396 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2397 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2398 mddev_t *mddev = rdev->mddev; 2399 ssize_t rv; 2400 2401 if (!entry->show) 2402 return -EIO; 2403 2404 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2405 if (!rv) { 2406 if (rdev->mddev == NULL) 2407 rv = -EBUSY; 2408 else 2409 rv = entry->show(rdev, page); 2410 mddev_unlock(mddev); 2411 } 2412 return rv; 2413 } 2414 2415 static ssize_t 2416 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2417 const char *page, size_t length) 2418 { 2419 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2420 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2421 ssize_t rv; 2422 mddev_t *mddev = rdev->mddev; 2423 2424 if (!entry->store) 2425 return -EIO; 2426 if (!capable(CAP_SYS_ADMIN)) 2427 return -EACCES; 2428 rv = mddev ? mddev_lock(mddev): -EBUSY; 2429 if (!rv) { 2430 if (rdev->mddev == NULL) 2431 rv = -EBUSY; 2432 else 2433 rv = entry->store(rdev, page, length); 2434 mddev_unlock(mddev); 2435 } 2436 return rv; 2437 } 2438 2439 static void rdev_free(struct kobject *ko) 2440 { 2441 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2442 kfree(rdev); 2443 } 2444 static struct sysfs_ops rdev_sysfs_ops = { 2445 .show = rdev_attr_show, 2446 .store = rdev_attr_store, 2447 }; 2448 static struct kobj_type rdev_ktype = { 2449 .release = rdev_free, 2450 .sysfs_ops = &rdev_sysfs_ops, 2451 .default_attrs = rdev_default_attrs, 2452 }; 2453 2454 /* 2455 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2456 * 2457 * mark the device faulty if: 2458 * 2459 * - the device is nonexistent (zero size) 2460 * - the device has no valid superblock 2461 * 2462 * a faulty rdev _never_ has rdev->sb set. 
2463 */ 2464 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2465 { 2466 char b[BDEVNAME_SIZE]; 2467 int err; 2468 mdk_rdev_t *rdev; 2469 sector_t size; 2470 2471 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2472 if (!rdev) { 2473 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2474 return ERR_PTR(-ENOMEM); 2475 } 2476 2477 if ((err = alloc_disk_sb(rdev))) 2478 goto abort_free; 2479 2480 err = lock_rdev(rdev, newdev, super_format == -2); 2481 if (err) 2482 goto abort_free; 2483 2484 kobject_init(&rdev->kobj, &rdev_ktype); 2485 2486 rdev->desc_nr = -1; 2487 rdev->saved_raid_disk = -1; 2488 rdev->raid_disk = -1; 2489 rdev->flags = 0; 2490 rdev->data_offset = 0; 2491 rdev->sb_events = 0; 2492 atomic_set(&rdev->nr_pending, 0); 2493 atomic_set(&rdev->read_errors, 0); 2494 atomic_set(&rdev->corrected_errors, 0); 2495 2496 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2497 if (!size) { 2498 printk(KERN_WARNING 2499 "md: %s has zero or unknown size, marking faulty!\n", 2500 bdevname(rdev->bdev,b)); 2501 err = -EINVAL; 2502 goto abort_free; 2503 } 2504 2505 if (super_format >= 0) { 2506 err = super_types[super_format]. 2507 load_super(rdev, NULL, super_minor); 2508 if (err == -EINVAL) { 2509 printk(KERN_WARNING 2510 "md: %s does not have a valid v%d.%d " 2511 "superblock, not importing!\n", 2512 bdevname(rdev->bdev,b), 2513 super_format, super_minor); 2514 goto abort_free; 2515 } 2516 if (err < 0) { 2517 printk(KERN_WARNING 2518 "md: could not read %s's sb, not importing!\n", 2519 bdevname(rdev->bdev,b)); 2520 goto abort_free; 2521 } 2522 } 2523 2524 INIT_LIST_HEAD(&rdev->same_set); 2525 init_waitqueue_head(&rdev->blocked_wait); 2526 2527 return rdev; 2528 2529 abort_free: 2530 if (rdev->sb_page) { 2531 if (rdev->bdev) 2532 unlock_rdev(rdev); 2533 free_disk_sb(rdev); 2534 } 2535 kfree(rdev); 2536 return ERR_PTR(err); 2537 } 2538 2539 /* 2540 * Check a full RAID array for plausibility 2541 */ 2542 2543 2544 static void analyze_sbs(mddev_t * mddev) 2545 { 2546 int i; 2547 mdk_rdev_t *rdev, *freshest, *tmp; 2548 char b[BDEVNAME_SIZE]; 2549 2550 freshest = NULL; 2551 rdev_for_each(rdev, tmp, mddev) 2552 switch (super_types[mddev->major_version]. 2553 load_super(rdev, freshest, mddev->minor_version)) { 2554 case 1: 2555 freshest = rdev; 2556 break; 2557 case 0: 2558 break; 2559 default: 2560 printk( KERN_ERR \ 2561 "md: fatal superblock inconsistency in %s" 2562 " -- removing from array\n", 2563 bdevname(rdev->bdev,b)); 2564 kick_rdev_from_array(rdev); 2565 } 2566 2567 2568 super_types[mddev->major_version]. 2569 validate_super(mddev, freshest); 2570 2571 i = 0; 2572 rdev_for_each(rdev, tmp, mddev) { 2573 if (rdev->desc_nr >= mddev->max_disks || 2574 i > mddev->max_disks) { 2575 printk(KERN_WARNING 2576 "md: %s: %s: only %d devices permitted\n", 2577 mdname(mddev), bdevname(rdev->bdev, b), 2578 mddev->max_disks); 2579 kick_rdev_from_array(rdev); 2580 continue; 2581 } 2582 if (rdev != freshest) 2583 if (super_types[mddev->major_version]. 
2584 validate_super(mddev, rdev)) { 2585 printk(KERN_WARNING "md: kicking non-fresh %s" 2586 " from array!\n", 2587 bdevname(rdev->bdev,b)); 2588 kick_rdev_from_array(rdev); 2589 continue; 2590 } 2591 if (mddev->level == LEVEL_MULTIPATH) { 2592 rdev->desc_nr = i++; 2593 rdev->raid_disk = rdev->desc_nr; 2594 set_bit(In_sync, &rdev->flags); 2595 } else if (rdev->raid_disk >= mddev->raid_disks) { 2596 rdev->raid_disk = -1; 2597 clear_bit(In_sync, &rdev->flags); 2598 } 2599 } 2600 2601 2602 2603 if (mddev->recovery_cp != MaxSector && 2604 mddev->level >= 1) 2605 printk(KERN_ERR "md: %s: raid array is not clean" 2606 " -- starting background reconstruction\n", 2607 mdname(mddev)); 2608 2609 } 2610 2611 static void md_safemode_timeout(unsigned long data); 2612 2613 static ssize_t 2614 safe_delay_show(mddev_t *mddev, char *page) 2615 { 2616 int msec = (mddev->safemode_delay*1000)/HZ; 2617 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2618 } 2619 static ssize_t 2620 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2621 { 2622 int scale=1; 2623 int dot=0; 2624 int i; 2625 unsigned long msec; 2626 char buf[30]; 2627 2628 /* remove a period, and count digits after it */ 2629 if (len >= sizeof(buf)) 2630 return -EINVAL; 2631 strlcpy(buf, cbuf, sizeof(buf)); 2632 for (i=0; i<len; i++) { 2633 if (dot) { 2634 if (isdigit(buf[i])) { 2635 buf[i-1] = buf[i]; 2636 scale *= 10; 2637 } 2638 buf[i] = 0; 2639 } else if (buf[i] == '.') { 2640 dot=1; 2641 buf[i] = 0; 2642 } 2643 } 2644 if (strict_strtoul(buf, 10, &msec) < 0) 2645 return -EINVAL; 2646 msec = (msec * 1000) / scale; 2647 if (msec == 0) 2648 mddev->safemode_delay = 0; 2649 else { 2650 unsigned long old_delay = mddev->safemode_delay; 2651 mddev->safemode_delay = (msec*HZ)/1000; 2652 if (mddev->safemode_delay == 0) 2653 mddev->safemode_delay = 1; 2654 if (mddev->safemode_delay < old_delay) 2655 md_safemode_timeout((unsigned long)mddev); 2656 } 2657 return len; 2658 } 2659 static struct md_sysfs_entry md_safe_delay = 2660 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2661 2662 static ssize_t 2663 level_show(mddev_t *mddev, char *page) 2664 { 2665 struct mdk_personality *p = mddev->pers; 2666 if (p) 2667 return sprintf(page, "%s\n", p->name); 2668 else if (mddev->clevel[0]) 2669 return sprintf(page, "%s\n", mddev->clevel); 2670 else if (mddev->level != LEVEL_NONE) 2671 return sprintf(page, "%d\n", mddev->level); 2672 else 2673 return 0; 2674 } 2675 2676 static ssize_t 2677 level_store(mddev_t *mddev, const char *buf, size_t len) 2678 { 2679 char level[16]; 2680 ssize_t rv = len; 2681 struct mdk_personality *pers; 2682 void *priv; 2683 2684 if (mddev->pers == NULL) { 2685 if (len == 0) 2686 return 0; 2687 if (len >= sizeof(mddev->clevel)) 2688 return -ENOSPC; 2689 strncpy(mddev->clevel, buf, len); 2690 if (mddev->clevel[len-1] == '\n') 2691 len--; 2692 mddev->clevel[len] = 0; 2693 mddev->level = LEVEL_NONE; 2694 return rv; 2695 } 2696 2697 /* request to change the personality. Need to ensure: 2698 * - array is not engaged in resync/recovery/reshape 2699 * - old personality can be suspended 2700 * - new personality will access other array. 
2701 */ 2702 2703 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 2704 return -EBUSY; 2705 2706 if (!mddev->pers->quiesce) { 2707 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 2708 mdname(mddev), mddev->pers->name); 2709 return -EINVAL; 2710 } 2711 2712 /* Now find the new personality */ 2713 if (len == 0 || len >= sizeof(level)) 2714 return -EINVAL; 2715 strncpy(level, buf, len); 2716 if (level[len-1] == '\n') 2717 len--; 2718 level[len] = 0; 2719 2720 request_module("md-%s", level); 2721 spin_lock(&pers_lock); 2722 pers = find_pers(LEVEL_NONE, level); 2723 if (!pers || !try_module_get(pers->owner)) { 2724 spin_unlock(&pers_lock); 2725 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2726 return -EINVAL; 2727 } 2728 spin_unlock(&pers_lock); 2729 2730 if (pers == mddev->pers) { 2731 /* Nothing to do! */ 2732 module_put(pers->owner); 2733 return rv; 2734 } 2735 if (!pers->takeover) { 2736 module_put(pers->owner); 2737 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2738 mdname(mddev), level); 2739 return -EINVAL; 2740 } 2741 2742 /* ->takeover must set new_* and/or delta_disks 2743 * if it succeeds, and may set them when it fails. 2744 */ 2745 priv = pers->takeover(mddev); 2746 if (IS_ERR(priv)) { 2747 mddev->new_level = mddev->level; 2748 mddev->new_layout = mddev->layout; 2749 mddev->new_chunk = mddev->chunk_size; 2750 mddev->raid_disks -= mddev->delta_disks; 2751 mddev->delta_disks = 0; 2752 module_put(pers->owner); 2753 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2754 mdname(mddev), level); 2755 return PTR_ERR(priv); 2756 } 2757 2758 /* Looks like we have a winner */ 2759 mddev_suspend(mddev); 2760 mddev->pers->stop(mddev); 2761 module_put(mddev->pers->owner); 2762 mddev->pers = pers; 2763 mddev->private = priv; 2764 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2765 mddev->level = mddev->new_level; 2766 mddev->layout = mddev->new_layout; 2767 mddev->chunk_size = mddev->new_chunk; 2768 mddev->delta_disks = 0; 2769 pers->run(mddev); 2770 mddev_resume(mddev); 2771 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2772 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2773 md_wakeup_thread(mddev->thread); 2774 return rv; 2775 } 2776 2777 static struct md_sysfs_entry md_level = 2778 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2779 2780 2781 static ssize_t 2782 layout_show(mddev_t *mddev, char *page) 2783 { 2784 /* just a number, not meaningful for all levels */ 2785 if (mddev->reshape_position != MaxSector && 2786 mddev->layout != mddev->new_layout) 2787 return sprintf(page, "%d (%d)\n", 2788 mddev->new_layout, mddev->layout); 2789 return sprintf(page, "%d\n", mddev->layout); 2790 } 2791 2792 static ssize_t 2793 layout_store(mddev_t *mddev, const char *buf, size_t len) 2794 { 2795 char *e; 2796 unsigned long n = simple_strtoul(buf, &e, 10); 2797 2798 if (!*buf || (*e && *e != '\n')) 2799 return -EINVAL; 2800 2801 if (mddev->pers) { 2802 int err; 2803 if (mddev->pers->reconfig == NULL) 2804 return -EBUSY; 2805 err = mddev->pers->reconfig(mddev, n, -1); 2806 if (err) 2807 return err; 2808 } else { 2809 mddev->new_layout = n; 2810 if (mddev->reshape_position == MaxSector) 2811 mddev->layout = n; 2812 } 2813 return len; 2814 } 2815 static struct md_sysfs_entry md_layout = 2816 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2817 2818 2819 static ssize_t 2820 raid_disks_show(mddev_t *mddev, char *page) 2821 { 2822 if (mddev->raid_disks == 0) 2823 return 0; 2824 if 
(mddev->reshape_position != MaxSector && 2825 mddev->delta_disks != 0) 2826 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2827 mddev->raid_disks - mddev->delta_disks); 2828 return sprintf(page, "%d\n", mddev->raid_disks); 2829 } 2830 2831 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2832 2833 static ssize_t 2834 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2835 { 2836 char *e; 2837 int rv = 0; 2838 unsigned long n = simple_strtoul(buf, &e, 10); 2839 2840 if (!*buf || (*e && *e != '\n')) 2841 return -EINVAL; 2842 2843 if (mddev->pers) 2844 rv = update_raid_disks(mddev, n); 2845 else if (mddev->reshape_position != MaxSector) { 2846 int olddisks = mddev->raid_disks - mddev->delta_disks; 2847 mddev->delta_disks = n - olddisks; 2848 mddev->raid_disks = n; 2849 } else 2850 mddev->raid_disks = n; 2851 return rv ? rv : len; 2852 } 2853 static struct md_sysfs_entry md_raid_disks = 2854 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2855 2856 static ssize_t 2857 chunk_size_show(mddev_t *mddev, char *page) 2858 { 2859 if (mddev->reshape_position != MaxSector && 2860 mddev->chunk_size != mddev->new_chunk) 2861 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2862 mddev->chunk_size); 2863 return sprintf(page, "%d\n", mddev->chunk_size); 2864 } 2865 2866 static ssize_t 2867 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2868 { 2869 char *e; 2870 unsigned long n = simple_strtoul(buf, &e, 10); 2871 2872 if (!*buf || (*e && *e != '\n')) 2873 return -EINVAL; 2874 2875 if (mddev->pers) { 2876 int err; 2877 if (mddev->pers->reconfig == NULL) 2878 return -EBUSY; 2879 err = mddev->pers->reconfig(mddev, -1, n); 2880 if (err) 2881 return err; 2882 } else { 2883 mddev->new_chunk = n; 2884 if (mddev->reshape_position == MaxSector) 2885 mddev->chunk_size = n; 2886 } 2887 return len; 2888 } 2889 static struct md_sysfs_entry md_chunk_size = 2890 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2891 2892 static ssize_t 2893 resync_start_show(mddev_t *mddev, char *page) 2894 { 2895 if (mddev->recovery_cp == MaxSector) 2896 return sprintf(page, "none\n"); 2897 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2898 } 2899 2900 static ssize_t 2901 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2902 { 2903 char *e; 2904 unsigned long long n = simple_strtoull(buf, &e, 10); 2905 2906 if (mddev->pers) 2907 return -EBUSY; 2908 if (!*buf || (*e && *e != '\n')) 2909 return -EINVAL; 2910 2911 mddev->recovery_cp = n; 2912 return len; 2913 } 2914 static struct md_sysfs_entry md_resync_start = 2915 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2916 2917 /* 2918 * The array state can be: 2919 * 2920 * clear 2921 * No devices, no size, no level 2922 * Equivalent to STOP_ARRAY ioctl 2923 * inactive 2924 * May have some settings, but array is not active 2925 * all IO results in error 2926 * When written, doesn't tear down array, but just stops it 2927 * suspended (not supported yet) 2928 * All IO requests will block. The array can be reconfigured. 2929 * Writing this, if accepted, will block until array is quiescent 2930 * readonly 2931 * no resync can happen. no superblocks get written. 2932 * write requests fail 2933 * read-auto 2934 * like readonly, but behaves like 'clean' on a write request. 2935 * 2936 * clean - no pending writes, but otherwise active. 
2937 * When written to inactive array, starts without resync 2938 * If a write request arrives then 2939 * if metadata is known, mark 'dirty' and switch to 'active'. 2940 * if not known, block and switch to write-pending 2941 * If written to an active array that has pending writes, then fails. 2942 * active 2943 * fully active: IO and resync can be happening. 2944 * When written to inactive array, starts with resync 2945 * 2946 * write-pending 2947 * clean, but writes are blocked waiting for 'active' to be written. 2948 * 2949 * active-idle 2950 * like active, but no writes have been seen for a while (100msec). 2951 * 2952 */ 2953 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2954 write_pending, active_idle, bad_word}; 2955 static char *array_states[] = { 2956 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2957 "write-pending", "active-idle", NULL }; 2958 2959 static int match_word(const char *word, char **list) 2960 { 2961 int n; 2962 for (n=0; list[n]; n++) 2963 if (cmd_match(word, list[n])) 2964 break; 2965 return n; 2966 } 2967 2968 static ssize_t 2969 array_state_show(mddev_t *mddev, char *page) 2970 { 2971 enum array_state st = inactive; 2972 2973 if (mddev->pers) 2974 switch(mddev->ro) { 2975 case 1: 2976 st = readonly; 2977 break; 2978 case 2: 2979 st = read_auto; 2980 break; 2981 case 0: 2982 if (mddev->in_sync) 2983 st = clean; 2984 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2985 st = write_pending; 2986 else if (mddev->safemode) 2987 st = active_idle; 2988 else 2989 st = active; 2990 } 2991 else { 2992 if (list_empty(&mddev->disks) && 2993 mddev->raid_disks == 0 && 2994 mddev->dev_sectors == 0) 2995 st = clear; 2996 else 2997 st = inactive; 2998 } 2999 return sprintf(page, "%s\n", array_states[st]); 3000 } 3001 3002 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3003 static int do_md_run(mddev_t * mddev); 3004 static int restart_array(mddev_t *mddev); 3005 3006 static ssize_t 3007 array_state_store(mddev_t *mddev, const char *buf, size_t len) 3008 { 3009 int err = -EINVAL; 3010 enum array_state st = match_word(buf, array_states); 3011 switch(st) { 3012 case bad_word: 3013 break; 3014 case clear: 3015 /* stopping an active array */ 3016 if (atomic_read(&mddev->openers) > 0) 3017 return -EBUSY; 3018 err = do_md_stop(mddev, 0, 0); 3019 break; 3020 case inactive: 3021 /* stopping an active array */ 3022 if (mddev->pers) { 3023 if (atomic_read(&mddev->openers) > 0) 3024 return -EBUSY; 3025 err = do_md_stop(mddev, 2, 0); 3026 } else 3027 err = 0; /* already inactive */ 3028 break; 3029 case suspended: 3030 break; /* not supported yet */ 3031 case readonly: 3032 if (mddev->pers) 3033 err = do_md_stop(mddev, 1, 0); 3034 else { 3035 mddev->ro = 1; 3036 set_disk_ro(mddev->gendisk, 1); 3037 err = do_md_run(mddev); 3038 } 3039 break; 3040 case read_auto: 3041 if (mddev->pers) { 3042 if (mddev->ro == 0) 3043 err = do_md_stop(mddev, 1, 0); 3044 else if (mddev->ro == 1) 3045 err = restart_array(mddev); 3046 if (err == 0) { 3047 mddev->ro = 2; 3048 set_disk_ro(mddev->gendisk, 0); 3049 } 3050 } else { 3051 mddev->ro = 2; 3052 err = do_md_run(mddev); 3053 } 3054 break; 3055 case clean: 3056 if (mddev->pers) { 3057 restart_array(mddev); 3058 spin_lock_irq(&mddev->write_lock); 3059 if (atomic_read(&mddev->writes_pending) == 0) { 3060 if (mddev->in_sync == 0) { 3061 mddev->in_sync = 1; 3062 if (mddev->safemode == 1) 3063 mddev->safemode = 0; 3064 if (mddev->persistent) 3065 set_bit(MD_CHANGE_CLEAN, 3066 
&mddev->flags); 3067 } 3068 err = 0; 3069 } else 3070 err = -EBUSY; 3071 spin_unlock_irq(&mddev->write_lock); 3072 } else 3073 err = -EINVAL; 3074 break; 3075 case active: 3076 if (mddev->pers) { 3077 restart_array(mddev); 3078 if (mddev->external) 3079 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3080 wake_up(&mddev->sb_wait); 3081 err = 0; 3082 } else { 3083 mddev->ro = 0; 3084 set_disk_ro(mddev->gendisk, 0); 3085 err = do_md_run(mddev); 3086 } 3087 break; 3088 case write_pending: 3089 case active_idle: 3090 /* these cannot be set */ 3091 break; 3092 } 3093 if (err) 3094 return err; 3095 else { 3096 sysfs_notify_dirent(mddev->sysfs_state); 3097 return len; 3098 } 3099 } 3100 static struct md_sysfs_entry md_array_state = 3101 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3102 3103 static ssize_t 3104 null_show(mddev_t *mddev, char *page) 3105 { 3106 return -EINVAL; 3107 } 3108 3109 static ssize_t 3110 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 3111 { 3112 /* buf must be %d:%d\n? giving major and minor numbers */ 3113 /* The new device is added to the array. 3114 * If the array has a persistent superblock, we read the 3115 * superblock to initialise info and check validity. 3116 * Otherwise, only checking done is that in bind_rdev_to_array, 3117 * which mainly checks size. 3118 */ 3119 char *e; 3120 int major = simple_strtoul(buf, &e, 10); 3121 int minor; 3122 dev_t dev; 3123 mdk_rdev_t *rdev; 3124 int err; 3125 3126 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3127 return -EINVAL; 3128 minor = simple_strtoul(e+1, &e, 10); 3129 if (*e && *e != '\n') 3130 return -EINVAL; 3131 dev = MKDEV(major, minor); 3132 if (major != MAJOR(dev) || 3133 minor != MINOR(dev)) 3134 return -EOVERFLOW; 3135 3136 3137 if (mddev->persistent) { 3138 rdev = md_import_device(dev, mddev->major_version, 3139 mddev->minor_version); 3140 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3141 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3142 mdk_rdev_t, same_set); 3143 err = super_types[mddev->major_version] 3144 .load_super(rdev, rdev0, mddev->minor_version); 3145 if (err < 0) 3146 goto out; 3147 } 3148 } else if (mddev->external) 3149 rdev = md_import_device(dev, -2, -1); 3150 else 3151 rdev = md_import_device(dev, -1, -1); 3152 3153 if (IS_ERR(rdev)) 3154 return PTR_ERR(rdev); 3155 err = bind_rdev_to_array(rdev, mddev); 3156 out: 3157 if (err) 3158 export_rdev(rdev); 3159 return err ? err : len; 3160 } 3161 3162 static struct md_sysfs_entry md_new_device = 3163 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3164 3165 static ssize_t 3166 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 3167 { 3168 char *end; 3169 unsigned long chunk, end_chunk; 3170 3171 if (!mddev->bitmap) 3172 goto out; 3173 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */
3174 while (*buf) {
3175 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3176 if (buf == end) break;
3177 if (*end == '-') { /* range */
3178 buf = end + 1;
3179 end_chunk = simple_strtoul(buf, &end, 0);
3180 if (buf == end) break;
3181 }
3182 if (*end && !isspace(*end)) break;
3183 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3184 buf = end;
3185 while (isspace(*buf)) buf++;
3186 }
3187 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3188 out:
3189 return len;
3190 }
3191
3192 static struct md_sysfs_entry md_bitmap =
3193 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3194
3195 static ssize_t
3196 size_show(mddev_t *mddev, char *page)
3197 {
3198 return sprintf(page, "%llu\n",
3199 (unsigned long long)mddev->dev_sectors / 2);
3200 }
3201
3202 static int update_size(mddev_t *mddev, sector_t num_sectors);
3203
3204 static ssize_t
3205 size_store(mddev_t *mddev, const char *buf, size_t len)
3206 {
3207 /* If array is inactive, we can reduce the component size, but
3208 * not increase it (except from 0).
3209 * If array is active, we can try an on-line resize
3210 */
3211 sector_t sectors;
3212 int err = strict_blocks_to_sectors(buf, &sectors);
3213
3214 if (err < 0)
3215 return err;
3216 if (mddev->pers) {
3217 err = update_size(mddev, sectors);
3218 md_update_sb(mddev, 1);
3219 } else {
3220 if (mddev->dev_sectors == 0 ||
3221 mddev->dev_sectors > sectors)
3222 mddev->dev_sectors = sectors;
3223 else
3224 err = -ENOSPC;
3225 }
3226 return err ? err : len;
3227 }
3228
3229 static struct md_sysfs_entry md_size =
3230 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3231
3232
3233 /* Metadata version.
3234 * This is one of
3235 * 'none' for arrays with no metadata (good luck...)
3236 * 'external' for arrays with externally managed metadata,
3237 * or N.M for internally known formats
3238 */
3239 static ssize_t
3240 metadata_show(mddev_t *mddev, char *page)
3241 {
3242 if (mddev->persistent)
3243 return sprintf(page, "%d.%d\n",
3244 mddev->major_version, mddev->minor_version);
3245 else if (mddev->external)
3246 return sprintf(page, "external:%s\n", mddev->metadata_type);
3247 else
3248 return sprintf(page, "none\n");
3249 }
3250
3251 static ssize_t
3252 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3253 {
3254 int major, minor;
3255 char *e;
3256 /* Changing the details of 'external' metadata is
3257 * always permitted. Otherwise there must be
3258 * no devices attached to the array.
3259 */ 3260 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3261 ; 3262 else if (!list_empty(&mddev->disks)) 3263 return -EBUSY; 3264 3265 if (cmd_match(buf, "none")) { 3266 mddev->persistent = 0; 3267 mddev->external = 0; 3268 mddev->major_version = 0; 3269 mddev->minor_version = 90; 3270 return len; 3271 } 3272 if (strncmp(buf, "external:", 9) == 0) { 3273 size_t namelen = len-9; 3274 if (namelen >= sizeof(mddev->metadata_type)) 3275 namelen = sizeof(mddev->metadata_type)-1; 3276 strncpy(mddev->metadata_type, buf+9, namelen); 3277 mddev->metadata_type[namelen] = 0; 3278 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3279 mddev->metadata_type[--namelen] = 0; 3280 mddev->persistent = 0; 3281 mddev->external = 1; 3282 mddev->major_version = 0; 3283 mddev->minor_version = 90; 3284 return len; 3285 } 3286 major = simple_strtoul(buf, &e, 10); 3287 if (e==buf || *e != '.') 3288 return -EINVAL; 3289 buf = e+1; 3290 minor = simple_strtoul(buf, &e, 10); 3291 if (e==buf || (*e && *e != '\n') ) 3292 return -EINVAL; 3293 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3294 return -ENOENT; 3295 mddev->major_version = major; 3296 mddev->minor_version = minor; 3297 mddev->persistent = 1; 3298 mddev->external = 0; 3299 return len; 3300 } 3301 3302 static struct md_sysfs_entry md_metadata = 3303 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3304 3305 static ssize_t 3306 action_show(mddev_t *mddev, char *page) 3307 { 3308 char *type = "idle"; 3309 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3310 type = "frozen"; 3311 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3312 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3313 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3314 type = "reshape"; 3315 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3316 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3317 type = "resync"; 3318 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3319 type = "check"; 3320 else 3321 type = "repair"; 3322 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3323 type = "recover"; 3324 } 3325 return sprintf(page, "%s\n", type); 3326 } 3327 3328 static ssize_t 3329 action_store(mddev_t *mddev, const char *page, size_t len) 3330 { 3331 if (!mddev->pers || !mddev->pers->sync_request) 3332 return -EINVAL; 3333 3334 if (cmd_match(page, "frozen")) 3335 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3336 else 3337 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3338 3339 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3340 if (mddev->sync_thread) { 3341 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3342 md_unregister_thread(mddev->sync_thread); 3343 mddev->sync_thread = NULL; 3344 mddev->recovery = 0; 3345 } 3346 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3347 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3348 return -EBUSY; 3349 else if (cmd_match(page, "resync")) 3350 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3351 else if (cmd_match(page, "recover")) { 3352 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3353 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3354 } else if (cmd_match(page, "reshape")) { 3355 int err; 3356 if (mddev->pers->start_reshape == NULL) 3357 return -EINVAL; 3358 err = mddev->pers->start_reshape(mddev); 3359 if (err) 3360 return err; 3361 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3362 } else { 3363 if (cmd_match(page, "check")) 3364 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3365 
else if (!cmd_match(page, "repair")) 3366 return -EINVAL; 3367 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3368 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3369 } 3370 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3371 md_wakeup_thread(mddev->thread); 3372 sysfs_notify_dirent(mddev->sysfs_action); 3373 return len; 3374 } 3375 3376 static ssize_t 3377 mismatch_cnt_show(mddev_t *mddev, char *page) 3378 { 3379 return sprintf(page, "%llu\n", 3380 (unsigned long long) mddev->resync_mismatches); 3381 } 3382 3383 static struct md_sysfs_entry md_scan_mode = 3384 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3385 3386 3387 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3388 3389 static ssize_t 3390 sync_min_show(mddev_t *mddev, char *page) 3391 { 3392 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3393 mddev->sync_speed_min ? "local": "system"); 3394 } 3395 3396 static ssize_t 3397 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3398 { 3399 int min; 3400 char *e; 3401 if (strncmp(buf, "system", 6)==0) { 3402 mddev->sync_speed_min = 0; 3403 return len; 3404 } 3405 min = simple_strtoul(buf, &e, 10); 3406 if (buf == e || (*e && *e != '\n') || min <= 0) 3407 return -EINVAL; 3408 mddev->sync_speed_min = min; 3409 return len; 3410 } 3411 3412 static struct md_sysfs_entry md_sync_min = 3413 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3414 3415 static ssize_t 3416 sync_max_show(mddev_t *mddev, char *page) 3417 { 3418 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3419 mddev->sync_speed_max ? "local": "system"); 3420 } 3421 3422 static ssize_t 3423 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3424 { 3425 int max; 3426 char *e; 3427 if (strncmp(buf, "system", 6)==0) { 3428 mddev->sync_speed_max = 0; 3429 return len; 3430 } 3431 max = simple_strtoul(buf, &e, 10); 3432 if (buf == e || (*e && *e != '\n') || max <= 0) 3433 return -EINVAL; 3434 mddev->sync_speed_max = max; 3435 return len; 3436 } 3437 3438 static struct md_sysfs_entry md_sync_max = 3439 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3440 3441 static ssize_t 3442 degraded_show(mddev_t *mddev, char *page) 3443 { 3444 return sprintf(page, "%d\n", mddev->degraded); 3445 } 3446 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3447 3448 static ssize_t 3449 sync_force_parallel_show(mddev_t *mddev, char *page) 3450 { 3451 return sprintf(page, "%d\n", mddev->parallel_resync); 3452 } 3453 3454 static ssize_t 3455 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3456 { 3457 long n; 3458 3459 if (strict_strtol(buf, 10, &n)) 3460 return -EINVAL; 3461 3462 if (n != 0 && n != 1) 3463 return -EINVAL; 3464 3465 mddev->parallel_resync = n; 3466 3467 if (mddev->sync_thread) 3468 wake_up(&resync_wait); 3469 3470 return len; 3471 } 3472 3473 /* force parallel resync, even with shared block devices */ 3474 static struct md_sysfs_entry md_sync_force_parallel = 3475 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3476 sync_force_parallel_show, sync_force_parallel_store); 3477 3478 static ssize_t 3479 sync_speed_show(mddev_t *mddev, char *page) 3480 { 3481 unsigned long resync, dt, db; 3482 if (mddev->curr_resync == 0) 3483 return sprintf(page, "none\n"); 3484 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3485 dt = (jiffies - mddev->resync_mark) / HZ; 3486 if (!dt) dt++; 3487 db = resync - mddev->resync_mark_cnt; 3488 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3489 } 
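/* Illustrative note, not part of the original driver: sync_speed_show()
 * above reports the recent resync throughput in KiB/sec -- roughly the
 * sectors completed since the last resync mark, divided by the elapsed
 * seconds, then halved to convert 512-byte sectors to KiB.  A hypothetical
 * userspace check could be:
 *
 *	cat /sys/block/md0/md/sync_speed
 *
 * where "md0" is just an example array name.
 */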
3490 3491 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3492 3493 static ssize_t 3494 sync_completed_show(mddev_t *mddev, char *page) 3495 { 3496 unsigned long max_sectors, resync; 3497 3498 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3499 return sprintf(page, "none\n"); 3500 3501 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3502 max_sectors = mddev->resync_max_sectors; 3503 else 3504 max_sectors = mddev->dev_sectors; 3505 3506 resync = mddev->curr_resync_completed; 3507 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3508 } 3509 3510 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3511 3512 static ssize_t 3513 min_sync_show(mddev_t *mddev, char *page) 3514 { 3515 return sprintf(page, "%llu\n", 3516 (unsigned long long)mddev->resync_min); 3517 } 3518 static ssize_t 3519 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3520 { 3521 unsigned long long min; 3522 if (strict_strtoull(buf, 10, &min)) 3523 return -EINVAL; 3524 if (min > mddev->resync_max) 3525 return -EINVAL; 3526 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3527 return -EBUSY; 3528 3529 /* Must be a multiple of chunk_size */ 3530 if (mddev->chunk_size) { 3531 if (min & (sector_t)((mddev->chunk_size>>9)-1)) 3532 return -EINVAL; 3533 } 3534 mddev->resync_min = min; 3535 3536 return len; 3537 } 3538 3539 static struct md_sysfs_entry md_min_sync = 3540 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3541 3542 static ssize_t 3543 max_sync_show(mddev_t *mddev, char *page) 3544 { 3545 if (mddev->resync_max == MaxSector) 3546 return sprintf(page, "max\n"); 3547 else 3548 return sprintf(page, "%llu\n", 3549 (unsigned long long)mddev->resync_max); 3550 } 3551 static ssize_t 3552 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3553 { 3554 if (strncmp(buf, "max", 3) == 0) 3555 mddev->resync_max = MaxSector; 3556 else { 3557 unsigned long long max; 3558 if (strict_strtoull(buf, 10, &max)) 3559 return -EINVAL; 3560 if (max < mddev->resync_min) 3561 return -EINVAL; 3562 if (max < mddev->resync_max && 3563 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3564 return -EBUSY; 3565 3566 /* Must be a multiple of chunk_size */ 3567 if (mddev->chunk_size) { 3568 if (max & (sector_t)((mddev->chunk_size>>9)-1)) 3569 return -EINVAL; 3570 } 3571 mddev->resync_max = max; 3572 } 3573 wake_up(&mddev->recovery_wait); 3574 return len; 3575 } 3576 3577 static struct md_sysfs_entry md_max_sync = 3578 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3579 3580 static ssize_t 3581 suspend_lo_show(mddev_t *mddev, char *page) 3582 { 3583 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3584 } 3585 3586 static ssize_t 3587 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3588 { 3589 char *e; 3590 unsigned long long new = simple_strtoull(buf, &e, 10); 3591 3592 if (mddev->pers->quiesce == NULL) 3593 return -EINVAL; 3594 if (buf == e || (*e && *e != '\n')) 3595 return -EINVAL; 3596 if (new >= mddev->suspend_hi || 3597 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3598 mddev->suspend_lo = new; 3599 mddev->pers->quiesce(mddev, 2); 3600 return len; 3601 } else 3602 return -EINVAL; 3603 } 3604 static struct md_sysfs_entry md_suspend_lo = 3605 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3606 3607 3608 static ssize_t 3609 suspend_hi_show(mddev_t *mddev, char *page) 3610 { 3611 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3612 } 3613 3614 
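/* Illustrative note, not part of the original driver: suspend_lo and
 * suspend_hi describe a window of sectors in which incoming I/O is
 * temporarily held off, so that an external metadata handler can safely
 * work on that region (for example during an externally managed reshape).
 * A hypothetical userspace sequence might look like:
 *
 *	echo 0      > /sys/block/md0/md/suspend_lo
 *	echo 131072 > /sys/block/md0/md/suspend_hi
 *
 * "md0" and the sector values are examples only; the store functions here
 * validate the new values against the current window and reject others
 * with -EINVAL.
 */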
static ssize_t
3615 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3616 {
3617 char *e;
3618 unsigned long long new = simple_strtoull(buf, &e, 10);
3619
3620 if (mddev->pers->quiesce == NULL)
3621 return -EINVAL;
3622 if (buf == e || (*e && *e != '\n'))
3623 return -EINVAL;
3624 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3625 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3626 mddev->suspend_hi = new;
3627 mddev->pers->quiesce(mddev, 1);
3628 mddev->pers->quiesce(mddev, 0);
3629 return len;
3630 } else
3631 return -EINVAL;
3632 }
3633 static struct md_sysfs_entry md_suspend_hi =
3634 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3635
3636 static ssize_t
3637 reshape_position_show(mddev_t *mddev, char *page)
3638 {
3639 if (mddev->reshape_position != MaxSector)
3640 return sprintf(page, "%llu\n",
3641 (unsigned long long)mddev->reshape_position);
3642 strcpy(page, "none\n");
3643 return 5;
3644 }
3645
3646 static ssize_t
3647 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3648 {
3649 char *e;
3650 unsigned long long new = simple_strtoull(buf, &e, 10);
3651 if (mddev->pers)
3652 return -EBUSY;
3653 if (buf == e || (*e && *e != '\n'))
3654 return -EINVAL;
3655 mddev->reshape_position = new;
3656 mddev->delta_disks = 0;
3657 mddev->new_level = mddev->level;
3658 mddev->new_layout = mddev->layout;
3659 mddev->new_chunk = mddev->chunk_size;
3660 return len;
3661 }
3662
3663 static struct md_sysfs_entry md_reshape_position =
3664 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3665 reshape_position_store);
3666
3667 static ssize_t
3668 array_size_show(mddev_t *mddev, char *page)
3669 {
3670 if (mddev->external_size)
3671 return sprintf(page, "%llu\n",
3672 (unsigned long long)mddev->array_sectors/2);
3673 else
3674 return sprintf(page, "default\n");
3675 }
3676
3677 static ssize_t
3678 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3679 {
3680 sector_t sectors;
3681
3682 if (strncmp(buf, "default", 7) == 0) {
3683 if (mddev->pers)
3684 sectors = mddev->pers->size(mddev, 0, 0);
3685 else
3686 sectors = mddev->array_sectors;
3687
3688 mddev->external_size = 0;
3689 } else {
3690 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3691 return -EINVAL;
3692 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3693 return -E2BIG;
3694
3695 mddev->external_size = 1;
3696 }
3697
3698 mddev->array_sectors = sectors;
3699 set_capacity(mddev->gendisk, mddev->array_sectors);
3700 if (mddev->pers) {
3701 struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
3702
3703 if (bdev) {
3704 mutex_lock(&bdev->bd_inode->i_mutex);
3705 i_size_write(bdev->bd_inode,
3706 (loff_t)mddev->array_sectors << 9);
3707 mutex_unlock(&bdev->bd_inode->i_mutex);
3708 bdput(bdev);
3709 }
3710 }
3711
3712 return len;
3713 }
3714
3715 static struct md_sysfs_entry md_array_size =
3716 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3717 array_size_store);
3718
3719 static struct attribute *md_default_attrs[] = {
3720 &md_level.attr,
3721 &md_layout.attr,
3722 &md_raid_disks.attr,
3723 &md_chunk_size.attr,
3724 &md_size.attr,
3725 &md_resync_start.attr,
3726 &md_metadata.attr,
3727 &md_new_device.attr,
3728 &md_safe_delay.attr,
3729 &md_array_state.attr,
3730 &md_reshape_position.attr,
3731 &md_array_size.attr,
3732 NULL,
3733 };
3734
3735 static struct attribute *md_redundancy_attrs[] = {
3736 &md_scan_mode.attr,
3737 &md_mismatches.attr,
3738 &md_sync_min.attr,
3739 &md_sync_max.attr,
3740
&md_sync_speed.attr, 3741 &md_sync_force_parallel.attr, 3742 &md_sync_completed.attr, 3743 &md_min_sync.attr, 3744 &md_max_sync.attr, 3745 &md_suspend_lo.attr, 3746 &md_suspend_hi.attr, 3747 &md_bitmap.attr, 3748 &md_degraded.attr, 3749 NULL, 3750 }; 3751 static struct attribute_group md_redundancy_group = { 3752 .name = NULL, 3753 .attrs = md_redundancy_attrs, 3754 }; 3755 3756 3757 static ssize_t 3758 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3759 { 3760 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3761 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3762 ssize_t rv; 3763 3764 if (!entry->show) 3765 return -EIO; 3766 rv = mddev_lock(mddev); 3767 if (!rv) { 3768 rv = entry->show(mddev, page); 3769 mddev_unlock(mddev); 3770 } 3771 return rv; 3772 } 3773 3774 static ssize_t 3775 md_attr_store(struct kobject *kobj, struct attribute *attr, 3776 const char *page, size_t length) 3777 { 3778 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3779 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3780 ssize_t rv; 3781 3782 if (!entry->store) 3783 return -EIO; 3784 if (!capable(CAP_SYS_ADMIN)) 3785 return -EACCES; 3786 rv = mddev_lock(mddev); 3787 if (mddev->hold_active == UNTIL_IOCTL) 3788 mddev->hold_active = 0; 3789 if (!rv) { 3790 rv = entry->store(mddev, page, length); 3791 mddev_unlock(mddev); 3792 } 3793 return rv; 3794 } 3795 3796 static void md_free(struct kobject *ko) 3797 { 3798 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3799 3800 if (mddev->sysfs_state) 3801 sysfs_put(mddev->sysfs_state); 3802 3803 if (mddev->gendisk) { 3804 del_gendisk(mddev->gendisk); 3805 put_disk(mddev->gendisk); 3806 } 3807 if (mddev->queue) 3808 blk_cleanup_queue(mddev->queue); 3809 3810 kfree(mddev); 3811 } 3812 3813 static struct sysfs_ops md_sysfs_ops = { 3814 .show = md_attr_show, 3815 .store = md_attr_store, 3816 }; 3817 static struct kobj_type md_ktype = { 3818 .release = md_free, 3819 .sysfs_ops = &md_sysfs_ops, 3820 .default_attrs = md_default_attrs, 3821 }; 3822 3823 int mdp_major = 0; 3824 3825 static void mddev_delayed_delete(struct work_struct *ws) 3826 { 3827 mddev_t *mddev = container_of(ws, mddev_t, del_work); 3828 3829 if (mddev->private == &md_redundancy_group) { 3830 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3831 if (mddev->sysfs_action) 3832 sysfs_put(mddev->sysfs_action); 3833 mddev->sysfs_action = NULL; 3834 mddev->private = NULL; 3835 } 3836 kobject_del(&mddev->kobj); 3837 kobject_put(&mddev->kobj); 3838 } 3839 3840 static int md_alloc(dev_t dev, char *name) 3841 { 3842 static DEFINE_MUTEX(disks_mutex); 3843 mddev_t *mddev = mddev_find(dev); 3844 struct gendisk *disk; 3845 int partitioned; 3846 int shift; 3847 int unit; 3848 int error; 3849 3850 if (!mddev) 3851 return -ENODEV; 3852 3853 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 3854 shift = partitioned ? MdpMinorShift : 0; 3855 unit = MINOR(mddev->unit) >> shift; 3856 3857 /* wait for any previous instance if this device 3858 * to be completed removed (mddev_delayed_delete). 3859 */ 3860 flush_scheduled_work(); 3861 3862 mutex_lock(&disks_mutex); 3863 if (mddev->gendisk) { 3864 mutex_unlock(&disks_mutex); 3865 mddev_put(mddev); 3866 return -EEXIST; 3867 } 3868 3869 if (name) { 3870 /* Need to ensure that 'name' is not a duplicate. 
3871 */ 3872 mddev_t *mddev2; 3873 spin_lock(&all_mddevs_lock); 3874 3875 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 3876 if (mddev2->gendisk && 3877 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3878 spin_unlock(&all_mddevs_lock); 3879 return -EEXIST; 3880 } 3881 spin_unlock(&all_mddevs_lock); 3882 } 3883 3884 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3885 if (!mddev->queue) { 3886 mutex_unlock(&disks_mutex); 3887 mddev_put(mddev); 3888 return -ENOMEM; 3889 } 3890 mddev->queue->queuedata = mddev; 3891 3892 /* Can be unlocked because the queue is new: no concurrency */ 3893 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3894 3895 blk_queue_make_request(mddev->queue, md_make_request); 3896 3897 disk = alloc_disk(1 << shift); 3898 if (!disk) { 3899 mutex_unlock(&disks_mutex); 3900 blk_cleanup_queue(mddev->queue); 3901 mddev->queue = NULL; 3902 mddev_put(mddev); 3903 return -ENOMEM; 3904 } 3905 disk->major = MAJOR(mddev->unit); 3906 disk->first_minor = unit << shift; 3907 if (name) 3908 strcpy(disk->disk_name, name); 3909 else if (partitioned) 3910 sprintf(disk->disk_name, "md_d%d", unit); 3911 else 3912 sprintf(disk->disk_name, "md%d", unit); 3913 disk->fops = &md_fops; 3914 disk->private_data = mddev; 3915 disk->queue = mddev->queue; 3916 /* Allow extended partitions. This makes the 3917 * 'mdp' device redundant, but we can't really 3918 * remove it now. 3919 */ 3920 disk->flags |= GENHD_FL_EXT_DEVT; 3921 add_disk(disk); 3922 mddev->gendisk = disk; 3923 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3924 &disk_to_dev(disk)->kobj, "%s", "md"); 3925 mutex_unlock(&disks_mutex); 3926 if (error) 3927 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3928 disk->disk_name); 3929 else { 3930 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3931 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3932 } 3933 mddev_put(mddev); 3934 return 0; 3935 } 3936 3937 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3938 { 3939 md_alloc(dev, NULL); 3940 return NULL; 3941 } 3942 3943 static int add_named_array(const char *val, struct kernel_param *kp) 3944 { 3945 /* val must be "md_*" where * is not all digits. 3946 * We allocate an array with a large free minor number, and 3947 * set the name to val. val must not already be an active name. 3948 */ 3949 int len = strlen(val); 3950 char buf[DISK_NAME_LEN]; 3951 3952 while (len && val[len-1] == '\n') 3953 len--; 3954 if (len >= DISK_NAME_LEN) 3955 return -E2BIG; 3956 strlcpy(buf, val, len+1); 3957 if (strncmp(buf, "md_", 3) != 0) 3958 return -EINVAL; 3959 return md_alloc(0, buf); 3960 } 3961 3962 static void md_safemode_timeout(unsigned long data) 3963 { 3964 mddev_t *mddev = (mddev_t *) data; 3965 3966 if (!atomic_read(&mddev->writes_pending)) { 3967 mddev->safemode = 1; 3968 if (mddev->external) 3969 sysfs_notify_dirent(mddev->sysfs_state); 3970 } 3971 md_wakeup_thread(mddev->thread); 3972 } 3973 3974 static int start_dirty_degraded; 3975 3976 static int do_md_run(mddev_t * mddev) 3977 { 3978 int err; 3979 int chunk_size; 3980 mdk_rdev_t *rdev; 3981 struct gendisk *disk; 3982 struct mdk_personality *pers; 3983 char b[BDEVNAME_SIZE]; 3984 3985 if (list_empty(&mddev->disks)) 3986 /* cannot run an array with no devices.. 
*/ 3987 return -EINVAL; 3988 3989 if (mddev->pers) 3990 return -EBUSY; 3991 3992 /* 3993 * Analyze all RAID superblock(s) 3994 */ 3995 if (!mddev->raid_disks) { 3996 if (!mddev->persistent) 3997 return -EINVAL; 3998 analyze_sbs(mddev); 3999 } 4000 4001 chunk_size = mddev->chunk_size; 4002 4003 if (chunk_size) { 4004 if (chunk_size > MAX_CHUNK_SIZE) { 4005 printk(KERN_ERR "too big chunk_size: %d > %d\n", 4006 chunk_size, MAX_CHUNK_SIZE); 4007 return -EINVAL; 4008 } 4009 /* 4010 * chunk-size has to be a power of 2 4011 */ 4012 if ( (1 << ffz(~chunk_size)) != chunk_size) { 4013 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 4014 return -EINVAL; 4015 } 4016 4017 /* devices must have minimum size of one chunk */ 4018 list_for_each_entry(rdev, &mddev->disks, same_set) { 4019 if (test_bit(Faulty, &rdev->flags)) 4020 continue; 4021 if (rdev->sectors < chunk_size / 512) { 4022 printk(KERN_WARNING 4023 "md: Dev %s smaller than chunk_size:" 4024 " %llu < %d\n", 4025 bdevname(rdev->bdev,b), 4026 (unsigned long long)rdev->sectors, 4027 chunk_size / 512); 4028 return -EINVAL; 4029 } 4030 } 4031 } 4032 4033 if (mddev->level != LEVEL_NONE) 4034 request_module("md-level-%d", mddev->level); 4035 else if (mddev->clevel[0]) 4036 request_module("md-%s", mddev->clevel); 4037 4038 /* 4039 * Drop all container device buffers, from now on 4040 * the only valid external interface is through the md 4041 * device. 4042 */ 4043 list_for_each_entry(rdev, &mddev->disks, same_set) { 4044 if (test_bit(Faulty, &rdev->flags)) 4045 continue; 4046 sync_blockdev(rdev->bdev); 4047 invalidate_bdev(rdev->bdev); 4048 4049 /* perform some consistency tests on the device. 4050 * We don't want the data to overlap the metadata, 4051 * Internal Bitmap issues have been handled elsewhere. 4052 */ 4053 if (rdev->data_offset < rdev->sb_start) { 4054 if (mddev->dev_sectors && 4055 rdev->data_offset + mddev->dev_sectors 4056 > rdev->sb_start) { 4057 printk("md: %s: data overlaps metadata\n", 4058 mdname(mddev)); 4059 return -EINVAL; 4060 } 4061 } else { 4062 if (rdev->sb_start + rdev->sb_size/512 4063 > rdev->data_offset) { 4064 printk("md: %s: metadata overlaps data\n", 4065 mdname(mddev)); 4066 return -EINVAL; 4067 } 4068 } 4069 sysfs_notify_dirent(rdev->sysfs_state); 4070 } 4071 4072 md_probe(mddev->unit, NULL, NULL); 4073 disk = mddev->gendisk; 4074 if (!disk) 4075 return -ENOMEM; 4076 4077 spin_lock(&pers_lock); 4078 pers = find_pers(mddev->level, mddev->clevel); 4079 if (!pers || !try_module_get(pers->owner)) { 4080 spin_unlock(&pers_lock); 4081 if (mddev->level != LEVEL_NONE) 4082 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4083 mddev->level); 4084 else 4085 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4086 mddev->clevel); 4087 return -EINVAL; 4088 } 4089 mddev->pers = pers; 4090 spin_unlock(&pers_lock); 4091 if (mddev->level != pers->level) { 4092 mddev->level = pers->level; 4093 mddev->new_level = pers->level; 4094 } 4095 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4096 4097 if (pers->level >= 4 && pers->level <= 6) 4098 /* Cannot support integrity (yet) */ 4099 blk_integrity_unregister(mddev->gendisk); 4100 4101 if (mddev->reshape_position != MaxSector && 4102 pers->start_reshape == NULL) { 4103 /* This personality cannot handle reshaping... */ 4104 mddev->pers = NULL; 4105 module_put(pers->owner); 4106 return -EINVAL; 4107 } 4108 4109 if (pers->sync_request) { 4110 /* Warn if this is a potentially silly 4111 * configuration. 
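* (e.g. two array members that are partitions of the same underlying
* physical disk, which would defeat the redundancy)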
4112 */ 4113 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4114 mdk_rdev_t *rdev2; 4115 int warned = 0; 4116 4117 list_for_each_entry(rdev, &mddev->disks, same_set) 4118 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4119 if (rdev < rdev2 && 4120 rdev->bdev->bd_contains == 4121 rdev2->bdev->bd_contains) { 4122 printk(KERN_WARNING 4123 "%s: WARNING: %s appears to be" 4124 " on the same physical disk as" 4125 " %s.\n", 4126 mdname(mddev), 4127 bdevname(rdev->bdev,b), 4128 bdevname(rdev2->bdev,b2)); 4129 warned = 1; 4130 } 4131 } 4132 4133 if (warned) 4134 printk(KERN_WARNING 4135 "True protection against single-disk" 4136 " failure might be compromised.\n"); 4137 } 4138 4139 mddev->recovery = 0; 4140 /* may be over-ridden by personality */ 4141 mddev->resync_max_sectors = mddev->dev_sectors; 4142 4143 mddev->barriers_work = 1; 4144 mddev->ok_start_degraded = start_dirty_degraded; 4145 4146 if (start_readonly) 4147 mddev->ro = 2; /* read-only, but switch on first write */ 4148 4149 err = mddev->pers->run(mddev); 4150 if (err) 4151 printk(KERN_ERR "md: pers->run() failed ...\n"); 4152 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4153 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4154 " but 'external_size' not in effect?\n", __func__); 4155 printk(KERN_ERR 4156 "md: invalid array_size %llu > default size %llu\n", 4157 (unsigned long long)mddev->array_sectors / 2, 4158 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4159 err = -EINVAL; 4160 mddev->pers->stop(mddev); 4161 } 4162 if (err == 0 && mddev->pers->sync_request) { 4163 err = bitmap_create(mddev); 4164 if (err) { 4165 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4166 mdname(mddev), err); 4167 mddev->pers->stop(mddev); 4168 } 4169 } 4170 if (err) { 4171 module_put(mddev->pers->owner); 4172 mddev->pers = NULL; 4173 bitmap_destroy(mddev); 4174 return err; 4175 } 4176 if (mddev->pers->sync_request) { 4177 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4178 printk(KERN_WARNING 4179 "md: cannot register extra attributes for %s\n", 4180 mdname(mddev)); 4181 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4182 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4183 mddev->ro = 0; 4184 4185 atomic_set(&mddev->writes_pending,0); 4186 mddev->safemode = 0; 4187 mddev->safemode_timer.function = md_safemode_timeout; 4188 mddev->safemode_timer.data = (unsigned long) mddev; 4189 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4190 mddev->in_sync = 1; 4191 4192 list_for_each_entry(rdev, &mddev->disks, same_set) 4193 if (rdev->raid_disk >= 0) { 4194 char nm[20]; 4195 sprintf(nm, "rd%d", rdev->raid_disk); 4196 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4197 printk("md: cannot register %s for %s\n", 4198 nm, mdname(mddev)); 4199 } 4200 4201 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4202 4203 if (mddev->flags) 4204 md_update_sb(mddev, 0); 4205 4206 set_capacity(disk, mddev->array_sectors); 4207 4208 /* If there is a partially-recovered drive we need to 4209 * start recovery here. 
If we leave it to md_check_recovery, 4210 * it will remove the drives and not do the right thing 4211 */ 4212 if (mddev->degraded && !mddev->sync_thread) { 4213 int spares = 0; 4214 list_for_each_entry(rdev, &mddev->disks, same_set) 4215 if (rdev->raid_disk >= 0 && 4216 !test_bit(In_sync, &rdev->flags) && 4217 !test_bit(Faulty, &rdev->flags)) 4218 /* complete an interrupted recovery */ 4219 spares++; 4220 if (spares && mddev->pers->sync_request) { 4221 mddev->recovery = 0; 4222 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4223 mddev->sync_thread = md_register_thread(md_do_sync, 4224 mddev, 4225 "%s_resync"); 4226 if (!mddev->sync_thread) { 4227 printk(KERN_ERR "%s: could not start resync" 4228 " thread...\n", 4229 mdname(mddev)); 4230 /* leave the spares where they are, it shouldn't hurt */ 4231 mddev->recovery = 0; 4232 } 4233 } 4234 } 4235 md_wakeup_thread(mddev->thread); 4236 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4237 4238 mddev->changed = 1; 4239 md_new_event(mddev); 4240 sysfs_notify_dirent(mddev->sysfs_state); 4241 if (mddev->sysfs_action) 4242 sysfs_notify_dirent(mddev->sysfs_action); 4243 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4244 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4245 return 0; 4246 } 4247 4248 static int restart_array(mddev_t *mddev) 4249 { 4250 struct gendisk *disk = mddev->gendisk; 4251 4252 /* Complain if it has no devices */ 4253 if (list_empty(&mddev->disks)) 4254 return -ENXIO; 4255 if (!mddev->pers) 4256 return -EINVAL; 4257 if (!mddev->ro) 4258 return -EBUSY; 4259 mddev->safemode = 0; 4260 mddev->ro = 0; 4261 set_disk_ro(disk, 0); 4262 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4263 mdname(mddev)); 4264 /* Kick recovery or resync if necessary */ 4265 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4266 md_wakeup_thread(mddev->thread); 4267 md_wakeup_thread(mddev->sync_thread); 4268 sysfs_notify_dirent(mddev->sysfs_state); 4269 return 0; 4270 } 4271 4272 /* similar to deny_write_access, but accounts for our holding a reference 4273 * to the file ourselves */ 4274 static int deny_bitmap_write_access(struct file * file) 4275 { 4276 struct inode *inode = file->f_mapping->host; 4277 4278 spin_lock(&inode->i_lock); 4279 if (atomic_read(&inode->i_writecount) > 1) { 4280 spin_unlock(&inode->i_lock); 4281 return -ETXTBSY; 4282 } 4283 atomic_set(&inode->i_writecount, -1); 4284 spin_unlock(&inode->i_lock); 4285 4286 return 0; 4287 } 4288 4289 static void restore_bitmap_write_access(struct file *file) 4290 { 4291 struct inode *inode = file->f_mapping->host; 4292 4293 spin_lock(&inode->i_lock); 4294 atomic_set(&inode->i_writecount, 1); 4295 spin_unlock(&inode->i_lock); 4296 } 4297 4298 /* mode: 4299 * 0 - completely stop and dis-assemble array 4300 * 1 - switch to readonly 4301 * 2 - stop but do not disassemble array 4302 */ 4303 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4304 { 4305 int err = 0; 4306 struct gendisk *disk = mddev->gendisk; 4307 mdk_rdev_t *rdev; 4308 4309 if (atomic_read(&mddev->openers) > is_open) { 4310 printk("md: %s still in use.\n",mdname(mddev)); 4311 return -EBUSY; 4312 } 4313 4314 if (mddev->pers) { 4315 4316 if (mddev->sync_thread) { 4317 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4318 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4319 md_unregister_thread(mddev->sync_thread); 4320 mddev->sync_thread = NULL; 4321 } 4322 4323 del_timer_sync(&mddev->safemode_timer); 4324 4325 switch(mode) { 4326 case 1: /* readonly */ 4327 err = -ENXIO; 4328 if 
(mddev->ro==1) 4329 goto out; 4330 mddev->ro = 1; 4331 break; 4332 case 0: /* disassemble */ 4333 case 2: /* stop */ 4334 bitmap_flush(mddev); 4335 md_super_wait(mddev); 4336 if (mddev->ro) 4337 set_disk_ro(disk, 0); 4338 4339 mddev->pers->stop(mddev); 4340 mddev->queue->merge_bvec_fn = NULL; 4341 mddev->queue->unplug_fn = NULL; 4342 mddev->queue->backing_dev_info.congested_fn = NULL; 4343 module_put(mddev->pers->owner); 4344 if (mddev->pers->sync_request) 4345 mddev->private = &md_redundancy_group; 4346 mddev->pers = NULL; 4347 /* tell userspace to handle 'inactive' */ 4348 sysfs_notify_dirent(mddev->sysfs_state); 4349 4350 list_for_each_entry(rdev, &mddev->disks, same_set) 4351 if (rdev->raid_disk >= 0) { 4352 char nm[20]; 4353 sprintf(nm, "rd%d", rdev->raid_disk); 4354 sysfs_remove_link(&mddev->kobj, nm); 4355 } 4356 4357 set_capacity(disk, 0); 4358 mddev->changed = 1; 4359 4360 if (mddev->ro) 4361 mddev->ro = 0; 4362 } 4363 if (!mddev->in_sync || mddev->flags) { 4364 /* mark array as shutdown cleanly */ 4365 mddev->in_sync = 1; 4366 md_update_sb(mddev, 1); 4367 } 4368 if (mode == 1) 4369 set_disk_ro(disk, 1); 4370 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4371 } 4372 4373 /* 4374 * Free resources if final stop 4375 */ 4376 if (mode == 0) { 4377 4378 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4379 4380 bitmap_destroy(mddev); 4381 if (mddev->bitmap_file) { 4382 restore_bitmap_write_access(mddev->bitmap_file); 4383 fput(mddev->bitmap_file); 4384 mddev->bitmap_file = NULL; 4385 } 4386 mddev->bitmap_offset = 0; 4387 4388 /* make sure all md_delayed_delete calls have finished */ 4389 flush_scheduled_work(); 4390 4391 export_array(mddev); 4392 4393 mddev->array_sectors = 0; 4394 mddev->external_size = 0; 4395 mddev->dev_sectors = 0; 4396 mddev->raid_disks = 0; 4397 mddev->recovery_cp = 0; 4398 mddev->resync_min = 0; 4399 mddev->resync_max = MaxSector; 4400 mddev->reshape_position = MaxSector; 4401 mddev->external = 0; 4402 mddev->persistent = 0; 4403 mddev->level = LEVEL_NONE; 4404 mddev->clevel[0] = 0; 4405 mddev->flags = 0; 4406 mddev->ro = 0; 4407 mddev->metadata_type[0] = 0; 4408 mddev->chunk_size = 0; 4409 mddev->ctime = mddev->utime = 0; 4410 mddev->layout = 0; 4411 mddev->max_disks = 0; 4412 mddev->events = 0; 4413 mddev->delta_disks = 0; 4414 mddev->new_level = LEVEL_NONE; 4415 mddev->new_layout = 0; 4416 mddev->new_chunk = 0; 4417 mddev->curr_resync = 0; 4418 mddev->resync_mismatches = 0; 4419 mddev->suspend_lo = mddev->suspend_hi = 0; 4420 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4421 mddev->recovery = 0; 4422 mddev->in_sync = 0; 4423 mddev->changed = 0; 4424 mddev->degraded = 0; 4425 mddev->barriers_work = 0; 4426 mddev->safemode = 0; 4427 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4428 if (mddev->hold_active == UNTIL_STOP) 4429 mddev->hold_active = 0; 4430 4431 } else if (mddev->pers) 4432 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4433 mdname(mddev)); 4434 err = 0; 4435 blk_integrity_unregister(disk); 4436 md_new_event(mddev); 4437 sysfs_notify_dirent(mddev->sysfs_state); 4438 out: 4439 return err; 4440 } 4441 4442 #ifndef MODULE 4443 static void autorun_array(mddev_t *mddev) 4444 { 4445 mdk_rdev_t *rdev; 4446 int err; 4447 4448 if (list_empty(&mddev->disks)) 4449 return; 4450 4451 printk(KERN_INFO "md: running: "); 4452 4453 list_for_each_entry(rdev, &mddev->disks, same_set) { 4454 char b[BDEVNAME_SIZE]; 4455 printk("<%s>", bdevname(rdev->bdev,b)); 4456 } 4457 printk("\n"); 4458 4459 err = do_md_run(mddev); 4460 if 
(err) { 4461 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4462 do_md_stop(mddev, 0, 0); 4463 } 4464 } 4465 4466 /* 4467 * lets try to run arrays based on all disks that have arrived 4468 * until now. (those are in pending_raid_disks) 4469 * 4470 * the method: pick the first pending disk, collect all disks with 4471 * the same UUID, remove all from the pending list and put them into 4472 * the 'same_array' list. Then order this list based on superblock 4473 * update time (freshest comes first), kick out 'old' disks and 4474 * compare superblocks. If everything's fine then run it. 4475 * 4476 * If "unit" is allocated, then bump its reference count 4477 */ 4478 static void autorun_devices(int part) 4479 { 4480 mdk_rdev_t *rdev0, *rdev, *tmp; 4481 mddev_t *mddev; 4482 char b[BDEVNAME_SIZE]; 4483 4484 printk(KERN_INFO "md: autorun ...\n"); 4485 while (!list_empty(&pending_raid_disks)) { 4486 int unit; 4487 dev_t dev; 4488 LIST_HEAD(candidates); 4489 rdev0 = list_entry(pending_raid_disks.next, 4490 mdk_rdev_t, same_set); 4491 4492 printk(KERN_INFO "md: considering %s ...\n", 4493 bdevname(rdev0->bdev,b)); 4494 INIT_LIST_HEAD(&candidates); 4495 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4496 if (super_90_load(rdev, rdev0, 0) >= 0) { 4497 printk(KERN_INFO "md: adding %s ...\n", 4498 bdevname(rdev->bdev,b)); 4499 list_move(&rdev->same_set, &candidates); 4500 } 4501 /* 4502 * now we have a set of devices, with all of them having 4503 * mostly sane superblocks. It's time to allocate the 4504 * mddev. 4505 */ 4506 if (part) { 4507 dev = MKDEV(mdp_major, 4508 rdev0->preferred_minor << MdpMinorShift); 4509 unit = MINOR(dev) >> MdpMinorShift; 4510 } else { 4511 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4512 unit = MINOR(dev); 4513 } 4514 if (rdev0->preferred_minor != unit) { 4515 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4516 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4517 break; 4518 } 4519 4520 md_probe(dev, NULL, NULL); 4521 mddev = mddev_find(dev); 4522 if (!mddev || !mddev->gendisk) { 4523 if (mddev) 4524 mddev_put(mddev); 4525 printk(KERN_ERR 4526 "md: cannot allocate memory for md drive.\n"); 4527 break; 4528 } 4529 if (mddev_lock(mddev)) 4530 printk(KERN_WARNING "md: %s locked, cannot run\n", 4531 mdname(mddev)); 4532 else if (mddev->raid_disks || mddev->major_version 4533 || !list_empty(&mddev->disks)) { 4534 printk(KERN_WARNING 4535 "md: %s already running, cannot run %s\n", 4536 mdname(mddev), bdevname(rdev0->bdev,b)); 4537 mddev_unlock(mddev); 4538 } else { 4539 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4540 mddev->persistent = 1; 4541 rdev_for_each_list(rdev, tmp, &candidates) { 4542 list_del_init(&rdev->same_set); 4543 if (bind_rdev_to_array(rdev, mddev)) 4544 export_rdev(rdev); 4545 } 4546 autorun_array(mddev); 4547 mddev_unlock(mddev); 4548 } 4549 /* on success, candidates will be empty, on error 4550 * it won't... 4551 */ 4552 rdev_for_each_list(rdev, tmp, &candidates) { 4553 list_del_init(&rdev->same_set); 4554 export_rdev(rdev); 4555 } 4556 mddev_put(mddev); 4557 } 4558 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4559 } 4560 #endif /* !MODULE */ 4561 4562 static int get_version(void __user * arg) 4563 { 4564 mdu_version_t ver; 4565 4566 ver.major = MD_MAJOR_VERSION; 4567 ver.minor = MD_MINOR_VERSION; 4568 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4569 4570 if (copy_to_user(arg, &ver, sizeof(ver))) 4571 return -EFAULT; 4572 4573 return 0; 4574 } 4575 4576 static int get_array_info(mddev_t * mddev, void __user * arg) 4577 { 4578 mdu_array_info_t info; 4579 int nr,working,active,failed,spare; 4580 mdk_rdev_t *rdev; 4581 4582 nr=working=active=failed=spare=0; 4583 list_for_each_entry(rdev, &mddev->disks, same_set) { 4584 nr++; 4585 if (test_bit(Faulty, &rdev->flags)) 4586 failed++; 4587 else { 4588 working++; 4589 if (test_bit(In_sync, &rdev->flags)) 4590 active++; 4591 else 4592 spare++; 4593 } 4594 } 4595 4596 info.major_version = mddev->major_version; 4597 info.minor_version = mddev->minor_version; 4598 info.patch_version = MD_PATCHLEVEL_VERSION; 4599 info.ctime = mddev->ctime; 4600 info.level = mddev->level; 4601 info.size = mddev->dev_sectors / 2; 4602 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4603 info.size = -1; 4604 info.nr_disks = nr; 4605 info.raid_disks = mddev->raid_disks; 4606 info.md_minor = mddev->md_minor; 4607 info.not_persistent= !mddev->persistent; 4608 4609 info.utime = mddev->utime; 4610 info.state = 0; 4611 if (mddev->in_sync) 4612 info.state = (1<<MD_SB_CLEAN); 4613 if (mddev->bitmap && mddev->bitmap_offset) 4614 info.state = (1<<MD_SB_BITMAP_PRESENT); 4615 info.active_disks = active; 4616 info.working_disks = working; 4617 info.failed_disks = failed; 4618 info.spare_disks = spare; 4619 4620 info.layout = mddev->layout; 4621 info.chunk_size = mddev->chunk_size; 4622 4623 if (copy_to_user(arg, &info, sizeof(info))) 4624 return -EFAULT; 4625 4626 return 0; 4627 } 4628 4629 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4630 { 4631 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4632 char *ptr, *buf = NULL; 4633 int err = -ENOMEM; 4634 4635 if (md_allow_write(mddev)) 4636 file = kmalloc(sizeof(*file), GFP_NOIO); 4637 else 4638 file = kmalloc(sizeof(*file), GFP_KERNEL); 4639 4640 if (!file) 4641 goto out; 4642 4643 /* bitmap disabled, zero the first byte and copy out */ 4644 if (!mddev->bitmap || !mddev->bitmap->file) { 4645 file->pathname[0] = '\0'; 4646 goto copy_out; 4647 } 4648 4649 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4650 if (!buf) 4651 goto out; 4652 4653 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4654 if (IS_ERR(ptr)) 4655 goto out; 4656 4657 strcpy(file->pathname, ptr); 4658 4659 copy_out: 4660 err = 0; 4661 if (copy_to_user(arg, file, sizeof(*file))) 4662 err = -EFAULT; 4663 out: 4664 kfree(buf); 4665 kfree(file); 4666 return err; 4667 } 4668 4669 static int get_disk_info(mddev_t * mddev, void __user * arg) 4670 { 4671 mdu_disk_info_t info; 4672 mdk_rdev_t *rdev; 4673 4674 if (copy_from_user(&info, arg, sizeof(info))) 4675 return -EFAULT; 4676 4677 rdev = find_rdev_nr(mddev, info.number); 4678 if (rdev) { 4679 info.major = MAJOR(rdev->bdev->bd_dev); 4680 info.minor = MINOR(rdev->bdev->bd_dev); 4681 info.raid_disk = rdev->raid_disk; 4682 info.state = 0; 4683 if (test_bit(Faulty, &rdev->flags)) 4684 info.state |= (1<<MD_DISK_FAULTY); 4685 else if (test_bit(In_sync, &rdev->flags)) { 4686 info.state |= (1<<MD_DISK_ACTIVE); 4687 info.state |= (1<<MD_DISK_SYNC); 4688 } 4689 if (test_bit(WriteMostly, &rdev->flags)) 4690 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4691 } else { 
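/* no rdev with this number: report the slot as removed */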
4692 info.major = info.minor = 0; 4693 info.raid_disk = -1; 4694 info.state = (1<<MD_DISK_REMOVED); 4695 } 4696 4697 if (copy_to_user(arg, &info, sizeof(info))) 4698 return -EFAULT; 4699 4700 return 0; 4701 } 4702 4703 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4704 { 4705 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4706 mdk_rdev_t *rdev; 4707 dev_t dev = MKDEV(info->major,info->minor); 4708 4709 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4710 return -EOVERFLOW; 4711 4712 if (!mddev->raid_disks) { 4713 int err; 4714 /* expecting a device which has a superblock */ 4715 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4716 if (IS_ERR(rdev)) { 4717 printk(KERN_WARNING 4718 "md: md_import_device returned %ld\n", 4719 PTR_ERR(rdev)); 4720 return PTR_ERR(rdev); 4721 } 4722 if (!list_empty(&mddev->disks)) { 4723 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4724 mdk_rdev_t, same_set); 4725 int err = super_types[mddev->major_version] 4726 .load_super(rdev, rdev0, mddev->minor_version); 4727 if (err < 0) { 4728 printk(KERN_WARNING 4729 "md: %s has different UUID to %s\n", 4730 bdevname(rdev->bdev,b), 4731 bdevname(rdev0->bdev,b2)); 4732 export_rdev(rdev); 4733 return -EINVAL; 4734 } 4735 } 4736 err = bind_rdev_to_array(rdev, mddev); 4737 if (err) 4738 export_rdev(rdev); 4739 return err; 4740 } 4741 4742 /* 4743 * add_new_disk can be used once the array is assembled 4744 * to add "hot spares". They must already have a superblock 4745 * written 4746 */ 4747 if (mddev->pers) { 4748 int err; 4749 if (!mddev->pers->hot_add_disk) { 4750 printk(KERN_WARNING 4751 "%s: personality does not support diskops!\n", 4752 mdname(mddev)); 4753 return -EINVAL; 4754 } 4755 if (mddev->persistent) 4756 rdev = md_import_device(dev, mddev->major_version, 4757 mddev->minor_version); 4758 else 4759 rdev = md_import_device(dev, -1, -1); 4760 if (IS_ERR(rdev)) { 4761 printk(KERN_WARNING 4762 "md: md_import_device returned %ld\n", 4763 PTR_ERR(rdev)); 4764 return PTR_ERR(rdev); 4765 } 4766 /* set save_raid_disk if appropriate */ 4767 if (!mddev->persistent) { 4768 if (info->state & (1<<MD_DISK_SYNC) && 4769 info->raid_disk < mddev->raid_disks) 4770 rdev->raid_disk = info->raid_disk; 4771 else 4772 rdev->raid_disk = -1; 4773 } else 4774 super_types[mddev->major_version]. 4775 validate_super(mddev, rdev); 4776 rdev->saved_raid_disk = rdev->raid_disk; 4777 4778 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4779 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4780 set_bit(WriteMostly, &rdev->flags); 4781 else 4782 clear_bit(WriteMostly, &rdev->flags); 4783 4784 rdev->raid_disk = -1; 4785 err = bind_rdev_to_array(rdev, mddev); 4786 if (!err && !mddev->pers->hot_remove_disk) { 4787 /* If there is hot_add_disk but no hot_remove_disk 4788 * then added disks for geometry changes, 4789 * and should be added immediately. 4790 */ 4791 super_types[mddev->major_version]. 
4792 validate_super(mddev, rdev); 4793 err = mddev->pers->hot_add_disk(mddev, rdev); 4794 if (err) 4795 unbind_rdev_from_array(rdev); 4796 } 4797 if (err) 4798 export_rdev(rdev); 4799 else 4800 sysfs_notify_dirent(rdev->sysfs_state); 4801 4802 md_update_sb(mddev, 1); 4803 if (mddev->degraded) 4804 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4805 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4806 md_wakeup_thread(mddev->thread); 4807 return err; 4808 } 4809 4810 /* otherwise, add_new_disk is only allowed 4811 * for major_version==0 superblocks 4812 */ 4813 if (mddev->major_version != 0) { 4814 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4815 mdname(mddev)); 4816 return -EINVAL; 4817 } 4818 4819 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4820 int err; 4821 rdev = md_import_device(dev, -1, 0); 4822 if (IS_ERR(rdev)) { 4823 printk(KERN_WARNING 4824 "md: error, md_import_device() returned %ld\n", 4825 PTR_ERR(rdev)); 4826 return PTR_ERR(rdev); 4827 } 4828 rdev->desc_nr = info->number; 4829 if (info->raid_disk < mddev->raid_disks) 4830 rdev->raid_disk = info->raid_disk; 4831 else 4832 rdev->raid_disk = -1; 4833 4834 if (rdev->raid_disk < mddev->raid_disks) 4835 if (info->state & (1<<MD_DISK_SYNC)) 4836 set_bit(In_sync, &rdev->flags); 4837 4838 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4839 set_bit(WriteMostly, &rdev->flags); 4840 4841 if (!mddev->persistent) { 4842 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4843 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4844 } else 4845 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4846 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); 4847 4848 err = bind_rdev_to_array(rdev, mddev); 4849 if (err) { 4850 export_rdev(rdev); 4851 return err; 4852 } 4853 } 4854 4855 return 0; 4856 } 4857 4858 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4859 { 4860 char b[BDEVNAME_SIZE]; 4861 mdk_rdev_t *rdev; 4862 4863 rdev = find_rdev(mddev, dev); 4864 if (!rdev) 4865 return -ENXIO; 4866 4867 if (rdev->raid_disk >= 0) 4868 goto busy; 4869 4870 kick_rdev_from_array(rdev); 4871 md_update_sb(mddev, 1); 4872 md_new_event(mddev); 4873 4874 return 0; 4875 busy: 4876 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4877 bdevname(rdev->bdev,b), mdname(mddev)); 4878 return -EBUSY; 4879 } 4880 4881 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4882 { 4883 char b[BDEVNAME_SIZE]; 4884 int err; 4885 mdk_rdev_t *rdev; 4886 4887 if (!mddev->pers) 4888 return -ENODEV; 4889 4890 if (mddev->major_version != 0) { 4891 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4892 " version-0 superblocks.\n", 4893 mdname(mddev)); 4894 return -EINVAL; 4895 } 4896 if (!mddev->pers->hot_add_disk) { 4897 printk(KERN_WARNING 4898 "%s: personality does not support diskops!\n", 4899 mdname(mddev)); 4900 return -EINVAL; 4901 } 4902 4903 rdev = md_import_device(dev, -1, 0); 4904 if (IS_ERR(rdev)) { 4905 printk(KERN_WARNING 4906 "md: error, md_import_device() returned %ld\n", 4907 PTR_ERR(rdev)); 4908 return -EINVAL; 4909 } 4910 4911 if (mddev->persistent) 4912 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4913 else 4914 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4915 4916 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); 4917 4918 if (test_bit(Faulty, &rdev->flags)) { 4919 printk(KERN_WARNING 4920 "md: can not hot-add faulty %s disk to %s!\n", 4921 bdevname(rdev->bdev,b), mdname(mddev)); 4922 err = -EINVAL; 4923 goto abort_export; 4924 } 4925 clear_bit(In_sync, &rdev->flags); 4926 rdev->desc_nr = 
-1;
4927 rdev->saved_raid_disk = -1;
4928 err = bind_rdev_to_array(rdev, mddev);
4929 if (err)
4930 goto abort_export;
4931
4932 /*
4933 * The rest should better be atomic, we can have disk failures
4934 * noticed in interrupt contexts ...
4935 */
4936
4937 rdev->raid_disk = -1;
4938
4939 md_update_sb(mddev, 1);
4940
4941 /*
4942 * Kick recovery, maybe this spare has to be added to the
4943 * array immediately.
4944 */
4945 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4946 md_wakeup_thread(mddev->thread);
4947 md_new_event(mddev);
4948 return 0;
4949
4950 abort_export:
4951 export_rdev(rdev);
4952 return err;
4953 }
4954
4955 static int set_bitmap_file(mddev_t *mddev, int fd)
4956 {
4957 int err;
4958
4959 if (mddev->pers) {
4960 if (!mddev->pers->quiesce)
4961 return -EBUSY;
4962 if (mddev->recovery || mddev->sync_thread)
4963 return -EBUSY;
4964 /* we should be able to change the bitmap.. */
4965 }
4966
4967
4968 if (fd >= 0) {
4969 if (mddev->bitmap)
4970 return -EEXIST; /* cannot add when bitmap is present */
4971 mddev->bitmap_file = fget(fd);
4972
4973 if (mddev->bitmap_file == NULL) {
4974 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4975 mdname(mddev));
4976 return -EBADF;
4977 }
4978
4979 err = deny_bitmap_write_access(mddev->bitmap_file);
4980 if (err) {
4981 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4982 mdname(mddev));
4983 fput(mddev->bitmap_file);
4984 mddev->bitmap_file = NULL;
4985 return err;
4986 }
4987 mddev->bitmap_offset = 0; /* file overrides offset */
4988 } else if (mddev->bitmap == NULL)
4989 return -ENOENT; /* cannot remove what isn't there */
4990 err = 0;
4991 if (mddev->pers) {
4992 mddev->pers->quiesce(mddev, 1);
4993 if (fd >= 0)
4994 err = bitmap_create(mddev);
4995 if (fd < 0 || err) {
4996 bitmap_destroy(mddev);
4997 fd = -1; /* make sure to put the file */
4998 }
4999 mddev->pers->quiesce(mddev, 0);
5000 }
5001 if (fd < 0) {
5002 if (mddev->bitmap_file) {
5003 restore_bitmap_write_access(mddev->bitmap_file);
5004 fput(mddev->bitmap_file);
5005 }
5006 mddev->bitmap_file = NULL;
5007 }
5008
5009 return err;
5010 }
5011
5012 /*
5013 * set_array_info is used in two different ways.
5014 * The original usage is when creating a new array.
5015 * In this usage, raid_disks is > 0 and it together with
5016 * level, size, not_persistent, layout, chunksize determine the
5017 * shape of the array.
5018 * This will always create an array with a type-0.90.0 superblock.
5019 * The newer usage is when assembling an array.
5020 * In this case raid_disks will be 0, and the major_version field is
5021 * used to determine which style super-blocks are to be found on the devices.
5022 * The minor and patch _version numbers are also kept in case the
5023 * super_block handler wishes to interpret them.
5024 */
5025 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5026 {
5027
5028 if (info->raid_disks == 0) {
5029 /* just setting version number for superblock loading */
5030 if (info->major_version < 0 ||
5031 info->major_version >= ARRAY_SIZE(super_types) ||
5032 super_types[info->major_version].name == NULL) {
5033 /* maybe try to auto-load a module?
*/ 5034 printk(KERN_INFO 5035 "md: superblock version %d not known\n", 5036 info->major_version); 5037 return -EINVAL; 5038 } 5039 mddev->major_version = info->major_version; 5040 mddev->minor_version = info->minor_version; 5041 mddev->patch_version = info->patch_version; 5042 mddev->persistent = !info->not_persistent; 5043 return 0; 5044 } 5045 mddev->major_version = MD_MAJOR_VERSION; 5046 mddev->minor_version = MD_MINOR_VERSION; 5047 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5048 mddev->ctime = get_seconds(); 5049 5050 mddev->level = info->level; 5051 mddev->clevel[0] = 0; 5052 mddev->dev_sectors = 2 * (sector_t)info->size; 5053 mddev->raid_disks = info->raid_disks; 5054 /* don't set md_minor, it is determined by which /dev/md* was 5055 * openned 5056 */ 5057 if (info->state & (1<<MD_SB_CLEAN)) 5058 mddev->recovery_cp = MaxSector; 5059 else 5060 mddev->recovery_cp = 0; 5061 mddev->persistent = ! info->not_persistent; 5062 mddev->external = 0; 5063 5064 mddev->layout = info->layout; 5065 mddev->chunk_size = info->chunk_size; 5066 5067 mddev->max_disks = MD_SB_DISKS; 5068 5069 if (mddev->persistent) 5070 mddev->flags = 0; 5071 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5072 5073 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5074 mddev->bitmap_offset = 0; 5075 5076 mddev->reshape_position = MaxSector; 5077 5078 /* 5079 * Generate a 128 bit UUID 5080 */ 5081 get_random_bytes(mddev->uuid, 16); 5082 5083 mddev->new_level = mddev->level; 5084 mddev->new_chunk = mddev->chunk_size; 5085 mddev->new_layout = mddev->layout; 5086 mddev->delta_disks = 0; 5087 5088 return 0; 5089 } 5090 5091 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5092 { 5093 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5094 5095 if (mddev->external_size) 5096 return; 5097 5098 mddev->array_sectors = array_sectors; 5099 } 5100 EXPORT_SYMBOL(md_set_array_sectors); 5101 5102 static int update_size(mddev_t *mddev, sector_t num_sectors) 5103 { 5104 mdk_rdev_t *rdev; 5105 int rv; 5106 int fit = (num_sectors == 0); 5107 5108 if (mddev->pers->resize == NULL) 5109 return -EINVAL; 5110 /* The "num_sectors" is the number of sectors of each device that 5111 * is used. This can only make sense for arrays with redundancy. 5112 * linear and raid0 always use whatever space is available. We can only 5113 * consider changing this number if no resync or reconstruction is 5114 * happening, and if the new size is acceptable. It must fit before the 5115 * sb_start or, if that is <data_offset, it must fit before the size 5116 * of each device. If num_sectors is zero, we find the largest size 5117 * that fits. 5118 5119 */ 5120 if (mddev->sync_thread) 5121 return -EBUSY; 5122 if (mddev->bitmap) 5123 /* Sorry, cannot grow a bitmap yet, just remove it, 5124 * grow, and re-add. 
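* (i.e. drop the bitmap first, either with SET_BITMAP_FILE and fd == -1
* or by clearing MD_SB_BITMAP_PRESENT via update_array_info(), then
* resize the array, then create the bitmap again)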
5125 */ 5126 return -EBUSY; 5127 list_for_each_entry(rdev, &mddev->disks, same_set) { 5128 sector_t avail = rdev->sectors; 5129 5130 if (fit && (num_sectors == 0 || num_sectors > avail)) 5131 num_sectors = avail; 5132 if (avail < num_sectors) 5133 return -ENOSPC; 5134 } 5135 rv = mddev->pers->resize(mddev, num_sectors); 5136 if (!rv) { 5137 struct block_device *bdev; 5138 5139 bdev = bdget_disk(mddev->gendisk, 0); 5140 if (bdev) { 5141 mutex_lock(&bdev->bd_inode->i_mutex); 5142 i_size_write(bdev->bd_inode, 5143 (loff_t)mddev->array_sectors << 9); 5144 mutex_unlock(&bdev->bd_inode->i_mutex); 5145 bdput(bdev); 5146 } 5147 } 5148 return rv; 5149 } 5150 5151 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5152 { 5153 int rv; 5154 /* change the number of raid disks */ 5155 if (mddev->pers->check_reshape == NULL) 5156 return -EINVAL; 5157 if (raid_disks <= 0 || 5158 raid_disks >= mddev->max_disks) 5159 return -EINVAL; 5160 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5161 return -EBUSY; 5162 mddev->delta_disks = raid_disks - mddev->raid_disks; 5163 5164 rv = mddev->pers->check_reshape(mddev); 5165 return rv; 5166 } 5167 5168 5169 /* 5170 * update_array_info is used to change the configuration of an 5171 * on-line array. 5172 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5173 * fields in the info are checked against the array. 5174 * Any differences that cannot be handled will cause an error. 5175 * Normally, only one change can be managed at a time. 5176 */ 5177 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5178 { 5179 int rv = 0; 5180 int cnt = 0; 5181 int state = 0; 5182 5183 /* calculate expected state,ignoring low bits */ 5184 if (mddev->bitmap && mddev->bitmap_offset) 5185 state |= (1 << MD_SB_BITMAP_PRESENT); 5186 5187 if (mddev->major_version != info->major_version || 5188 mddev->minor_version != info->minor_version || 5189 /* mddev->patch_version != info->patch_version || */ 5190 mddev->ctime != info->ctime || 5191 mddev->level != info->level || 5192 /* mddev->layout != info->layout || */ 5193 !mddev->persistent != info->not_persistent|| 5194 mddev->chunk_size != info->chunk_size || 5195 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5196 ((state^info->state) & 0xfffffe00) 5197 ) 5198 return -EINVAL; 5199 /* Check there is only one change */ 5200 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5201 cnt++; 5202 if (mddev->raid_disks != info->raid_disks) 5203 cnt++; 5204 if (mddev->layout != info->layout) 5205 cnt++; 5206 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5207 cnt++; 5208 if (cnt == 0) 5209 return 0; 5210 if (cnt > 1) 5211 return -EINVAL; 5212 5213 if (mddev->layout != info->layout) { 5214 /* Change layout 5215 * we don't need to do anything at the md level, the 5216 * personality will take care of it all. 
5217 */ 5218 if (mddev->pers->reconfig == NULL) 5219 return -EINVAL; 5220 else 5221 return mddev->pers->reconfig(mddev, info->layout, -1); 5222 } 5223 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5224 rv = update_size(mddev, (sector_t)info->size * 2); 5225 5226 if (mddev->raid_disks != info->raid_disks) 5227 rv = update_raid_disks(mddev, info->raid_disks); 5228 5229 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5230 if (mddev->pers->quiesce == NULL) 5231 return -EINVAL; 5232 if (mddev->recovery || mddev->sync_thread) 5233 return -EBUSY; 5234 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5235 /* add the bitmap */ 5236 if (mddev->bitmap) 5237 return -EEXIST; 5238 if (mddev->default_bitmap_offset == 0) 5239 return -EINVAL; 5240 mddev->bitmap_offset = mddev->default_bitmap_offset; 5241 mddev->pers->quiesce(mddev, 1); 5242 rv = bitmap_create(mddev); 5243 if (rv) 5244 bitmap_destroy(mddev); 5245 mddev->pers->quiesce(mddev, 0); 5246 } else { 5247 /* remove the bitmap */ 5248 if (!mddev->bitmap) 5249 return -ENOENT; 5250 if (mddev->bitmap->file) 5251 return -EINVAL; 5252 mddev->pers->quiesce(mddev, 1); 5253 bitmap_destroy(mddev); 5254 mddev->pers->quiesce(mddev, 0); 5255 mddev->bitmap_offset = 0; 5256 } 5257 } 5258 md_update_sb(mddev, 1); 5259 return rv; 5260 } 5261 5262 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5263 { 5264 mdk_rdev_t *rdev; 5265 5266 if (mddev->pers == NULL) 5267 return -ENODEV; 5268 5269 rdev = find_rdev(mddev, dev); 5270 if (!rdev) 5271 return -ENODEV; 5272 5273 md_error(mddev, rdev); 5274 return 0; 5275 } 5276 5277 /* 5278 * We have a problem here : there is no easy way to give a CHS 5279 * virtual geometry. We currently pretend that we have a 2 heads 5280 * 4 sectors (with a BIG number of cylinders...). This drives 5281 * dosfs just mad... ;-) 5282 */ 5283 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5284 { 5285 mddev_t *mddev = bdev->bd_disk->private_data; 5286 5287 geo->heads = 2; 5288 geo->sectors = 4; 5289 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5290 return 0; 5291 } 5292 5293 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5294 unsigned int cmd, unsigned long arg) 5295 { 5296 int err = 0; 5297 void __user *argp = (void __user *)arg; 5298 mddev_t *mddev = NULL; 5299 5300 if (!capable(CAP_SYS_ADMIN)) 5301 return -EACCES; 5302 5303 /* 5304 * Commands dealing with the RAID driver but not any 5305 * particular array: 5306 */ 5307 switch (cmd) 5308 { 5309 case RAID_VERSION: 5310 err = get_version(argp); 5311 goto done; 5312 5313 case PRINT_RAID_DEBUG: 5314 err = 0; 5315 md_print_devices(); 5316 goto done; 5317 5318 #ifndef MODULE 5319 case RAID_AUTORUN: 5320 err = 0; 5321 autostart_arrays(arg); 5322 goto done; 5323 #endif 5324 default:; 5325 } 5326 5327 /* 5328 * Commands creating/starting a new array: 5329 */ 5330 5331 mddev = bdev->bd_disk->private_data; 5332 5333 if (!mddev) { 5334 BUG(); 5335 goto abort; 5336 } 5337 5338 err = mddev_lock(mddev); 5339 if (err) { 5340 printk(KERN_INFO 5341 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5342 err, cmd); 5343 goto abort; 5344 } 5345 5346 switch (cmd) 5347 { 5348 case SET_ARRAY_INFO: 5349 { 5350 mdu_array_info_t info; 5351 if (!arg) 5352 memset(&info, 0, sizeof(info)); 5353 else if (copy_from_user(&info, argp, sizeof(info))) { 5354 err = -EFAULT; 5355 goto abort_unlock; 5356 } 5357 if (mddev->pers) { 5358 err = update_array_info(mddev, &info); 5359 if (err) { 5360 printk(KERN_WARNING "md: couldn't update" 5361 " array info. 
%d\n", err); 5362 goto abort_unlock; 5363 } 5364 goto done_unlock; 5365 } 5366 if (!list_empty(&mddev->disks)) { 5367 printk(KERN_WARNING 5368 "md: array %s already has disks!\n", 5369 mdname(mddev)); 5370 err = -EBUSY; 5371 goto abort_unlock; 5372 } 5373 if (mddev->raid_disks) { 5374 printk(KERN_WARNING 5375 "md: array %s already initialised!\n", 5376 mdname(mddev)); 5377 err = -EBUSY; 5378 goto abort_unlock; 5379 } 5380 err = set_array_info(mddev, &info); 5381 if (err) { 5382 printk(KERN_WARNING "md: couldn't set" 5383 " array info. %d\n", err); 5384 goto abort_unlock; 5385 } 5386 } 5387 goto done_unlock; 5388 5389 default:; 5390 } 5391 5392 /* 5393 * Commands querying/configuring an existing array: 5394 */ 5395 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5396 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5397 if ((!mddev->raid_disks && !mddev->external) 5398 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5399 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5400 && cmd != GET_BITMAP_FILE) { 5401 err = -ENODEV; 5402 goto abort_unlock; 5403 } 5404 5405 /* 5406 * Commands even a read-only array can execute: 5407 */ 5408 switch (cmd) 5409 { 5410 case GET_ARRAY_INFO: 5411 err = get_array_info(mddev, argp); 5412 goto done_unlock; 5413 5414 case GET_BITMAP_FILE: 5415 err = get_bitmap_file(mddev, argp); 5416 goto done_unlock; 5417 5418 case GET_DISK_INFO: 5419 err = get_disk_info(mddev, argp); 5420 goto done_unlock; 5421 5422 case RESTART_ARRAY_RW: 5423 err = restart_array(mddev); 5424 goto done_unlock; 5425 5426 case STOP_ARRAY: 5427 err = do_md_stop(mddev, 0, 1); 5428 goto done_unlock; 5429 5430 case STOP_ARRAY_RO: 5431 err = do_md_stop(mddev, 1, 1); 5432 goto done_unlock; 5433 5434 } 5435 5436 /* 5437 * The remaining ioctls are changing the state of the 5438 * superblock, so we do not allow them on read-only arrays. 5439 * However non-MD ioctls (e.g. get-size) will still come through 5440 * here and hit the 'default' below, so only disallow 5441 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
5442 */ 5443 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5444 if (mddev->ro == 2) { 5445 mddev->ro = 0; 5446 sysfs_notify_dirent(mddev->sysfs_state); 5447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5448 md_wakeup_thread(mddev->thread); 5449 } else { 5450 err = -EROFS; 5451 goto abort_unlock; 5452 } 5453 } 5454 5455 switch (cmd) 5456 { 5457 case ADD_NEW_DISK: 5458 { 5459 mdu_disk_info_t info; 5460 if (copy_from_user(&info, argp, sizeof(info))) 5461 err = -EFAULT; 5462 else 5463 err = add_new_disk(mddev, &info); 5464 goto done_unlock; 5465 } 5466 5467 case HOT_REMOVE_DISK: 5468 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5469 goto done_unlock; 5470 5471 case HOT_ADD_DISK: 5472 err = hot_add_disk(mddev, new_decode_dev(arg)); 5473 goto done_unlock; 5474 5475 case SET_DISK_FAULTY: 5476 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5477 goto done_unlock; 5478 5479 case RUN_ARRAY: 5480 err = do_md_run(mddev); 5481 goto done_unlock; 5482 5483 case SET_BITMAP_FILE: 5484 err = set_bitmap_file(mddev, (int)arg); 5485 goto done_unlock; 5486 5487 default: 5488 err = -EINVAL; 5489 goto abort_unlock; 5490 } 5491 5492 done_unlock: 5493 abort_unlock: 5494 if (mddev->hold_active == UNTIL_IOCTL && 5495 err != -EINVAL) 5496 mddev->hold_active = 0; 5497 mddev_unlock(mddev); 5498 5499 return err; 5500 done: 5501 if (err) 5502 MD_BUG(); 5503 abort: 5504 return err; 5505 } 5506 5507 static int md_open(struct block_device *bdev, fmode_t mode) 5508 { 5509 /* 5510 * Succeed if we can lock the mddev, which confirms that 5511 * it isn't being stopped right now. 5512 */ 5513 mddev_t *mddev = mddev_find(bdev->bd_dev); 5514 int err; 5515 5516 if (mddev->gendisk != bdev->bd_disk) { 5517 /* we are racing with mddev_put which is discarding this 5518 * bd_disk. 5519 */ 5520 mddev_put(mddev); 5521 /* Wait until bdev->bd_disk is definitely gone */ 5522 flush_scheduled_work(); 5523 /* Then retry the open from the top */ 5524 return -ERESTARTSYS; 5525 } 5526 BUG_ON(mddev != bdev->bd_disk->private_data); 5527 5528 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5529 goto out; 5530 5531 err = 0; 5532 atomic_inc(&mddev->openers); 5533 mddev_unlock(mddev); 5534 5535 check_disk_change(bdev); 5536 out: 5537 return err; 5538 } 5539 5540 static int md_release(struct gendisk *disk, fmode_t mode) 5541 { 5542 mddev_t *mddev = disk->private_data; 5543 5544 BUG_ON(!mddev); 5545 atomic_dec(&mddev->openers); 5546 mddev_put(mddev); 5547 5548 return 0; 5549 } 5550 5551 static int md_media_changed(struct gendisk *disk) 5552 { 5553 mddev_t *mddev = disk->private_data; 5554 5555 return mddev->changed; 5556 } 5557 5558 static int md_revalidate(struct gendisk *disk) 5559 { 5560 mddev_t *mddev = disk->private_data; 5561 5562 mddev->changed = 0; 5563 return 0; 5564 } 5565 static struct block_device_operations md_fops = 5566 { 5567 .owner = THIS_MODULE, 5568 .open = md_open, 5569 .release = md_release, 5570 .ioctl = md_ioctl, 5571 .getgeo = md_getgeo, 5572 .media_changed = md_media_changed, 5573 .revalidate_disk= md_revalidate, 5574 }; 5575 5576 static int md_thread(void * arg) 5577 { 5578 mdk_thread_t *thread = arg; 5579 5580 /* 5581 * md_thread is a 'system-thread', it's priority should be very 5582 * high. We avoid resource deadlocks individually in each 5583 * raid personality. (RAID5 does preallocation) We also use RR and 5584 * the very same RT priority as kswapd, thus we will never get 5585 * into a priority inversion deadlock. 
5586 * 5587 * we definitely have to have equal or higher priority than 5588 * bdflush, otherwise bdflush will deadlock if there are too 5589 * many dirty RAID5 blocks. 5590 */ 5591 5592 allow_signal(SIGKILL); 5593 while (!kthread_should_stop()) { 5594 5595 /* We need to wait INTERRUPTIBLE so that 5596 * we don't add to the load-average. 5597 * That means we need to be sure no signals are 5598 * pending 5599 */ 5600 if (signal_pending(current)) 5601 flush_signals(current); 5602 5603 wait_event_interruptible_timeout 5604 (thread->wqueue, 5605 test_bit(THREAD_WAKEUP, &thread->flags) 5606 || kthread_should_stop(), 5607 thread->timeout); 5608 5609 clear_bit(THREAD_WAKEUP, &thread->flags); 5610 5611 thread->run(thread->mddev); 5612 } 5613 5614 return 0; 5615 } 5616 5617 void md_wakeup_thread(mdk_thread_t *thread) 5618 { 5619 if (thread) { 5620 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5621 set_bit(THREAD_WAKEUP, &thread->flags); 5622 wake_up(&thread->wqueue); 5623 } 5624 } 5625 5626 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5627 const char *name) 5628 { 5629 mdk_thread_t *thread; 5630 5631 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5632 if (!thread) 5633 return NULL; 5634 5635 init_waitqueue_head(&thread->wqueue); 5636 5637 thread->run = run; 5638 thread->mddev = mddev; 5639 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5640 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5641 if (IS_ERR(thread->tsk)) { 5642 kfree(thread); 5643 return NULL; 5644 } 5645 return thread; 5646 } 5647 5648 void md_unregister_thread(mdk_thread_t *thread) 5649 { 5650 if (!thread) 5651 return; 5652 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5653 5654 kthread_stop(thread->tsk); 5655 kfree(thread); 5656 } 5657 5658 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5659 { 5660 if (!mddev) { 5661 MD_BUG(); 5662 return; 5663 } 5664 5665 if (!rdev || test_bit(Faulty, &rdev->flags)) 5666 return; 5667 5668 if (mddev->external) 5669 set_bit(Blocked, &rdev->flags); 5670 /* 5671 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5672 mdname(mddev), 5673 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5674 __builtin_return_address(0),__builtin_return_address(1), 5675 __builtin_return_address(2),__builtin_return_address(3)); 5676 */ 5677 if (!mddev->pers) 5678 return; 5679 if (!mddev->pers->error_handler) 5680 return; 5681 mddev->pers->error_handler(mddev,rdev); 5682 if (mddev->degraded) 5683 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5684 set_bit(StateChanged, &rdev->flags); 5685 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5686 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5687 md_wakeup_thread(mddev->thread); 5688 md_new_event_inintr(mddev); 5689 } 5690 5691 /* seq_file implementation /proc/mdstat */ 5692 5693 static void status_unused(struct seq_file *seq) 5694 { 5695 int i = 0; 5696 mdk_rdev_t *rdev; 5697 5698 seq_printf(seq, "unused devices: "); 5699 5700 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 5701 char b[BDEVNAME_SIZE]; 5702 i++; 5703 seq_printf(seq, "%s ", 5704 bdevname(rdev->bdev,b)); 5705 } 5706 if (!i) 5707 seq_printf(seq, "<none>"); 5708 5709 seq_printf(seq, "\n"); 5710 } 5711 5712 5713 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5714 { 5715 sector_t max_sectors, resync, res; 5716 unsigned long dt, db; 5717 sector_t rt; 5718 int scale; 5719 unsigned int per_milli; 5720 5721 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 5722 5723 
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5724 max_sectors = mddev->resync_max_sectors;
5725 else
5726 max_sectors = mddev->dev_sectors;
5727
5728 /*
5729 * Should not happen.
5730 */
5731 if (!max_sectors) {
5732 MD_BUG();
5733 return;
5734 }
5735 /* Pick 'scale' such that (resync>>scale)*1000 will fit
5736 * in a sector_t, and (max_sectors>>scale) will fit in a
5737 * u32, as those are the requirements for sector_div.
5738 * Thus 'scale' must be at least 10.
5739 */
5740 scale = 10;
5741 if (sizeof(sector_t) > sizeof(unsigned long)) {
5742 while ( max_sectors/2 > (1ULL<<(scale+32)))
5743 scale++;
5744 }
5745 res = (resync>>scale)*1000;
5746 sector_div(res, (u32)((max_sectors>>scale)+1));
5747
5748 per_milli = res;
5749 {
5750 int i, x = per_milli/50, y = 20-x;
5751 seq_printf(seq, "[");
5752 for (i = 0; i < x; i++)
5753 seq_printf(seq, "=");
5754 seq_printf(seq, ">");
5755 for (i = 0; i < y; i++)
5756 seq_printf(seq, ".");
5757 seq_printf(seq, "] ");
5758 }
5759 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5760 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5761 "reshape" :
5762 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5763 "check" :
5764 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5765 "resync" : "recovery"))),
5766 per_milli/10, per_milli % 10,
5767 (unsigned long long) resync/2,
5768 (unsigned long long) max_sectors/2);
5769
5770 /*
5771 * dt: time from mark until now
5772 * db: blocks written from mark until now
5773 * rt: remaining time
5774 *
5775 * rt is a sector_t, so could be 32bit or 64bit.
5776 * So we divide before multiplying in case it is 32bit and close
5777 * to the limit.
5778 * We scale the divisor (db) by 32 to avoid losing precision
5779 * near the end of resync when the number of remaining sectors
5780 * is close to 'db'.
5781 * We then divide rt by 32 after multiplying by dt to compensate.
5782 * The '+1' avoids division by zero if db is very small.
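*
* Worked example with illustrative (not measured) numbers: if
* dt = 30 seconds and db = 61440 sectors were synced in that window,
* and 1000000 sectors remain, then rt = 1000000/(61440/32+1)*30 >> 5
* is roughly 487 seconds, close to remaining/(db/dt) = 1000000/2048.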
5783 */ 5784 dt = ((jiffies - mddev->resync_mark) / HZ); 5785 if (!dt) dt++; 5786 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5787 - mddev->resync_mark_cnt; 5788 5789 rt = max_sectors - resync; /* number of remaining sectors */ 5790 sector_div(rt, db/32+1); 5791 rt *= dt; 5792 rt >>= 5; 5793 5794 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 5795 ((unsigned long)rt % 60)/6); 5796 5797 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5798 } 5799 5800 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5801 { 5802 struct list_head *tmp; 5803 loff_t l = *pos; 5804 mddev_t *mddev; 5805 5806 if (l >= 0x10000) 5807 return NULL; 5808 if (!l--) 5809 /* header */ 5810 return (void*)1; 5811 5812 spin_lock(&all_mddevs_lock); 5813 list_for_each(tmp,&all_mddevs) 5814 if (!l--) { 5815 mddev = list_entry(tmp, mddev_t, all_mddevs); 5816 mddev_get(mddev); 5817 spin_unlock(&all_mddevs_lock); 5818 return mddev; 5819 } 5820 spin_unlock(&all_mddevs_lock); 5821 if (!l--) 5822 return (void*)2;/* tail */ 5823 return NULL; 5824 } 5825 5826 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5827 { 5828 struct list_head *tmp; 5829 mddev_t *next_mddev, *mddev = v; 5830 5831 ++*pos; 5832 if (v == (void*)2) 5833 return NULL; 5834 5835 spin_lock(&all_mddevs_lock); 5836 if (v == (void*)1) 5837 tmp = all_mddevs.next; 5838 else 5839 tmp = mddev->all_mddevs.next; 5840 if (tmp != &all_mddevs) 5841 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5842 else { 5843 next_mddev = (void*)2; 5844 *pos = 0x10000; 5845 } 5846 spin_unlock(&all_mddevs_lock); 5847 5848 if (v != (void*)1) 5849 mddev_put(mddev); 5850 return next_mddev; 5851 5852 } 5853 5854 static void md_seq_stop(struct seq_file *seq, void *v) 5855 { 5856 mddev_t *mddev = v; 5857 5858 if (mddev && v != (void*)1 && v != (void*)2) 5859 mddev_put(mddev); 5860 } 5861 5862 struct mdstat_info { 5863 int event; 5864 }; 5865 5866 static int md_seq_show(struct seq_file *seq, void *v) 5867 { 5868 mddev_t *mddev = v; 5869 sector_t sectors; 5870 mdk_rdev_t *rdev; 5871 struct mdstat_info *mi = seq->private; 5872 struct bitmap *bitmap; 5873 5874 if (v == (void*)1) { 5875 struct mdk_personality *pers; 5876 seq_printf(seq, "Personalities : "); 5877 spin_lock(&pers_lock); 5878 list_for_each_entry(pers, &pers_list, list) 5879 seq_printf(seq, "[%s] ", pers->name); 5880 5881 spin_unlock(&pers_lock); 5882 seq_printf(seq, "\n"); 5883 mi->event = atomic_read(&md_event_count); 5884 return 0; 5885 } 5886 if (v == (void*)2) { 5887 status_unused(seq); 5888 return 0; 5889 } 5890 5891 if (mddev_lock(mddev) < 0) 5892 return -EINTR; 5893 5894 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5895 seq_printf(seq, "%s : %sactive", mdname(mddev), 5896 mddev->pers ? 
"" : "in"); 5897 if (mddev->pers) { 5898 if (mddev->ro==1) 5899 seq_printf(seq, " (read-only)"); 5900 if (mddev->ro==2) 5901 seq_printf(seq, " (auto-read-only)"); 5902 seq_printf(seq, " %s", mddev->pers->name); 5903 } 5904 5905 sectors = 0; 5906 list_for_each_entry(rdev, &mddev->disks, same_set) { 5907 char b[BDEVNAME_SIZE]; 5908 seq_printf(seq, " %s[%d]", 5909 bdevname(rdev->bdev,b), rdev->desc_nr); 5910 if (test_bit(WriteMostly, &rdev->flags)) 5911 seq_printf(seq, "(W)"); 5912 if (test_bit(Faulty, &rdev->flags)) { 5913 seq_printf(seq, "(F)"); 5914 continue; 5915 } else if (rdev->raid_disk < 0) 5916 seq_printf(seq, "(S)"); /* spare */ 5917 sectors += rdev->sectors; 5918 } 5919 5920 if (!list_empty(&mddev->disks)) { 5921 if (mddev->pers) 5922 seq_printf(seq, "\n %llu blocks", 5923 (unsigned long long) 5924 mddev->array_sectors / 2); 5925 else 5926 seq_printf(seq, "\n %llu blocks", 5927 (unsigned long long)sectors / 2); 5928 } 5929 if (mddev->persistent) { 5930 if (mddev->major_version != 0 || 5931 mddev->minor_version != 90) { 5932 seq_printf(seq," super %d.%d", 5933 mddev->major_version, 5934 mddev->minor_version); 5935 } 5936 } else if (mddev->external) 5937 seq_printf(seq, " super external:%s", 5938 mddev->metadata_type); 5939 else 5940 seq_printf(seq, " super non-persistent"); 5941 5942 if (mddev->pers) { 5943 mddev->pers->status(seq, mddev); 5944 seq_printf(seq, "\n "); 5945 if (mddev->pers->sync_request) { 5946 if (mddev->curr_resync > 2) { 5947 status_resync(seq, mddev); 5948 seq_printf(seq, "\n "); 5949 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5950 seq_printf(seq, "\tresync=DELAYED\n "); 5951 else if (mddev->recovery_cp < MaxSector) 5952 seq_printf(seq, "\tresync=PENDING\n "); 5953 } 5954 } else 5955 seq_printf(seq, "\n "); 5956 5957 if ((bitmap = mddev->bitmap)) { 5958 unsigned long chunk_kb; 5959 unsigned long flags; 5960 spin_lock_irqsave(&bitmap->lock, flags); 5961 chunk_kb = bitmap->chunksize >> 10; 5962 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5963 "%lu%s chunk", 5964 bitmap->pages - bitmap->missing_pages, 5965 bitmap->pages, 5966 (bitmap->pages - bitmap->missing_pages) 5967 << (PAGE_SHIFT - 10), 5968 chunk_kb ? chunk_kb : bitmap->chunksize, 5969 chunk_kb ? 
"KB" : "B"); 5970 if (bitmap->file) { 5971 seq_printf(seq, ", file: "); 5972 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5973 } 5974 5975 seq_printf(seq, "\n"); 5976 spin_unlock_irqrestore(&bitmap->lock, flags); 5977 } 5978 5979 seq_printf(seq, "\n"); 5980 } 5981 mddev_unlock(mddev); 5982 5983 return 0; 5984 } 5985 5986 static const struct seq_operations md_seq_ops = { 5987 .start = md_seq_start, 5988 .next = md_seq_next, 5989 .stop = md_seq_stop, 5990 .show = md_seq_show, 5991 }; 5992 5993 static int md_seq_open(struct inode *inode, struct file *file) 5994 { 5995 int error; 5996 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5997 if (mi == NULL) 5998 return -ENOMEM; 5999 6000 error = seq_open(file, &md_seq_ops); 6001 if (error) 6002 kfree(mi); 6003 else { 6004 struct seq_file *p = file->private_data; 6005 p->private = mi; 6006 mi->event = atomic_read(&md_event_count); 6007 } 6008 return error; 6009 } 6010 6011 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6012 { 6013 struct seq_file *m = filp->private_data; 6014 struct mdstat_info *mi = m->private; 6015 int mask; 6016 6017 poll_wait(filp, &md_event_waiters, wait); 6018 6019 /* always allow read */ 6020 mask = POLLIN | POLLRDNORM; 6021 6022 if (mi->event != atomic_read(&md_event_count)) 6023 mask |= POLLERR | POLLPRI; 6024 return mask; 6025 } 6026 6027 static const struct file_operations md_seq_fops = { 6028 .owner = THIS_MODULE, 6029 .open = md_seq_open, 6030 .read = seq_read, 6031 .llseek = seq_lseek, 6032 .release = seq_release_private, 6033 .poll = mdstat_poll, 6034 }; 6035 6036 int register_md_personality(struct mdk_personality *p) 6037 { 6038 spin_lock(&pers_lock); 6039 list_add_tail(&p->list, &pers_list); 6040 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6041 spin_unlock(&pers_lock); 6042 return 0; 6043 } 6044 6045 int unregister_md_personality(struct mdk_personality *p) 6046 { 6047 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6048 spin_lock(&pers_lock); 6049 list_del_init(&p->list); 6050 spin_unlock(&pers_lock); 6051 return 0; 6052 } 6053 6054 static int is_mddev_idle(mddev_t *mddev, int init) 6055 { 6056 mdk_rdev_t * rdev; 6057 int idle; 6058 int curr_events; 6059 6060 idle = 1; 6061 rcu_read_lock(); 6062 rdev_for_each_rcu(rdev, mddev) { 6063 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6064 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6065 (int)part_stat_read(&disk->part0, sectors[1]) - 6066 atomic_read(&disk->sync_io); 6067 /* sync IO will cause sync_io to increase before the disk_stats 6068 * as sync_io is counted when a request starts, and 6069 * disk_stats is counted when it completes. 6070 * So resync activity will cause curr_events to be smaller than 6071 * when there was no such activity. 6072 * non-sync IO will cause disk_stat to increase without 6073 * increasing sync_io so curr_events will (eventually) 6074 * be larger than it was before. Once it becomes 6075 * substantially larger, the test below will cause 6076 * the array to appear non-idle, and resync will slow 6077 * down. 6078 * If there is a lot of outstanding resync activity when 6079 * we set last_event to curr_events, then all that activity 6080 * completing might cause the array to appear non-idle 6081 * and resync will be slowed down even though there might 6082 * not have been non-resync activity. This will only 6083 * happen once though. 
'last_events' will soon reflect 6084 * the state where there is little or no outstanding 6085 * resync requests, and further resync activity will 6086 * always make curr_events less than last_events. 6087 * 6088 */ 6089 if (init || curr_events - rdev->last_events > 64) { 6090 rdev->last_events = curr_events; 6091 idle = 0; 6092 } 6093 } 6094 rcu_read_unlock(); 6095 return idle; 6096 } 6097 6098 void md_done_sync(mddev_t *mddev, int blocks, int ok) 6099 { 6100 /* another "blocks" (512byte) blocks have been synced */ 6101 atomic_sub(blocks, &mddev->recovery_active); 6102 wake_up(&mddev->recovery_wait); 6103 if (!ok) { 6104 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6105 md_wakeup_thread(mddev->thread); 6106 // stop recovery, signal do_sync .... 6107 } 6108 } 6109 6110 6111 /* md_write_start(mddev, bi) 6112 * If we need to update some array metadata (e.g. 'active' flag 6113 * in superblock) before writing, schedule a superblock update 6114 * and wait for it to complete. 6115 */ 6116 void md_write_start(mddev_t *mddev, struct bio *bi) 6117 { 6118 int did_change = 0; 6119 if (bio_data_dir(bi) != WRITE) 6120 return; 6121 6122 BUG_ON(mddev->ro == 1); 6123 if (mddev->ro == 2) { 6124 /* need to switch to read/write */ 6125 mddev->ro = 0; 6126 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6127 md_wakeup_thread(mddev->thread); 6128 md_wakeup_thread(mddev->sync_thread); 6129 did_change = 1; 6130 } 6131 atomic_inc(&mddev->writes_pending); 6132 if (mddev->safemode == 1) 6133 mddev->safemode = 0; 6134 if (mddev->in_sync) { 6135 spin_lock_irq(&mddev->write_lock); 6136 if (mddev->in_sync) { 6137 mddev->in_sync = 0; 6138 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6139 md_wakeup_thread(mddev->thread); 6140 did_change = 1; 6141 } 6142 spin_unlock_irq(&mddev->write_lock); 6143 } 6144 if (did_change) 6145 sysfs_notify_dirent(mddev->sysfs_state); 6146 wait_event(mddev->sb_wait, 6147 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6148 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6149 } 6150 6151 void md_write_end(mddev_t *mddev) 6152 { 6153 if (atomic_dec_and_test(&mddev->writes_pending)) { 6154 if (mddev->safemode == 2) 6155 md_wakeup_thread(mddev->thread); 6156 else if (mddev->safemode_delay) 6157 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 6158 } 6159 } 6160 6161 /* md_allow_write(mddev) 6162 * Calling this ensures that the array is marked 'active' so that writes 6163 * may proceed without blocking. It is important to call this before 6164 * attempting a GFP_KERNEL allocation while holding the mddev lock. 6165 * Must be called with mddev_lock held. 6166 * 6167 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6168 * is dropped, so return -EAGAIN after notifying userspace. 
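 *
 * Illustrative call pattern (editor's sketch, not from the original
 * source; the GFP_KERNEL allocation shown is hypothetical). A caller
 * that holds mddev_lock and needs to allocate would typically do:
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;	(-EAGAIN: retried after userspace updates the metadata)
 *	buf = kmalloc(len, GFP_KERNEL);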
6169 */
6170 int md_allow_write(mddev_t *mddev)
6171 {
6172 if (!mddev->pers)
6173 return 0;
6174 if (mddev->ro)
6175 return 0;
6176 if (!mddev->pers->sync_request)
6177 return 0;
6178
6179 spin_lock_irq(&mddev->write_lock);
6180 if (mddev->in_sync) {
6181 mddev->in_sync = 0;
6182 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6183 if (mddev->safemode_delay &&
6184 mddev->safemode == 0)
6185 mddev->safemode = 1;
6186 spin_unlock_irq(&mddev->write_lock);
6187 md_update_sb(mddev, 0);
6188 sysfs_notify_dirent(mddev->sysfs_state);
6189 } else
6190 spin_unlock_irq(&mddev->write_lock);
6191
6192 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6193 return -EAGAIN;
6194 else
6195 return 0;
6196 }
6197 EXPORT_SYMBOL_GPL(md_allow_write);
6198
6199 #define SYNC_MARKS 10
6200 #define SYNC_MARK_STEP (3*HZ)
6201 void md_do_sync(mddev_t *mddev)
6202 {
6203 mddev_t *mddev2;
6204 unsigned int currspeed = 0,
6205 window;
6206 sector_t max_sectors,j, io_sectors;
6207 unsigned long mark[SYNC_MARKS];
6208 sector_t mark_cnt[SYNC_MARKS];
6209 int last_mark,m;
6210 struct list_head *tmp;
6211 sector_t last_check;
6212 int skipped = 0;
6213 mdk_rdev_t *rdev;
6214 char *desc;
6215
6216 /* just in case the thread restarts... */
6217 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6218 return;
6219 if (mddev->ro) /* never try to sync a read-only array */
6220 return;
6221
6222 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6223 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6224 desc = "data-check";
6225 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6226 desc = "requested-resync";
6227 else
6228 desc = "resync";
6229 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6230 desc = "reshape";
6231 else
6232 desc = "recovery";
6233
6234 /* we overload curr_resync somewhat here.
6235 * 0 == not engaged in resync at all
6236 * 2 == checking that there is no conflict with another sync
6237 * 1 == like 2, but have yielded to allow a conflicting resync to
6238 * commence
6239 * other == active in resync - this many blocks
6240 *
6241 * Before starting a resync we must have set curr_resync to
6242 * 2, and then checked that every "conflicting" array has curr_resync
6243 * less than ours. When we find one that is the same or higher
6244 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
6245 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
6246 * This will mean we have to start checking from the beginning again.
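 *
 * (Illustrative note added in editing: if two arrays that share a
 * physical device both reach curr_resync == 2, the one whose mddev_t
 * has the lower address yields, dropping curr_resync to 1 and waiting
 * on resync_wait, while the other keeps curr_resync == 2 and proceeds.)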
6247 * 6248 */ 6249 6250 do { 6251 mddev->curr_resync = 2; 6252 6253 try_again: 6254 if (kthread_should_stop()) { 6255 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6256 goto skip; 6257 } 6258 for_each_mddev(mddev2, tmp) { 6259 if (mddev2 == mddev) 6260 continue; 6261 if (!mddev->parallel_resync 6262 && mddev2->curr_resync 6263 && match_mddev_units(mddev, mddev2)) { 6264 DEFINE_WAIT(wq); 6265 if (mddev < mddev2 && mddev->curr_resync == 2) { 6266 /* arbitrarily yield */ 6267 mddev->curr_resync = 1; 6268 wake_up(&resync_wait); 6269 } 6270 if (mddev > mddev2 && mddev->curr_resync == 1) 6271 /* no need to wait here, we can wait the next 6272 * time 'round when curr_resync == 2 6273 */ 6274 continue; 6275 /* We need to wait 'interruptible' so as not to 6276 * contribute to the load average, and not to 6277 * be caught by 'softlockup' 6278 */ 6279 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6280 if (!kthread_should_stop() && 6281 mddev2->curr_resync >= mddev->curr_resync) { 6282 printk(KERN_INFO "md: delaying %s of %s" 6283 " until %s has finished (they" 6284 " share one or more physical units)\n", 6285 desc, mdname(mddev), mdname(mddev2)); 6286 mddev_put(mddev2); 6287 if (signal_pending(current)) 6288 flush_signals(current); 6289 schedule(); 6290 finish_wait(&resync_wait, &wq); 6291 goto try_again; 6292 } 6293 finish_wait(&resync_wait, &wq); 6294 } 6295 } 6296 } while (mddev->curr_resync < 2); 6297 6298 j = 0; 6299 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6300 /* resync follows the size requested by the personality, 6301 * which defaults to physical size, but can be virtual size 6302 */ 6303 max_sectors = mddev->resync_max_sectors; 6304 mddev->resync_mismatches = 0; 6305 /* we don't use the checkpoint if there's a bitmap */ 6306 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6307 j = mddev->resync_min; 6308 else if (!mddev->bitmap) 6309 j = mddev->recovery_cp; 6310 6311 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6312 max_sectors = mddev->dev_sectors; 6313 else { 6314 /* recovery follows the physical size of devices */ 6315 max_sectors = mddev->dev_sectors; 6316 j = MaxSector; 6317 list_for_each_entry(rdev, &mddev->disks, same_set) 6318 if (rdev->raid_disk >= 0 && 6319 !test_bit(Faulty, &rdev->flags) && 6320 !test_bit(In_sync, &rdev->flags) && 6321 rdev->recovery_offset < j) 6322 j = rdev->recovery_offset; 6323 } 6324 6325 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6326 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6327 " %d KB/sec/disk.\n", speed_min(mddev)); 6328 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6329 "(but not more than %d KB/sec) for %s.\n", 6330 speed_max(mddev), desc); 6331 6332 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6333 6334 io_sectors = 0; 6335 for (m = 0; m < SYNC_MARKS; m++) { 6336 mark[m] = jiffies; 6337 mark_cnt[m] = io_sectors; 6338 } 6339 last_mark = 0; 6340 mddev->resync_mark = mark[last_mark]; 6341 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6342 6343 /* 6344 * Tune reconstruction: 6345 */ 6346 window = 32*(PAGE_SIZE/512); 6347 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6348 window/2,(unsigned long long) max_sectors/2); 6349 6350 atomic_set(&mddev->recovery_active, 0); 6351 last_check = 0; 6352 6353 if (j>2) { 6354 printk(KERN_INFO 6355 "md: resuming %s of %s from checkpoint.\n", 6356 desc, mdname(mddev)); 6357 mddev->curr_resync = j; 6358 } 6359 6360 while (j < max_sectors) { 6361 sector_t sectors; 6362 6363 
skipped = 0;
6364
6365 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6366 ((mddev->curr_resync > mddev->curr_resync_completed &&
6367 (mddev->curr_resync - mddev->curr_resync_completed)
6368 > (max_sectors >> 4)) ||
6369 (j - mddev->curr_resync_completed)*2
6370 >= mddev->resync_max - mddev->curr_resync_completed
6371 )) {
6372 /* time to update curr_resync_completed */
6373 blk_unplug(mddev->queue);
6374 wait_event(mddev->recovery_wait,
6375 atomic_read(&mddev->recovery_active) == 0);
6376 mddev->curr_resync_completed =
6377 mddev->curr_resync;
6378 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6379 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6380 }
6381
6382 if (j >= mddev->resync_max)
6383 wait_event(mddev->recovery_wait,
6384 mddev->resync_max > j
6385 || kthread_should_stop());
6386
6387 if (kthread_should_stop())
6388 goto interrupted;
6389
6390 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6391 currspeed < speed_min(mddev));
6392 if (sectors == 0) {
6393 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6394 goto out;
6395 }
6396
6397 if (!skipped) { /* actual IO requested */
6398 io_sectors += sectors;
6399 atomic_add(sectors, &mddev->recovery_active);
6400 }
6401
6402 j += sectors;
6403 if (j>1) mddev->curr_resync = j;
6404 mddev->curr_mark_cnt = io_sectors;
6405 if (last_check == 0)
6406 /* this is the earliest that the rebuild will be
6407 * visible in /proc/mdstat
6408 */
6409 md_new_event(mddev);
6410
6411 if (last_check + window > io_sectors || j == max_sectors)
6412 continue;
6413
6414 last_check = io_sectors;
6415
6416 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6417 break;
6418
6419 repeat:
6420 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6421 /* step marks */
6422 int next = (last_mark+1) % SYNC_MARKS;
6423
6424 mddev->resync_mark = mark[next];
6425 mddev->resync_mark_cnt = mark_cnt[next];
6426 mark[next] = jiffies;
6427 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6428 last_mark = next;
6429 }
6430
6431
6432 if (kthread_should_stop())
6433 goto interrupted;
6434
6435
6436 /*
6437 * this loop exits only if we are either slower than
6438 * the 'hard' speed limit or the system was IO-idle for
6439 * a jiffy.
6440 * the system might be non-idle CPU-wise, but we only care
6441 * about not overloading the IO subsystem.
(things like an 6442 * e2fsck being done on the RAID array should execute fast) 6443 */ 6444 blk_unplug(mddev->queue); 6445 cond_resched(); 6446 6447 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6448 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6449 6450 if (currspeed > speed_min(mddev)) { 6451 if ((currspeed > speed_max(mddev)) || 6452 !is_mddev_idle(mddev, 0)) { 6453 msleep(500); 6454 goto repeat; 6455 } 6456 } 6457 } 6458 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6459 /* 6460 * this also signals 'finished resyncing' to md_stop 6461 */ 6462 out: 6463 blk_unplug(mddev->queue); 6464 6465 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6466 6467 /* tell personality that we are finished */ 6468 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6469 6470 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6471 mddev->curr_resync > 2) { 6472 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6473 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6474 if (mddev->curr_resync >= mddev->recovery_cp) { 6475 printk(KERN_INFO 6476 "md: checkpointing %s of %s.\n", 6477 desc, mdname(mddev)); 6478 mddev->recovery_cp = mddev->curr_resync; 6479 } 6480 } else 6481 mddev->recovery_cp = MaxSector; 6482 } else { 6483 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6484 mddev->curr_resync = MaxSector; 6485 list_for_each_entry(rdev, &mddev->disks, same_set) 6486 if (rdev->raid_disk >= 0 && 6487 !test_bit(Faulty, &rdev->flags) && 6488 !test_bit(In_sync, &rdev->flags) && 6489 rdev->recovery_offset < mddev->curr_resync) 6490 rdev->recovery_offset = mddev->curr_resync; 6491 } 6492 } 6493 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6494 6495 skip: 6496 mddev->curr_resync = 0; 6497 mddev->curr_resync_completed = 0; 6498 mddev->resync_min = 0; 6499 mddev->resync_max = MaxSector; 6500 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6501 wake_up(&resync_wait); 6502 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6503 md_wakeup_thread(mddev->thread); 6504 return; 6505 6506 interrupted: 6507 /* 6508 * got a signal, exit. 6509 */ 6510 printk(KERN_INFO 6511 "md: md_do_sync() got signal ... exiting\n"); 6512 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6513 goto out; 6514 6515 } 6516 EXPORT_SYMBOL_GPL(md_do_sync); 6517 6518 6519 static int remove_and_add_spares(mddev_t *mddev) 6520 { 6521 mdk_rdev_t *rdev; 6522 int spares = 0; 6523 6524 mddev->curr_resync_completed = 0; 6525 6526 list_for_each_entry(rdev, &mddev->disks, same_set) 6527 if (rdev->raid_disk >= 0 && 6528 !test_bit(Blocked, &rdev->flags) && 6529 (test_bit(Faulty, &rdev->flags) || 6530 ! test_bit(In_sync, &rdev->flags)) && 6531 atomic_read(&rdev->nr_pending)==0) { 6532 if (mddev->pers->hot_remove_disk( 6533 mddev, rdev->raid_disk)==0) { 6534 char nm[20]; 6535 sprintf(nm,"rd%d", rdev->raid_disk); 6536 sysfs_remove_link(&mddev->kobj, nm); 6537 rdev->raid_disk = -1; 6538 } 6539 } 6540 6541 if (mddev->degraded && ! 
mddev->ro && !mddev->recovery_disabled) {
6542 list_for_each_entry(rdev, &mddev->disks, same_set) {
6543 if (rdev->raid_disk >= 0 &&
6544 !test_bit(In_sync, &rdev->flags) &&
6545 !test_bit(Blocked, &rdev->flags))
6546 spares++;
6547 if (rdev->raid_disk < 0
6548 && !test_bit(Faulty, &rdev->flags)) {
6549 rdev->recovery_offset = 0;
6550 if (mddev->pers->
6551 hot_add_disk(mddev, rdev) == 0) {
6552 char nm[20];
6553 sprintf(nm, "rd%d", rdev->raid_disk);
6554 if (sysfs_create_link(&mddev->kobj,
6555 &rdev->kobj, nm))
6556 printk(KERN_WARNING
6557 "md: cannot register "
6558 "%s for %s\n",
6559 nm, mdname(mddev));
6560 spares++;
6561 md_new_event(mddev);
6562 } else
6563 break;
6564 }
6565 }
6566 }
6567 return spares;
6568 }
6569 /*
6570 * This routine is regularly called by all per-raid-array threads to
6571 * deal with generic issues like resync and super-block update.
6572 * Raid personalities that don't have a thread (linear/raid0) do not
6573 * need this as they never do any recovery or update the superblock.
6574 *
6575 * It does not do any resync itself, but rather "forks" off other threads
6576 * to do that as needed.
6577 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6578 * "->recovery" and create a thread at ->sync_thread.
6579 * When the thread finishes it sets MD_RECOVERY_DONE
6580 * and wakes up this thread, which will reap the thread and finish up.
6581 * This thread also removes any faulty devices (with nr_pending == 0).
6582 *
6583 * The overall approach is:
6584 * 1/ if the superblock needs updating, update it.
6585 * 2/ If a recovery thread is running, don't do anything else.
6586 * 3/ If recovery has finished, clean up, possibly marking spares active.
6587 * 4/ If there are any faulty devices, remove them.
6588 * 5/ If array is degraded, try to add spare devices.
6589 * 6/ If array has spares or is not in-sync, start a resync thread.
6590 */
6591 void md_check_recovery(mddev_t *mddev)
6592 {
6593 mdk_rdev_t *rdev;
6594
6595
6596 if (mddev->bitmap)
6597 bitmap_daemon_work(mddev->bitmap);
6598
6599 if (mddev->ro)
6600 return;
6601
6602 if (signal_pending(current)) {
6603 if (mddev->pers->sync_request && !mddev->external) {
6604 printk(KERN_INFO "md: %s in immediate safe mode\n",
6605 mdname(mddev));
6606 mddev->safemode = 2;
6607 }
6608 flush_signals(current);
6609 }
6610
6611 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6612 return;
6613 if ( ! (
6614 (mddev->flags && !mddev->external) ||
6615 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6616 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6617 (mddev->external == 0 && mddev->safemode == 1) ||
6618 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6619 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6620 ))
6621 return;
6622
6623 if (mddev_trylock(mddev)) {
6624 int spares = 0;
6625
6626 if (mddev->ro) {
6627 /* Only thing we do on a ro array is remove
6628 * failed devices.
6629 */
6630 remove_and_add_spares(mddev);
6631 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6632 goto unlock;
6633 }
6634
6635 if (!mddev->external) {
6636 int did_change = 0;
6637 spin_lock_irq(&mddev->write_lock);
6638 if (mddev->safemode &&
6639 !atomic_read(&mddev->writes_pending) &&
6640 !mddev->in_sync &&
6641 mddev->recovery_cp == MaxSector) {
6642 mddev->in_sync = 1;
6643 did_change = 1;
6644 if (mddev->persistent)
6645 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6646 }
6647 if (mddev->safemode == 1)
6648 mddev->safemode = 0;
6649 spin_unlock_irq(&mddev->write_lock);
6650 if (did_change)
6651 sysfs_notify_dirent(mddev->sysfs_state);
6652 }
6653
6654 if (mddev->flags)
6655 md_update_sb(mddev, 0);
6656
6657 list_for_each_entry(rdev, &mddev->disks, same_set)
6658 if (test_and_clear_bit(StateChanged, &rdev->flags))
6659 sysfs_notify_dirent(rdev->sysfs_state);
6660
6661
6662 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6663 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6664 /* resync/recovery still happening */
6665 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6666 goto unlock;
6667 }
6668 if (mddev->sync_thread) {
6669 /* resync has finished, collect result */
6670 md_unregister_thread(mddev->sync_thread);
6671 mddev->sync_thread = NULL;
6672 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6673 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6674 /* success...*/
6675 /* activate any spares */
6676 if (mddev->pers->spare_active(mddev))
6677 sysfs_notify(&mddev->kobj, NULL,
6678 "degraded");
6679 }
6680 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6681 mddev->pers->finish_reshape)
6682 mddev->pers->finish_reshape(mddev);
6683 md_update_sb(mddev, 1);
6684
6685 /* if the array is no longer degraded, then any saved_raid_disk
6686 * information must be scrapped
6687 */
6688 if (!mddev->degraded)
6689 list_for_each_entry(rdev, &mddev->disks, same_set)
6690 rdev->saved_raid_disk = -1;
6691
6692 mddev->recovery = 0;
6693 /* flag recovery needed just to double check */
6694 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6695 sysfs_notify_dirent(mddev->sysfs_action);
6696 md_new_event(mddev);
6697 goto unlock;
6698 }
6699 /* Set RUNNING before clearing NEEDED to avoid
6700 * any transients in the value of "sync_action".
6701 */
6702 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6703 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6704 /* Clear some bits that don't mean anything, but
6705 * might be left set
6706 */
6707 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6708 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6709
6710 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6711 goto unlock;
6712 /* no recovery is running.
6713 * remove any failed drives, then
6714 * add spares if possible.
6715 * Spares are also removed and re-added, to allow
6716 * the personality to fail the re-add.
6717 */
6718
6719 if (mddev->reshape_position != MaxSector) {
6720 if (mddev->pers->check_reshape(mddev) != 0)
6721 /* Cannot proceed */
6722 goto unlock;
6723 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6724 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6725 } else if ((spares = remove_and_add_spares(mddev))) {
6726 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6727 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6728 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6729 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6730 } else if (mddev->recovery_cp < MaxSector) {
6731 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6732 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6733 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6734 /* nothing to be done ... */
6735 goto unlock;
6736
6737 if (mddev->pers->sync_request) {
6738 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6739 /* We are adding a device or devices to an array
6740 * which has the bitmap stored on all devices.
6741 * So make sure all bitmap pages get written
6742 */
6743 bitmap_write_all(mddev->bitmap);
6744 }
6745 mddev->sync_thread = md_register_thread(md_do_sync,
6746 mddev,
6747 "%s_resync");
6748 if (!mddev->sync_thread) {
6749 printk(KERN_ERR "%s: could not start resync"
6750 " thread...\n",
6751 mdname(mddev));
6752 /* leave the spares where they are, it shouldn't hurt */
6753 mddev->recovery = 0;
6754 } else
6755 md_wakeup_thread(mddev->sync_thread);
6756 sysfs_notify_dirent(mddev->sysfs_action);
6757 md_new_event(mddev);
6758 }
6759 unlock:
6760 if (!mddev->sync_thread) {
6761 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6762 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6763 &mddev->recovery))
6764 if (mddev->sysfs_action)
6765 sysfs_notify_dirent(mddev->sysfs_action);
6766 }
6767 mddev_unlock(mddev);
6768 }
6769 }
6770
6771 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6772 {
6773 sysfs_notify_dirent(rdev->sysfs_state);
6774 wait_event_timeout(rdev->blocked_wait,
6775 !test_bit(Blocked, &rdev->flags),
6776 msecs_to_jiffies(5000));
6777 rdev_dec_pending(rdev, mddev);
6778 }
6779 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6780
6781 static int md_notify_reboot(struct notifier_block *this,
6782 unsigned long code, void *x)
6783 {
6784 struct list_head *tmp;
6785 mddev_t *mddev;
6786
6787 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6788
6789 printk(KERN_INFO "md: stopping all md devices.\n");
6790
6791 for_each_mddev(mddev, tmp)
6792 if (mddev_trylock(mddev)) {
6793 /* Force a switch to readonly even if the array
6794 * appears to still be in use. Hence
6795 * the '100'.
6796 */
6797 do_md_stop(mddev, 1, 100);
6798 mddev_unlock(mddev);
6799 }
6800 /*
6801 * certain more exotic SCSI devices are known to be
6802 * volatile with respect to too-early system reboots. While the
6803 * right place to handle this issue is the given
6804 * driver, we do want to have a safe RAID driver ...
6805 */ 6806 mdelay(1000*1); 6807 } 6808 return NOTIFY_DONE; 6809 } 6810 6811 static struct notifier_block md_notifier = { 6812 .notifier_call = md_notify_reboot, 6813 .next = NULL, 6814 .priority = INT_MAX, /* before any real devices */ 6815 }; 6816 6817 static void md_geninit(void) 6818 { 6819 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6820 6821 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 6822 } 6823 6824 static int __init md_init(void) 6825 { 6826 if (register_blkdev(MD_MAJOR, "md")) 6827 return -1; 6828 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6829 unregister_blkdev(MD_MAJOR, "md"); 6830 return -1; 6831 } 6832 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 6833 md_probe, NULL, NULL); 6834 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6835 md_probe, NULL, NULL); 6836 6837 register_reboot_notifier(&md_notifier); 6838 raid_table_header = register_sysctl_table(raid_root_table); 6839 6840 md_geninit(); 6841 return 0; 6842 } 6843 6844 6845 #ifndef MODULE 6846 6847 /* 6848 * Searches all registered partitions for autorun RAID arrays 6849 * at boot time. 6850 */ 6851 6852 static LIST_HEAD(all_detected_devices); 6853 struct detected_devices_node { 6854 struct list_head list; 6855 dev_t dev; 6856 }; 6857 6858 void md_autodetect_dev(dev_t dev) 6859 { 6860 struct detected_devices_node *node_detected_dev; 6861 6862 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 6863 if (node_detected_dev) { 6864 node_detected_dev->dev = dev; 6865 list_add_tail(&node_detected_dev->list, &all_detected_devices); 6866 } else { 6867 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 6868 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 6869 } 6870 } 6871 6872 6873 static void autostart_arrays(int part) 6874 { 6875 mdk_rdev_t *rdev; 6876 struct detected_devices_node *node_detected_dev; 6877 dev_t dev; 6878 int i_scanned, i_passed; 6879 6880 i_scanned = 0; 6881 i_passed = 0; 6882 6883 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 6884 6885 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 6886 i_scanned++; 6887 node_detected_dev = list_entry(all_detected_devices.next, 6888 struct detected_devices_node, list); 6889 list_del(&node_detected_dev->list); 6890 dev = node_detected_dev->dev; 6891 kfree(node_detected_dev); 6892 rdev = md_import_device(dev,0, 90); 6893 if (IS_ERR(rdev)) 6894 continue; 6895 6896 if (test_bit(Faulty, &rdev->flags)) { 6897 MD_BUG(); 6898 continue; 6899 } 6900 set_bit(AutoDetected, &rdev->flags); 6901 list_add(&rdev->same_set, &pending_raid_disks); 6902 i_passed++; 6903 } 6904 6905 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 6906 i_scanned, i_passed); 6907 6908 autorun_devices(part); 6909 } 6910 6911 #endif /* !MODULE */ 6912 6913 static __exit void md_exit(void) 6914 { 6915 mddev_t *mddev; 6916 struct list_head *tmp; 6917 6918 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 6919 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6920 6921 unregister_blkdev(MD_MAJOR,"md"); 6922 unregister_blkdev(mdp_major, "mdp"); 6923 unregister_reboot_notifier(&md_notifier); 6924 unregister_sysctl_table(raid_table_header); 6925 remove_proc_entry("mdstat", NULL); 6926 for_each_mddev(mddev, tmp) { 6927 export_array(mddev); 6928 mddev->hold_active = 0; 6929 } 6930 } 6931 6932 subsys_initcall(md_init); 6933 module_exit(md_exit) 6934 6935 static int get_ro(char *buffer, struct kernel_param *kp) 6936 { 6937 return sprintf(buffer, "%d", 
start_readonly); 6938 } 6939 static int set_ro(const char *val, struct kernel_param *kp) 6940 { 6941 char *e; 6942 int num = simple_strtoul(val, &e, 10); 6943 if (*val && (*e == '\0' || *e == '\n')) { 6944 start_readonly = num; 6945 return 0; 6946 } 6947 return -EINVAL; 6948 } 6949 6950 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6951 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6952 6953 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 6954 6955 EXPORT_SYMBOL(register_md_personality); 6956 EXPORT_SYMBOL(unregister_md_personality); 6957 EXPORT_SYMBOL(md_error); 6958 EXPORT_SYMBOL(md_done_sync); 6959 EXPORT_SYMBOL(md_write_start); 6960 EXPORT_SYMBOL(md_write_end); 6961 EXPORT_SYMBOL(md_register_thread); 6962 EXPORT_SYMBOL(md_unregister_thread); 6963 EXPORT_SYMBOL(md_wakeup_thread); 6964 EXPORT_SYMBOL(md_check_recovery); 6965 MODULE_LICENSE("GPL"); 6966 MODULE_ALIAS("md"); 6967 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 6968
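/*
 * Editor's note (illustrative, not part of the original source): the
 * "start_ro" module parameter defined above can be set at load time or
 * at runtime, e.g.
 *
 *	modprobe md-mod start_ro=1
 *	echo 1 > /sys/module/md_mod/parameters/start_ro
 *
 * so that newly started arrays come up auto-read-only (mddev->ro == 2)
 * and switch to read-write only when the first write arrives.
 */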