/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
			    struct mddev *mddev)
{
	if (!mddev || !mddev->bio_set)
		return bio_clone(bio, gfp_mask);

	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
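/*
 * For illustration only: a minimal user-space sketch (nothing below is
 * part of the driver; it assumes the usual /proc/mdstat poll semantics,
 * where POLLPRI/POLLERR signal that md_event_count changed since the
 * file was opened) of how a monitoring tool can wait for the next event.
 */
#if 0
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static int wait_for_md_event(void)
{
	struct pollfd pfd;
	int fd = open("/proc/mdstat", O_RDONLY);

	if (fd < 0)
		return -1;
	pfd.fd = fd;
	pfd.events = POLLPRI;	/* plain POLLIN is always asserted */
	poll(&pfd, 1, -1);	/* wakes when md_new_event() fires */
	close(fd);
	return 0;
}
#endif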
/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
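/*
 * Illustrative sketch (not a real caller; the function name is made up)
 * of how for_each_mddev() is meant to be used: the macro takes a
 * reference on each mddev before dropping all_mddevs_lock and releases
 * it on the next iteration, so a caller that runs to completion needs
 * no explicit mddev_get()/mddev_put() of its own.
 */
#if 0
static void example_wake_all_arrays(void)
{
	struct mddev *mddev;
	struct list_head *tmp;

	for_each_mddev(mddev, tmp)
		md_wakeup_thread(mddev->thread);
}
#endif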
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	blk_queue_split(q, &bio, q->bio_split);

	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_error = -EROFS;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);

	return BLK_QC_T_NONE;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);
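/*
 * Illustrative sketch (hedged; not a real personality) of how
 * md_flush_request() is meant to be called: a personality's
 * make_request method hands flush bios to md, which issues the
 * pre-flush to every rdev via submit_flushes() and then re-submits
 * the data portion through md_submit_flush_data().
 */
#if 0
static void example_make_request(struct mddev *mddev, struct bio *bio)
{
	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bio);
		return;
	}
	/* ... normal striping/mirroring of the bio would go here ... */
}
#endif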
void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mddev *mddev = cb->data;
	md_wakeup_thread(mddev->thread);
	kfree(cb);
}
EXPORT_SYMBOL(md_unplug);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}

static void md_safemode_timeout(unsigned long data);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
		    (unsigned long) mddev);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So we set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}
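/*
 * For 0.90 metadata the superblock occupies the last 64K-aligned 64K
 * block of the device: MD_NEW_SIZE_SECTORS() (see md_p.h) rounds the
 * size down to a 64K boundary and then steps back one 64K block.  For
 * example, a 1000000-sector component would place the superblock at
 * sector 999808 and use sectors 0..999807 for data.
 */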
static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	kfree(rdev->badblocks.page);
	rdev->badblocks.page = NULL;
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_error) {
		printk("md: super_written gets error=%d\n", bio->bi_error);
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}

void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
}
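/*
 * Illustrative sketch (hedged; the function below is made up) of the
 * intended md_super_write()/md_super_wait() pairing: queue a superblock
 * write for each rdev of interest, then sleep until pending_writes
 * drops back to zero.
 */
#if 0
static void example_write_all_superblocks(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		md_super_write(mddev, rdev, rdev->sb_start,
			       rdev->sb_size, rdev->sb_page);
	md_super_wait(mddev);
}
#endif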
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	int ret;

	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	submit_bio_wait(rw, bio);

	ret = !bio->bi_error;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
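/*
 * Illustrative sketch (hedged; the helper name is made up, and the real
 * callers such as md_import_device() live later in this file) of how
 * the handler table is meant to be used: pick the entry for the
 * on-disk major version, load the superblock, then validate it against
 * the array being assembled.
 */
#if 0
static int example_load_and_validate(struct mddev *mddev,
				     struct md_rdev *rdev,
				     struct md_rdev *refdev,
				     int major_version, int minor_version)
{
	int err;

	err = super_types[major_version].load_super(rdev, refdev,
						    minor_version);
	if (err < 0)
		return err;
	return super_types[major_version].validate_super(mddev, rdev);
}
#endif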
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
	       mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
		       b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */
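/*
 * The v1 checksum below covers the 256-byte fixed part of the
 * superblock plus two bytes of dev_roles[] per possible device (hence
 * 256 + max_dev * 2), computed with the sb_csum field itself treated
 * as zero.
 */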
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
			    int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
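	/*
	 * On-disk bad-block log entries (decoded below) are little-endian
	 * u64s: the low 10 bits hold the length in sectors and the
	 * remaining high bits the start sector, both scaled by
	 * bblog_shift; a value of all ones marks an unused slot, which is
	 * why the terminator test is "bb + 1 == 0".
	 */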
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (md_set_badblocks(&rdev->badblocks,
					     sector, count, 1) == 0)
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				printk(KERN_WARNING
				  "md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			if (mddev->recovery_cp == MaxSector)
				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
			rdev->raid_disk = mddev->raid_disks;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */
	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space  */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
1892 * beyond end of badblocks 1893 * beyond write-intent bitmap 1894 */ 1895 if (rdev->sb_start + (32+4)*2 > new_offset) 1896 return 0; 1897 bitmap = rdev->mddev->bitmap; 1898 if (bitmap && !rdev->mddev->bitmap_info.file && 1899 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1900 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1901 return 0; 1902 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1903 return 0; 1904 1905 return 1; 1906 } 1907 1908 static struct super_type super_types[] = { 1909 [0] = { 1910 .name = "0.90.0", 1911 .owner = THIS_MODULE, 1912 .load_super = super_90_load, 1913 .validate_super = super_90_validate, 1914 .sync_super = super_90_sync, 1915 .rdev_size_change = super_90_rdev_size_change, 1916 .allow_new_offset = super_90_allow_new_offset, 1917 }, 1918 [1] = { 1919 .name = "md-1", 1920 .owner = THIS_MODULE, 1921 .load_super = super_1_load, 1922 .validate_super = super_1_validate, 1923 .sync_super = super_1_sync, 1924 .rdev_size_change = super_1_rdev_size_change, 1925 .allow_new_offset = super_1_allow_new_offset, 1926 }, 1927 }; 1928 1929 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1930 { 1931 if (mddev->sync_super) { 1932 mddev->sync_super(mddev, rdev); 1933 return; 1934 } 1935 1936 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1937 1938 super_types[mddev->major_version].sync_super(mddev, rdev); 1939 } 1940 1941 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1942 { 1943 struct md_rdev *rdev, *rdev2; 1944 1945 rcu_read_lock(); 1946 rdev_for_each_rcu(rdev, mddev1) { 1947 if (test_bit(Faulty, &rdev->flags) || 1948 test_bit(Journal, &rdev->flags) || 1949 rdev->raid_disk == -1) 1950 continue; 1951 rdev_for_each_rcu(rdev2, mddev2) { 1952 if (test_bit(Faulty, &rdev2->flags) || 1953 test_bit(Journal, &rdev2->flags) || 1954 rdev2->raid_disk == -1) 1955 continue; 1956 if (rdev->bdev->bd_contains == 1957 rdev2->bdev->bd_contains) { 1958 rcu_read_unlock(); 1959 return 1; 1960 } 1961 } 1962 } 1963 rcu_read_unlock(); 1964 return 0; 1965 } 1966 1967 static LIST_HEAD(pending_raid_disks); 1968 1969 /* 1970 * Try to register data integrity profile for an mddev 1971 * 1972 * This is called when an array is started and after a disk has been kicked 1973 * from the array. It only succeeds if all working and active component devices 1974 * are integrity capable with matching profiles. 1975 */ 1976 int md_integrity_register(struct mddev *mddev) 1977 { 1978 struct md_rdev *rdev, *reference = NULL; 1979 1980 if (list_empty(&mddev->disks)) 1981 return 0; /* nothing to do */ 1982 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1983 return 0; /* shouldn't register, or already is */ 1984 rdev_for_each(rdev, mddev) { 1985 /* skip spares and non-functional disks */ 1986 if (test_bit(Faulty, &rdev->flags)) 1987 continue; 1988 if (rdev->raid_disk < 0) 1989 continue; 1990 if (!reference) { 1991 /* Use the first rdev as the reference */ 1992 reference = rdev; 1993 continue; 1994 } 1995 /* does this rdev's profile match the reference profile? */ 1996 if (blk_integrity_compare(reference->bdev->bd_disk, 1997 rdev->bdev->bd_disk) < 0) 1998 return -EINVAL; 1999 } 2000 if (!reference || !bdev_get_integrity(reference->bdev)) 2001 return 0; 2002 /* 2003 * All component devices are integrity capable and have matching 2004 * profiles, register the common profile for the md device. 
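 * The profile describes the protection information (e.g. T10 DIF/DIX
 * checksums) that the block layer can generate and verify per bio;
 * advertising it on the md gendisk is only safe because every active
 * member was just checked above to expose a compatible profile.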
2005 */ 2006 blk_integrity_register(mddev->gendisk, 2007 bdev_get_integrity(reference->bdev)); 2008 2009 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2010 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2011 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2012 mdname(mddev)); 2013 return -EINVAL; 2014 } 2015 return 0; 2016 } 2017 EXPORT_SYMBOL(md_integrity_register); 2018 2019 /* Disable data integrity if non-capable/non-matching disk is being added */ 2020 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2021 { 2022 struct blk_integrity *bi_rdev; 2023 struct blk_integrity *bi_mddev; 2024 2025 if (!mddev->gendisk) 2026 return; 2027 2028 bi_rdev = bdev_get_integrity(rdev->bdev); 2029 bi_mddev = blk_get_integrity(mddev->gendisk); 2030 2031 if (!bi_mddev) /* nothing to do */ 2032 return; 2033 if (rdev->raid_disk < 0) /* skip spares */ 2034 return; 2035 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 2036 rdev->bdev->bd_disk) >= 0) 2037 return; 2038 WARN_ON_ONCE(!mddev->suspended); 2039 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 2040 blk_integrity_unregister(mddev->gendisk); 2041 } 2042 EXPORT_SYMBOL(md_integrity_add_rdev); 2043 2044 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2045 { 2046 char b[BDEVNAME_SIZE]; 2047 struct kobject *ko; 2048 int err; 2049 2050 /* prevent duplicates */ 2051 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2052 return -EEXIST; 2053 2054 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2055 if (rdev->sectors && (mddev->dev_sectors == 0 || 2056 rdev->sectors < mddev->dev_sectors)) { 2057 if (mddev->pers) { 2058 /* Cannot change size, so fail 2059 * If mddev->level <= 0, then we don't care 2060 * about aligning sizes (e.g. linear) 2061 */ 2062 if (mddev->level > 0) 2063 return -ENOSPC; 2064 } else 2065 mddev->dev_sectors = rdev->sectors; 2066 } 2067 2068 /* Verify rdev->desc_nr is unique. 
2069 * If it is -1, assign a free number, else 2070 * check number is not in use 2071 */ 2072 rcu_read_lock(); 2073 if (rdev->desc_nr < 0) { 2074 int choice = 0; 2075 if (mddev->pers) 2076 choice = mddev->raid_disks; 2077 while (md_find_rdev_nr_rcu(mddev, choice)) 2078 choice++; 2079 rdev->desc_nr = choice; 2080 } else { 2081 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2082 rcu_read_unlock(); 2083 return -EBUSY; 2084 } 2085 } 2086 rcu_read_unlock(); 2087 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2088 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2089 mdname(mddev), mddev->max_disks); 2090 return -EBUSY; 2091 } 2092 bdevname(rdev->bdev,b); 2093 strreplace(b, '/', '!'); 2094 2095 rdev->mddev = mddev; 2096 printk(KERN_INFO "md: bind<%s>\n", b); 2097 2098 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2099 goto fail; 2100 2101 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2102 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2103 /* failure here is OK */; 2104 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2105 2106 list_add_rcu(&rdev->same_set, &mddev->disks); 2107 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2108 2109 /* May as well allow recovery to be retried once */ 2110 mddev->recovery_disabled++; 2111 2112 return 0; 2113 2114 fail: 2115 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2116 b, mdname(mddev)); 2117 return err; 2118 } 2119 2120 static void md_delayed_delete(struct work_struct *ws) 2121 { 2122 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2123 kobject_del(&rdev->kobj); 2124 kobject_put(&rdev->kobj); 2125 } 2126 2127 static void unbind_rdev_from_array(struct md_rdev *rdev) 2128 { 2129 char b[BDEVNAME_SIZE]; 2130 2131 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2132 list_del_rcu(&rdev->same_set); 2133 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2134 rdev->mddev = NULL; 2135 sysfs_remove_link(&rdev->kobj, "block"); 2136 sysfs_put(rdev->sysfs_state); 2137 rdev->sysfs_state = NULL; 2138 rdev->badblocks.count = 0; 2139 /* We need to delay this, otherwise we can deadlock when 2140 * writing to 'remove' to "dev/state". We also need 2141 * to delay it due to rcu usage. 2142 */ 2143 synchronize_rcu(); 2144 INIT_WORK(&rdev->del_work, md_delayed_delete); 2145 kobject_get(&rdev->kobj); 2146 queue_work(md_misc_wq, &rdev->del_work); 2147 } 2148 2149 /* 2150 * prevent the device from being mounted, repartitioned or 2151 * otherwise reused by a RAID array (or any other kernel 2152 * subsystem), by bd_claiming the device. 2153 */ 2154 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2155 { 2156 int err = 0; 2157 struct block_device *bdev; 2158 char b[BDEVNAME_SIZE]; 2159 2160 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2161 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2162 if (IS_ERR(bdev)) { 2163 printk(KERN_ERR "md: could not open %s.\n", 2164 __bdevname(dev, b)); 2165 return PTR_ERR(bdev); 2166 } 2167 rdev->bdev = bdev; 2168 return err; 2169 } 2170 2171 static void unlock_rdev(struct md_rdev *rdev) 2172 { 2173 struct block_device *bdev = rdev->bdev; 2174 rdev->bdev = NULL; 2175 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2176 } 2177 2178 void md_autodetect_dev(dev_t dev); 2179 2180 static void export_rdev(struct md_rdev *rdev) 2181 { 2182 char b[BDEVNAME_SIZE]; 2183 2184 printk(KERN_INFO "md: export_rdev(%s)\n", 2185 bdevname(rdev->bdev,b)); 2186 md_rdev_clear(rdev); 2187 #ifndef MODULE 2188 if (test_bit(AutoDetected, &rdev->flags)) 2189 md_autodetect_dev(rdev->bdev->bd_dev); 2190 #endif 2191 unlock_rdev(rdev); 2192 kobject_put(&rdev->kobj); 2193 } 2194 2195 void md_kick_rdev_from_array(struct md_rdev *rdev) 2196 { 2197 unbind_rdev_from_array(rdev); 2198 export_rdev(rdev); 2199 } 2200 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2201 2202 static void export_array(struct mddev *mddev) 2203 { 2204 struct md_rdev *rdev; 2205 2206 while (!list_empty(&mddev->disks)) { 2207 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2208 same_set); 2209 md_kick_rdev_from_array(rdev); 2210 } 2211 mddev->raid_disks = 0; 2212 mddev->major_version = 0; 2213 } 2214 2215 static void sync_sbs(struct mddev *mddev, int nospares) 2216 { 2217 /* Update each superblock (in-memory image), but 2218 * if we are allowed to, skip spares which already 2219 * have the right event counter, or have one earlier 2220 * (which would mean they aren't being marked as dirty 2221 * with the rest of the array) 2222 */ 2223 struct md_rdev *rdev; 2224 rdev_for_each(rdev, mddev) { 2225 if (rdev->sb_events == mddev->events || 2226 (nospares && 2227 rdev->raid_disk < 0 && 2228 rdev->sb_events+1 == mddev->events)) { 2229 /* Don't update this superblock */ 2230 rdev->sb_loaded = 2; 2231 } else { 2232 sync_super(mddev, rdev); 2233 rdev->sb_loaded = 1; 2234 } 2235 } 2236 } 2237 2238 static bool does_sb_need_changing(struct mddev *mddev) 2239 { 2240 struct md_rdev *rdev; 2241 struct mdp_superblock_1 *sb; 2242 int role; 2243 2244 /* Find a good rdev */ 2245 rdev_for_each(rdev, mddev) 2246 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2247 break; 2248 2249 /* No good device found. */ 2250 if (!rdev) 2251 return false; 2252 2253 sb = page_address(rdev->sb_page); 2254 /* Check if a device has become faulty or a spare become active */ 2255 rdev_for_each(rdev, mddev) { 2256 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2257 /* Device activated? */ 2258 if (role == 0xffff && rdev->raid_disk >=0 && 2259 !test_bit(Faulty, &rdev->flags)) 2260 return true; 2261 /* Device turned faulty? 
 */
		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
	    (mddev->layout != le32_to_cpu(sb->layout)) ||
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_CHANGE_DEVS, &mddev->flags);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
			force_change = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			return;
		}
	}
repeat:
	/* First make sure individual recovery_offsets are correct */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
			rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
		if (!mddev->external) {
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					md_ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = get_seconds();

	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
		force_change = 1;
	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
		/* just a clean <-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
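		 * (Note that the even/odd trick below lets a clean->dirty->clean
		 * bounce step the event count e.g. 100 -> 101 -> 100, so spare
		 * superblocks still recording 100 stay valid without a rewrite;
		 * a stale, removed device left at 100 would look just as valid,
		 * which is why nospares is forced off for degraded arrays.)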
2357 */ 2358 nospares = 0; 2359 2360 sync_req = mddev->in_sync; 2361 2362 /* If this is just a dirty<->clean transition, and the array is clean 2363 * and 'events' is odd, we can roll back to the previous clean state */ 2364 if (nospares 2365 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2366 && mddev->can_decrease_events 2367 && mddev->events != 1) { 2368 mddev->events--; 2369 mddev->can_decrease_events = 0; 2370 } else { 2371 /* otherwise we have to go forward and ... */ 2372 mddev->events ++; 2373 mddev->can_decrease_events = nospares; 2374 } 2375 2376 /* 2377 * This 64-bit counter should never wrap. 2378 * Either we are in around ~1 trillion A.C., assuming 2379 * 1 reboot per second, or we have a bug... 2380 */ 2381 WARN_ON(mddev->events == 0); 2382 2383 rdev_for_each(rdev, mddev) { 2384 if (rdev->badblocks.changed) 2385 any_badblocks_changed++; 2386 if (test_bit(Faulty, &rdev->flags)) 2387 set_bit(FaultRecorded, &rdev->flags); 2388 } 2389 2390 sync_sbs(mddev, nospares); 2391 spin_unlock(&mddev->lock); 2392 2393 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2394 mdname(mddev), mddev->in_sync); 2395 2396 bitmap_update_sb(mddev->bitmap); 2397 rdev_for_each(rdev, mddev) { 2398 char b[BDEVNAME_SIZE]; 2399 2400 if (rdev->sb_loaded != 1) 2401 continue; /* no noise on spare devices */ 2402 2403 if (!test_bit(Faulty, &rdev->flags)) { 2404 md_super_write(mddev,rdev, 2405 rdev->sb_start, rdev->sb_size, 2406 rdev->sb_page); 2407 pr_debug("md: (write) %s's sb offset: %llu\n", 2408 bdevname(rdev->bdev, b), 2409 (unsigned long long)rdev->sb_start); 2410 rdev->sb_events = mddev->events; 2411 if (rdev->badblocks.size) { 2412 md_super_write(mddev, rdev, 2413 rdev->badblocks.sector, 2414 rdev->badblocks.size << 9, 2415 rdev->bb_page); 2416 rdev->badblocks.size = 0; 2417 } 2418 2419 } else 2420 pr_debug("md: %s (skipping faulty)\n", 2421 bdevname(rdev->bdev, b)); 2422 2423 if (mddev->level == LEVEL_MULTIPATH) 2424 /* only need to write one superblock... */ 2425 break; 2426 } 2427 md_super_wait(mddev); 2428 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2429 2430 spin_lock(&mddev->lock); 2431 if (mddev->in_sync != sync_req || 2432 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2433 /* have to write it out again */ 2434 spin_unlock(&mddev->lock); 2435 goto repeat; 2436 } 2437 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2438 spin_unlock(&mddev->lock); 2439 wake_up(&mddev->sb_wait); 2440 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2441 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2442 2443 rdev_for_each(rdev, mddev) { 2444 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2445 clear_bit(Blocked, &rdev->flags); 2446 2447 if (any_badblocks_changed) 2448 md_ack_all_badblocks(&rdev->badblocks); 2449 clear_bit(BlockedBadBlocks, &rdev->flags); 2450 wake_up(&rdev->blocked_wait); 2451 } 2452 2453 if (mddev_is_clustered(mddev) && ret == 0) 2454 md_cluster_ops->metadata_update_finish(mddev); 2455 } 2456 EXPORT_SYMBOL(md_update_sb); 2457 2458 static int add_bound_rdev(struct md_rdev *rdev) 2459 { 2460 struct mddev *mddev = rdev->mddev; 2461 int err = 0; 2462 2463 if (!mddev->pers->hot_remove_disk) { 2464 /* If there is hot_add_disk but no hot_remove_disk 2465 * then added disks for geometry changes, 2466 * and should be added immediately. 2467 */ 2468 super_types[mddev->major_version]. 
2469 validate_super(mddev, rdev); 2470 err = mddev->pers->hot_add_disk(mddev, rdev); 2471 if (err) { 2472 unbind_rdev_from_array(rdev); 2473 export_rdev(rdev); 2474 return err; 2475 } 2476 } 2477 sysfs_notify_dirent_safe(rdev->sysfs_state); 2478 2479 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2480 if (mddev->degraded) 2481 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2482 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2483 md_new_event(mddev); 2484 md_wakeup_thread(mddev->thread); 2485 return 0; 2486 } 2487 2488 /* words written to sysfs files may, or may not, be \n terminated. 2489 * We want to accept with case. For this we use cmd_match. 2490 */ 2491 static int cmd_match(const char *cmd, const char *str) 2492 { 2493 /* See if cmd, written into a sysfs file, matches 2494 * str. They must either be the same, or cmd can 2495 * have a trailing newline 2496 */ 2497 while (*cmd && *str && *cmd == *str) { 2498 cmd++; 2499 str++; 2500 } 2501 if (*cmd == '\n') 2502 cmd++; 2503 if (*str || *cmd) 2504 return 0; 2505 return 1; 2506 } 2507 2508 struct rdev_sysfs_entry { 2509 struct attribute attr; 2510 ssize_t (*show)(struct md_rdev *, char *); 2511 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2512 }; 2513 2514 static ssize_t 2515 state_show(struct md_rdev *rdev, char *page) 2516 { 2517 char *sep = ""; 2518 size_t len = 0; 2519 unsigned long flags = ACCESS_ONCE(rdev->flags); 2520 2521 if (test_bit(Faulty, &flags) || 2522 rdev->badblocks.unacked_exist) { 2523 len+= sprintf(page+len, "%sfaulty",sep); 2524 sep = ","; 2525 } 2526 if (test_bit(In_sync, &flags)) { 2527 len += sprintf(page+len, "%sin_sync",sep); 2528 sep = ","; 2529 } 2530 if (test_bit(Journal, &flags)) { 2531 len += sprintf(page+len, "%sjournal",sep); 2532 sep = ","; 2533 } 2534 if (test_bit(WriteMostly, &flags)) { 2535 len += sprintf(page+len, "%swrite_mostly",sep); 2536 sep = ","; 2537 } 2538 if (test_bit(Blocked, &flags) || 2539 (rdev->badblocks.unacked_exist 2540 && !test_bit(Faulty, &flags))) { 2541 len += sprintf(page+len, "%sblocked", sep); 2542 sep = ","; 2543 } 2544 if (!test_bit(Faulty, &flags) && 2545 !test_bit(Journal, &flags) && 2546 !test_bit(In_sync, &flags)) { 2547 len += sprintf(page+len, "%sspare", sep); 2548 sep = ","; 2549 } 2550 if (test_bit(WriteErrorSeen, &flags)) { 2551 len += sprintf(page+len, "%swrite_error", sep); 2552 sep = ","; 2553 } 2554 if (test_bit(WantReplacement, &flags)) { 2555 len += sprintf(page+len, "%swant_replacement", sep); 2556 sep = ","; 2557 } 2558 if (test_bit(Replacement, &flags)) { 2559 len += sprintf(page+len, "%sreplacement", sep); 2560 sep = ","; 2561 } 2562 2563 return len+sprintf(page+len, "\n"); 2564 } 2565 2566 static ssize_t 2567 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2568 { 2569 /* can write 2570 * faulty - simulates an error 2571 * remove - disconnects the device 2572 * writemostly - sets write_mostly 2573 * -writemostly - clears write_mostly 2574 * blocked - sets the Blocked flags 2575 * -blocked - clears the Blocked and possibly simulates an error 2576 * insync - sets Insync providing device isn't active 2577 * -insync - clear Insync for a device with a slot assigned, 2578 * so that it gets rebuilt based on bitmap 2579 * write_error - sets WriteErrorSeen 2580 * -write_error - clears WriteErrorSeen 2581 */ 2582 int err = -EINVAL; 2583 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2584 md_error(rdev->mddev, rdev); 2585 if (test_bit(Faulty, &rdev->flags)) 2586 err = 0; 2587 else 2588 err = -EBUSY; 2589 } else if (cmd_match(buf, "remove")) { 
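		/* "remove" detaches an rdev that no longer holds a raid slot:
		 * in a clustered array the removal is first propagated through
		 * md_cluster_ops->remove_disk(), then the rdev is kicked from
		 * the array and, if a personality is running, the superblocks
		 * are rewritten to record the change.
		 */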
2590 if (rdev->raid_disk >= 0) 2591 err = -EBUSY; 2592 else { 2593 struct mddev *mddev = rdev->mddev; 2594 err = 0; 2595 if (mddev_is_clustered(mddev)) 2596 err = md_cluster_ops->remove_disk(mddev, rdev); 2597 2598 if (err == 0) { 2599 md_kick_rdev_from_array(rdev); 2600 if (mddev->pers) 2601 md_update_sb(mddev, 1); 2602 md_new_event(mddev); 2603 } 2604 } 2605 } else if (cmd_match(buf, "writemostly")) { 2606 set_bit(WriteMostly, &rdev->flags); 2607 err = 0; 2608 } else if (cmd_match(buf, "-writemostly")) { 2609 clear_bit(WriteMostly, &rdev->flags); 2610 err = 0; 2611 } else if (cmd_match(buf, "blocked")) { 2612 set_bit(Blocked, &rdev->flags); 2613 err = 0; 2614 } else if (cmd_match(buf, "-blocked")) { 2615 if (!test_bit(Faulty, &rdev->flags) && 2616 rdev->badblocks.unacked_exist) { 2617 /* metadata handler doesn't understand badblocks, 2618 * so we need to fail the device 2619 */ 2620 md_error(rdev->mddev, rdev); 2621 } 2622 clear_bit(Blocked, &rdev->flags); 2623 clear_bit(BlockedBadBlocks, &rdev->flags); 2624 wake_up(&rdev->blocked_wait); 2625 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2626 md_wakeup_thread(rdev->mddev->thread); 2627 2628 err = 0; 2629 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2630 set_bit(In_sync, &rdev->flags); 2631 err = 0; 2632 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2633 !test_bit(Journal, &rdev->flags)) { 2634 if (rdev->mddev->pers == NULL) { 2635 clear_bit(In_sync, &rdev->flags); 2636 rdev->saved_raid_disk = rdev->raid_disk; 2637 rdev->raid_disk = -1; 2638 err = 0; 2639 } 2640 } else if (cmd_match(buf, "write_error")) { 2641 set_bit(WriteErrorSeen, &rdev->flags); 2642 err = 0; 2643 } else if (cmd_match(buf, "-write_error")) { 2644 clear_bit(WriteErrorSeen, &rdev->flags); 2645 err = 0; 2646 } else if (cmd_match(buf, "want_replacement")) { 2647 /* Any non-spare device that is not a replacement can 2648 * become want_replacement at any time, but we then need to 2649 * check if recovery is needed. 2650 */ 2651 if (rdev->raid_disk >= 0 && 2652 !test_bit(Journal, &rdev->flags) && 2653 !test_bit(Replacement, &rdev->flags)) 2654 set_bit(WantReplacement, &rdev->flags); 2655 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2656 md_wakeup_thread(rdev->mddev->thread); 2657 err = 0; 2658 } else if (cmd_match(buf, "-want_replacement")) { 2659 /* Clearing 'want_replacement' is always allowed. 2660 * Once replacements starts it is too late though. 2661 */ 2662 err = 0; 2663 clear_bit(WantReplacement, &rdev->flags); 2664 } else if (cmd_match(buf, "replacement")) { 2665 /* Can only set a device as a replacement when array has not 2666 * yet been started. Once running, replacement is automatic 2667 * from spares, or by assigning 'slot'. 2668 */ 2669 if (rdev->mddev->pers) 2670 err = -EBUSY; 2671 else { 2672 set_bit(Replacement, &rdev->flags); 2673 err = 0; 2674 } 2675 } else if (cmd_match(buf, "-replacement")) { 2676 /* Similarly, can only clear Replacement before start */ 2677 if (rdev->mddev->pers) 2678 err = -EBUSY; 2679 else { 2680 clear_bit(Replacement, &rdev->flags); 2681 err = 0; 2682 } 2683 } else if (cmd_match(buf, "re-add")) { 2684 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2685 /* clear_bit is performed _after_ all the devices 2686 * have their local Faulty bit cleared. 
If any writes 2687 * happen in the meantime in the local node, they 2688 * will land in the local bitmap, which will be synced 2689 * by this node eventually 2690 */ 2691 if (!mddev_is_clustered(rdev->mddev) || 2692 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2693 clear_bit(Faulty, &rdev->flags); 2694 err = add_bound_rdev(rdev); 2695 } 2696 } else 2697 err = -EBUSY; 2698 } 2699 if (!err) 2700 sysfs_notify_dirent_safe(rdev->sysfs_state); 2701 return err ? err : len; 2702 } 2703 static struct rdev_sysfs_entry rdev_state = 2704 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2705 2706 static ssize_t 2707 errors_show(struct md_rdev *rdev, char *page) 2708 { 2709 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2710 } 2711 2712 static ssize_t 2713 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2714 { 2715 unsigned int n; 2716 int rv; 2717 2718 rv = kstrtouint(buf, 10, &n); 2719 if (rv < 0) 2720 return rv; 2721 atomic_set(&rdev->corrected_errors, n); 2722 return len; 2723 } 2724 static struct rdev_sysfs_entry rdev_errors = 2725 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2726 2727 static ssize_t 2728 slot_show(struct md_rdev *rdev, char *page) 2729 { 2730 if (test_bit(Journal, &rdev->flags)) 2731 return sprintf(page, "journal\n"); 2732 else if (rdev->raid_disk < 0) 2733 return sprintf(page, "none\n"); 2734 else 2735 return sprintf(page, "%d\n", rdev->raid_disk); 2736 } 2737 2738 static ssize_t 2739 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2740 { 2741 int slot; 2742 int err; 2743 2744 if (test_bit(Journal, &rdev->flags)) 2745 return -EBUSY; 2746 if (strncmp(buf, "none", 4)==0) 2747 slot = -1; 2748 else { 2749 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2750 if (err < 0) 2751 return err; 2752 } 2753 if (rdev->mddev->pers && slot == -1) { 2754 /* Setting 'slot' on an active array requires also 2755 * updating the 'rd%d' link, and communicating 2756 * with the personality with ->hot_*_disk. 2757 * For now we only support removing 2758 * failed/spare devices. This normally happens automatically, 2759 * but not when the metadata is externally managed. 2760 */ 2761 if (rdev->raid_disk == -1) 2762 return -EEXIST; 2763 /* personality does all needed checks */ 2764 if (rdev->mddev->pers->hot_remove_disk == NULL) 2765 return -EINVAL; 2766 clear_bit(Blocked, &rdev->flags); 2767 remove_and_add_spares(rdev->mddev, rdev); 2768 if (rdev->raid_disk >= 0) 2769 return -EBUSY; 2770 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2771 md_wakeup_thread(rdev->mddev->thread); 2772 } else if (rdev->mddev->pers) { 2773 /* Activating a spare .. or possibly reactivating 2774 * if we ever get bitmaps working here. 2775 */ 2776 2777 if (rdev->raid_disk != -1) 2778 return -EBUSY; 2779 2780 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2781 return -EBUSY; 2782 2783 if (rdev->mddev->pers->hot_add_disk == NULL) 2784 return -EINVAL; 2785 2786 if (slot >= rdev->mddev->raid_disks && 2787 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2788 return -ENOSPC; 2789 2790 rdev->raid_disk = slot; 2791 if (test_bit(In_sync, &rdev->flags)) 2792 rdev->saved_raid_disk = slot; 2793 else 2794 rdev->saved_raid_disk = -1; 2795 clear_bit(In_sync, &rdev->flags); 2796 clear_bit(Bitmap_sync, &rdev->flags); 2797 remove_and_add_spares(rdev->mddev, rdev); 2798 if (rdev->raid_disk == -1) 2799 return -EBUSY; 2800 /* don't wakeup anyone, leave that to userspace. 
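		 * (For reference, writing a slot number such as "2" to the
		 * rdev's sysfs 'slot' attribute is what lands in this branch
		 * and asks the personality to activate the device in that
		 * slot, while writing "none" releases the slot again.)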
*/ 2801 } else { 2802 if (slot >= rdev->mddev->raid_disks && 2803 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2804 return -ENOSPC; 2805 rdev->raid_disk = slot; 2806 /* assume it is working */ 2807 clear_bit(Faulty, &rdev->flags); 2808 clear_bit(WriteMostly, &rdev->flags); 2809 set_bit(In_sync, &rdev->flags); 2810 sysfs_notify_dirent_safe(rdev->sysfs_state); 2811 } 2812 return len; 2813 } 2814 2815 static struct rdev_sysfs_entry rdev_slot = 2816 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2817 2818 static ssize_t 2819 offset_show(struct md_rdev *rdev, char *page) 2820 { 2821 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2822 } 2823 2824 static ssize_t 2825 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2826 { 2827 unsigned long long offset; 2828 if (kstrtoull(buf, 10, &offset) < 0) 2829 return -EINVAL; 2830 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2831 return -EBUSY; 2832 if (rdev->sectors && rdev->mddev->external) 2833 /* Must set offset before size, so overlap checks 2834 * can be sane */ 2835 return -EBUSY; 2836 rdev->data_offset = offset; 2837 rdev->new_data_offset = offset; 2838 return len; 2839 } 2840 2841 static struct rdev_sysfs_entry rdev_offset = 2842 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2843 2844 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2845 { 2846 return sprintf(page, "%llu\n", 2847 (unsigned long long)rdev->new_data_offset); 2848 } 2849 2850 static ssize_t new_offset_store(struct md_rdev *rdev, 2851 const char *buf, size_t len) 2852 { 2853 unsigned long long new_offset; 2854 struct mddev *mddev = rdev->mddev; 2855 2856 if (kstrtoull(buf, 10, &new_offset) < 0) 2857 return -EINVAL; 2858 2859 if (mddev->sync_thread || 2860 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2861 return -EBUSY; 2862 if (new_offset == rdev->data_offset) 2863 /* reset is always permitted */ 2864 ; 2865 else if (new_offset > rdev->data_offset) { 2866 /* must not push array size beyond rdev_sectors */ 2867 if (new_offset - rdev->data_offset 2868 + mddev->dev_sectors > rdev->sectors) 2869 return -E2BIG; 2870 } 2871 /* Metadata worries about other space details. */ 2872 2873 /* decreasing the offset is inconsistent with a backwards 2874 * reshape. 2875 */ 2876 if (new_offset < rdev->data_offset && 2877 mddev->reshape_backwards) 2878 return -EINVAL; 2879 /* Increasing offset is inconsistent with forwards 2880 * reshape. reshape_direction should be set to 2881 * 'backwards' first. 
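	 * (So a larger new_offset is only accepted once reshape_backwards is
	 * already set, and a smaller one only while it is clear; the
	 * assignments further down then keep the flag and the offsets in
	 * step.)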
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}

static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}

static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}

static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	struct mddev *my_mddev = rdev->mddev;
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		if (my_mddev->persistent) {
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
				return -EBUSY;
		} else if (!sectors)
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */

	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
		 */
		struct mddev *mddev;
		int overlap = 0;
		struct list_head *tmp;

		rcu_read_lock();
		for_each_mddev(mddev, tmp) {
			struct md_rdev *rdev2;

			rdev_for_each(rdev2, mddev)
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
		rcu_read_unlock();
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
3000 * We put oldsectors back because we *know* it is 3001 * safe, and trust userspace not to race with 3002 * itself 3003 */ 3004 rdev->sectors = oldsectors; 3005 return -EBUSY; 3006 } 3007 } 3008 return len; 3009 } 3010 3011 static struct rdev_sysfs_entry rdev_size = 3012 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3013 3014 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3015 { 3016 unsigned long long recovery_start = rdev->recovery_offset; 3017 3018 if (test_bit(In_sync, &rdev->flags) || 3019 recovery_start == MaxSector) 3020 return sprintf(page, "none\n"); 3021 3022 return sprintf(page, "%llu\n", recovery_start); 3023 } 3024 3025 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3026 { 3027 unsigned long long recovery_start; 3028 3029 if (cmd_match(buf, "none")) 3030 recovery_start = MaxSector; 3031 else if (kstrtoull(buf, 10, &recovery_start)) 3032 return -EINVAL; 3033 3034 if (rdev->mddev->pers && 3035 rdev->raid_disk >= 0) 3036 return -EBUSY; 3037 3038 rdev->recovery_offset = recovery_start; 3039 if (recovery_start == MaxSector) 3040 set_bit(In_sync, &rdev->flags); 3041 else 3042 clear_bit(In_sync, &rdev->flags); 3043 return len; 3044 } 3045 3046 static struct rdev_sysfs_entry rdev_recovery_start = 3047 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3048 3049 static ssize_t 3050 badblocks_show(struct badblocks *bb, char *page, int unack); 3051 static ssize_t 3052 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 3053 3054 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3055 { 3056 return badblocks_show(&rdev->badblocks, page, 0); 3057 } 3058 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3059 { 3060 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3061 /* Maybe that ack was all we needed */ 3062 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3063 wake_up(&rdev->blocked_wait); 3064 return rv; 3065 } 3066 static struct rdev_sysfs_entry rdev_bad_blocks = 3067 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3068 3069 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3070 { 3071 return badblocks_show(&rdev->badblocks, page, 1); 3072 } 3073 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3074 { 3075 return badblocks_store(&rdev->badblocks, page, len, 1); 3076 } 3077 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3078 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3079 3080 static struct attribute *rdev_default_attrs[] = { 3081 &rdev_state.attr, 3082 &rdev_errors.attr, 3083 &rdev_slot.attr, 3084 &rdev_offset.attr, 3085 &rdev_new_offset.attr, 3086 &rdev_size.attr, 3087 &rdev_recovery_start.attr, 3088 &rdev_bad_blocks.attr, 3089 &rdev_unack_bad_blocks.attr, 3090 NULL, 3091 }; 3092 static ssize_t 3093 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3094 { 3095 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3096 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3097 3098 if (!entry->show) 3099 return -EIO; 3100 if (!rdev->mddev) 3101 return -EBUSY; 3102 return entry->show(rdev, page); 3103 } 3104 3105 static ssize_t 3106 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3107 const char *page, size_t length) 3108 { 3109 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3110 struct md_rdev *rdev = 
container_of(kobj, struct md_rdev, kobj); 3111 ssize_t rv; 3112 struct mddev *mddev = rdev->mddev; 3113 3114 if (!entry->store) 3115 return -EIO; 3116 if (!capable(CAP_SYS_ADMIN)) 3117 return -EACCES; 3118 rv = mddev ? mddev_lock(mddev): -EBUSY; 3119 if (!rv) { 3120 if (rdev->mddev == NULL) 3121 rv = -EBUSY; 3122 else 3123 rv = entry->store(rdev, page, length); 3124 mddev_unlock(mddev); 3125 } 3126 return rv; 3127 } 3128 3129 static void rdev_free(struct kobject *ko) 3130 { 3131 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3132 kfree(rdev); 3133 } 3134 static const struct sysfs_ops rdev_sysfs_ops = { 3135 .show = rdev_attr_show, 3136 .store = rdev_attr_store, 3137 }; 3138 static struct kobj_type rdev_ktype = { 3139 .release = rdev_free, 3140 .sysfs_ops = &rdev_sysfs_ops, 3141 .default_attrs = rdev_default_attrs, 3142 }; 3143 3144 int md_rdev_init(struct md_rdev *rdev) 3145 { 3146 rdev->desc_nr = -1; 3147 rdev->saved_raid_disk = -1; 3148 rdev->raid_disk = -1; 3149 rdev->flags = 0; 3150 rdev->data_offset = 0; 3151 rdev->new_data_offset = 0; 3152 rdev->sb_events = 0; 3153 rdev->last_read_error.tv_sec = 0; 3154 rdev->last_read_error.tv_nsec = 0; 3155 rdev->sb_loaded = 0; 3156 rdev->bb_page = NULL; 3157 atomic_set(&rdev->nr_pending, 0); 3158 atomic_set(&rdev->read_errors, 0); 3159 atomic_set(&rdev->corrected_errors, 0); 3160 3161 INIT_LIST_HEAD(&rdev->same_set); 3162 init_waitqueue_head(&rdev->blocked_wait); 3163 3164 /* Add space to store bad block list. 3165 * This reserves the space even on arrays where it cannot 3166 * be used - I wonder if that matters 3167 */ 3168 rdev->badblocks.count = 0; 3169 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ 3170 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); 3171 seqlock_init(&rdev->badblocks.lock); 3172 if (rdev->badblocks.page == NULL) 3173 return -ENOMEM; 3174 3175 return 0; 3176 } 3177 EXPORT_SYMBOL_GPL(md_rdev_init); 3178 /* 3179 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3180 * 3181 * mark the device faulty if: 3182 * 3183 * - the device is nonexistent (zero size) 3184 * - the device has no valid superblock 3185 * 3186 * a faulty rdev _never_ has rdev->sb set. 3187 */ 3188 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3189 { 3190 char b[BDEVNAME_SIZE]; 3191 int err; 3192 struct md_rdev *rdev; 3193 sector_t size; 3194 3195 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3196 if (!rdev) { 3197 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3198 return ERR_PTR(-ENOMEM); 3199 } 3200 3201 err = md_rdev_init(rdev); 3202 if (err) 3203 goto abort_free; 3204 err = alloc_disk_sb(rdev); 3205 if (err) 3206 goto abort_free; 3207 3208 err = lock_rdev(rdev, newdev, super_format == -2); 3209 if (err) 3210 goto abort_free; 3211 3212 kobject_init(&rdev->kobj, &rdev_ktype); 3213 3214 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3215 if (!size) { 3216 printk(KERN_WARNING 3217 "md: %s has zero or unknown size, marking faulty!\n", 3218 bdevname(rdev->bdev,b)); 3219 err = -EINVAL; 3220 goto abort_free; 3221 } 3222 3223 if (super_format >= 0) { 3224 err = super_types[super_format]. 
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			printk(KERN_WARNING
				"md: %s does not have a valid v%d.%d "
				"superblock, not importing!\n",
				bdevname(rdev->bdev,b),
				super_format, super_minor);
			goto abort_free;
		}
		if (err < 0) {
			printk(KERN_WARNING
				"md: could not read %s's sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}

	return rdev;

abort_free:
	if (rdev->bdev)
		unlock_rdev(rdev);
	md_rdev_clear(rdev);
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

static void analyze_sbs(struct mddev *mddev)
{
	int i;
	struct md_rdev *rdev, *freshest, *tmp;
	char b[BDEVNAME_SIZE];

	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			printk(KERN_ERR
				"md: fatal superblock inconsistency in %s"
				" -- removing from array\n",
				bdevname(rdev->bdev,b));
			md_kick_rdev_from_array(rdev);
		}

	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
			printk(KERN_WARNING
			       "md: %s: %s: only %d devices permitted\n",
			       mdname(mddev), bdevname(rdev->bdev, b),
			       mddev->max_disks);
			md_kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest) {
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
					" from array!\n",
					bdevname(rdev->bdev,b));
				md_kick_rdev_from_array(rdev);
				continue;
			}
		}
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			set_bit(In_sync, &rdev->flags);
		} else if (rdev->raid_disk >=
			   (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
		}
	}
}

/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale' (e.g. "1.5" with a scale of 3
 * yields 1500), all without any floating-point arithmetic.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;
	while (isdigit(*cp) || (*cp == '.'
&& decimals < 0)) { 3332 if (*cp == '.') 3333 decimals = 0; 3334 else if (decimals < scale) { 3335 unsigned int value; 3336 value = *cp - '0'; 3337 result = result * 10 + value; 3338 if (decimals >= 0) 3339 decimals++; 3340 } 3341 cp++; 3342 } 3343 if (*cp == '\n') 3344 cp++; 3345 if (*cp) 3346 return -EINVAL; 3347 if (decimals < 0) 3348 decimals = 0; 3349 while (decimals < scale) { 3350 result *= 10; 3351 decimals ++; 3352 } 3353 *res = result; 3354 return 0; 3355 } 3356 3357 static ssize_t 3358 safe_delay_show(struct mddev *mddev, char *page) 3359 { 3360 int msec = (mddev->safemode_delay*1000)/HZ; 3361 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3362 } 3363 static ssize_t 3364 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3365 { 3366 unsigned long msec; 3367 3368 if (mddev_is_clustered(mddev)) { 3369 pr_info("md: Safemode is disabled for clustered mode\n"); 3370 return -EINVAL; 3371 } 3372 3373 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3374 return -EINVAL; 3375 if (msec == 0) 3376 mddev->safemode_delay = 0; 3377 else { 3378 unsigned long old_delay = mddev->safemode_delay; 3379 unsigned long new_delay = (msec*HZ)/1000; 3380 3381 if (new_delay == 0) 3382 new_delay = 1; 3383 mddev->safemode_delay = new_delay; 3384 if (new_delay < old_delay || old_delay == 0) 3385 mod_timer(&mddev->safemode_timer, jiffies+1); 3386 } 3387 return len; 3388 } 3389 static struct md_sysfs_entry md_safe_delay = 3390 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3391 3392 static ssize_t 3393 level_show(struct mddev *mddev, char *page) 3394 { 3395 struct md_personality *p; 3396 int ret; 3397 spin_lock(&mddev->lock); 3398 p = mddev->pers; 3399 if (p) 3400 ret = sprintf(page, "%s\n", p->name); 3401 else if (mddev->clevel[0]) 3402 ret = sprintf(page, "%s\n", mddev->clevel); 3403 else if (mddev->level != LEVEL_NONE) 3404 ret = sprintf(page, "%d\n", mddev->level); 3405 else 3406 ret = 0; 3407 spin_unlock(&mddev->lock); 3408 return ret; 3409 } 3410 3411 static ssize_t 3412 level_store(struct mddev *mddev, const char *buf, size_t len) 3413 { 3414 char clevel[16]; 3415 ssize_t rv; 3416 size_t slen = len; 3417 struct md_personality *pers, *oldpers; 3418 long level; 3419 void *priv, *oldpriv; 3420 struct md_rdev *rdev; 3421 3422 if (slen == 0 || slen >= sizeof(clevel)) 3423 return -EINVAL; 3424 3425 rv = mddev_lock(mddev); 3426 if (rv) 3427 return rv; 3428 3429 if (mddev->pers == NULL) { 3430 strncpy(mddev->clevel, buf, slen); 3431 if (mddev->clevel[slen-1] == '\n') 3432 slen--; 3433 mddev->clevel[slen] = 0; 3434 mddev->level = LEVEL_NONE; 3435 rv = len; 3436 goto out_unlock; 3437 } 3438 rv = -EROFS; 3439 if (mddev->ro) 3440 goto out_unlock; 3441 3442 /* request to change the personality. Need to ensure: 3443 * - array is not engaged in resync/recovery/reshape 3444 * - old personality can be suspended 3445 * - new personality will access other array. 
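	 *
	 * A successful change goes through pers->takeover(): the new
	 * personality (e.g. raid5 taking over a raid1 or raid6 set) builds
	 * its private state from the current geometry before the old
	 * personality is torn down below.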
3446 */ 3447 3448 rv = -EBUSY; 3449 if (mddev->sync_thread || 3450 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3451 mddev->reshape_position != MaxSector || 3452 mddev->sysfs_active) 3453 goto out_unlock; 3454 3455 rv = -EINVAL; 3456 if (!mddev->pers->quiesce) { 3457 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3458 mdname(mddev), mddev->pers->name); 3459 goto out_unlock; 3460 } 3461 3462 /* Now find the new personality */ 3463 strncpy(clevel, buf, slen); 3464 if (clevel[slen-1] == '\n') 3465 slen--; 3466 clevel[slen] = 0; 3467 if (kstrtol(clevel, 10, &level)) 3468 level = LEVEL_NONE; 3469 3470 if (request_module("md-%s", clevel) != 0) 3471 request_module("md-level-%s", clevel); 3472 spin_lock(&pers_lock); 3473 pers = find_pers(level, clevel); 3474 if (!pers || !try_module_get(pers->owner)) { 3475 spin_unlock(&pers_lock); 3476 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3477 rv = -EINVAL; 3478 goto out_unlock; 3479 } 3480 spin_unlock(&pers_lock); 3481 3482 if (pers == mddev->pers) { 3483 /* Nothing to do! */ 3484 module_put(pers->owner); 3485 rv = len; 3486 goto out_unlock; 3487 } 3488 if (!pers->takeover) { 3489 module_put(pers->owner); 3490 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3491 mdname(mddev), clevel); 3492 rv = -EINVAL; 3493 goto out_unlock; 3494 } 3495 3496 rdev_for_each(rdev, mddev) 3497 rdev->new_raid_disk = rdev->raid_disk; 3498 3499 /* ->takeover must set new_* and/or delta_disks 3500 * if it succeeds, and may set them when it fails. 3501 */ 3502 priv = pers->takeover(mddev); 3503 if (IS_ERR(priv)) { 3504 mddev->new_level = mddev->level; 3505 mddev->new_layout = mddev->layout; 3506 mddev->new_chunk_sectors = mddev->chunk_sectors; 3507 mddev->raid_disks -= mddev->delta_disks; 3508 mddev->delta_disks = 0; 3509 mddev->reshape_backwards = 0; 3510 module_put(pers->owner); 3511 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3512 mdname(mddev), clevel); 3513 rv = PTR_ERR(priv); 3514 goto out_unlock; 3515 } 3516 3517 /* Looks like we have a winner */ 3518 mddev_suspend(mddev); 3519 mddev_detach(mddev); 3520 3521 spin_lock(&mddev->lock); 3522 oldpers = mddev->pers; 3523 oldpriv = mddev->private; 3524 mddev->pers = pers; 3525 mddev->private = priv; 3526 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3527 mddev->level = mddev->new_level; 3528 mddev->layout = mddev->new_layout; 3529 mddev->chunk_sectors = mddev->new_chunk_sectors; 3530 mddev->delta_disks = 0; 3531 mddev->reshape_backwards = 0; 3532 mddev->degraded = 0; 3533 spin_unlock(&mddev->lock); 3534 3535 if (oldpers->sync_request == NULL && 3536 mddev->external) { 3537 /* We are converting from a no-redundancy array 3538 * to a redundancy array and metadata is managed 3539 * externally so we need to be sure that writes 3540 * won't block due to a need to transition 3541 * clean->dirty 3542 * until external management is started. 
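		 * (Hence in_sync is cleared and safemode disabled just below,
		 * leaving the clean/dirty bookkeeping to the external
		 * metadata handler.)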
3543 */ 3544 mddev->in_sync = 0; 3545 mddev->safemode_delay = 0; 3546 mddev->safemode = 0; 3547 } 3548 3549 oldpers->free(mddev, oldpriv); 3550 3551 if (oldpers->sync_request == NULL && 3552 pers->sync_request != NULL) { 3553 /* need to add the md_redundancy_group */ 3554 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3555 printk(KERN_WARNING 3556 "md: cannot register extra attributes for %s\n", 3557 mdname(mddev)); 3558 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3559 } 3560 if (oldpers->sync_request != NULL && 3561 pers->sync_request == NULL) { 3562 /* need to remove the md_redundancy_group */ 3563 if (mddev->to_remove == NULL) 3564 mddev->to_remove = &md_redundancy_group; 3565 } 3566 3567 rdev_for_each(rdev, mddev) { 3568 if (rdev->raid_disk < 0) 3569 continue; 3570 if (rdev->new_raid_disk >= mddev->raid_disks) 3571 rdev->new_raid_disk = -1; 3572 if (rdev->new_raid_disk == rdev->raid_disk) 3573 continue; 3574 sysfs_unlink_rdev(mddev, rdev); 3575 } 3576 rdev_for_each(rdev, mddev) { 3577 if (rdev->raid_disk < 0) 3578 continue; 3579 if (rdev->new_raid_disk == rdev->raid_disk) 3580 continue; 3581 rdev->raid_disk = rdev->new_raid_disk; 3582 if (rdev->raid_disk < 0) 3583 clear_bit(In_sync, &rdev->flags); 3584 else { 3585 if (sysfs_link_rdev(mddev, rdev)) 3586 printk(KERN_WARNING "md: cannot register rd%d" 3587 " for %s after level change\n", 3588 rdev->raid_disk, mdname(mddev)); 3589 } 3590 } 3591 3592 if (pers->sync_request == NULL) { 3593 /* this is now an array without redundancy, so 3594 * it must always be in_sync 3595 */ 3596 mddev->in_sync = 1; 3597 del_timer_sync(&mddev->safemode_timer); 3598 } 3599 blk_set_stacking_limits(&mddev->queue->limits); 3600 pers->run(mddev); 3601 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3602 mddev_resume(mddev); 3603 if (!mddev->thread) 3604 md_update_sb(mddev, 1); 3605 sysfs_notify(&mddev->kobj, NULL, "level"); 3606 md_new_event(mddev); 3607 rv = len; 3608 out_unlock: 3609 mddev_unlock(mddev); 3610 return rv; 3611 } 3612 3613 static struct md_sysfs_entry md_level = 3614 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3615 3616 static ssize_t 3617 layout_show(struct mddev *mddev, char *page) 3618 { 3619 /* just a number, not meaningful for all levels */ 3620 if (mddev->reshape_position != MaxSector && 3621 mddev->layout != mddev->new_layout) 3622 return sprintf(page, "%d (%d)\n", 3623 mddev->new_layout, mddev->layout); 3624 return sprintf(page, "%d\n", mddev->layout); 3625 } 3626 3627 static ssize_t 3628 layout_store(struct mddev *mddev, const char *buf, size_t len) 3629 { 3630 unsigned int n; 3631 int err; 3632 3633 err = kstrtouint(buf, 10, &n); 3634 if (err < 0) 3635 return err; 3636 err = mddev_lock(mddev); 3637 if (err) 3638 return err; 3639 3640 if (mddev->pers) { 3641 if (mddev->pers->check_reshape == NULL) 3642 err = -EBUSY; 3643 else if (mddev->ro) 3644 err = -EROFS; 3645 else { 3646 mddev->new_layout = n; 3647 err = mddev->pers->check_reshape(mddev); 3648 if (err) 3649 mddev->new_layout = mddev->layout; 3650 } 3651 } else { 3652 mddev->new_layout = n; 3653 if (mddev->reshape_position == MaxSector) 3654 mddev->layout = n; 3655 } 3656 mddev_unlock(mddev); 3657 return err ?: len; 3658 } 3659 static struct md_sysfs_entry md_layout = 3660 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3661 3662 static ssize_t 3663 raid_disks_show(struct mddev *mddev, char *page) 3664 { 3665 if (mddev->raid_disks == 0) 3666 return 0; 3667 if (mddev->reshape_position != MaxSector && 3668 mddev->delta_disks != 
0) 3669 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3670 mddev->raid_disks - mddev->delta_disks); 3671 return sprintf(page, "%d\n", mddev->raid_disks); 3672 } 3673 3674 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3675 3676 static ssize_t 3677 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3678 { 3679 unsigned int n; 3680 int err; 3681 3682 err = kstrtouint(buf, 10, &n); 3683 if (err < 0) 3684 return err; 3685 3686 err = mddev_lock(mddev); 3687 if (err) 3688 return err; 3689 if (mddev->pers) 3690 err = update_raid_disks(mddev, n); 3691 else if (mddev->reshape_position != MaxSector) { 3692 struct md_rdev *rdev; 3693 int olddisks = mddev->raid_disks - mddev->delta_disks; 3694 3695 err = -EINVAL; 3696 rdev_for_each(rdev, mddev) { 3697 if (olddisks < n && 3698 rdev->data_offset < rdev->new_data_offset) 3699 goto out_unlock; 3700 if (olddisks > n && 3701 rdev->data_offset > rdev->new_data_offset) 3702 goto out_unlock; 3703 } 3704 err = 0; 3705 mddev->delta_disks = n - olddisks; 3706 mddev->raid_disks = n; 3707 mddev->reshape_backwards = (mddev->delta_disks < 0); 3708 } else 3709 mddev->raid_disks = n; 3710 out_unlock: 3711 mddev_unlock(mddev); 3712 return err ? err : len; 3713 } 3714 static struct md_sysfs_entry md_raid_disks = 3715 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3716 3717 static ssize_t 3718 chunk_size_show(struct mddev *mddev, char *page) 3719 { 3720 if (mddev->reshape_position != MaxSector && 3721 mddev->chunk_sectors != mddev->new_chunk_sectors) 3722 return sprintf(page, "%d (%d)\n", 3723 mddev->new_chunk_sectors << 9, 3724 mddev->chunk_sectors << 9); 3725 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3726 } 3727 3728 static ssize_t 3729 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3730 { 3731 unsigned long n; 3732 int err; 3733 3734 err = kstrtoul(buf, 10, &n); 3735 if (err < 0) 3736 return err; 3737 3738 err = mddev_lock(mddev); 3739 if (err) 3740 return err; 3741 if (mddev->pers) { 3742 if (mddev->pers->check_reshape == NULL) 3743 err = -EBUSY; 3744 else if (mddev->ro) 3745 err = -EROFS; 3746 else { 3747 mddev->new_chunk_sectors = n >> 9; 3748 err = mddev->pers->check_reshape(mddev); 3749 if (err) 3750 mddev->new_chunk_sectors = mddev->chunk_sectors; 3751 } 3752 } else { 3753 mddev->new_chunk_sectors = n >> 9; 3754 if (mddev->reshape_position == MaxSector) 3755 mddev->chunk_sectors = n >> 9; 3756 } 3757 mddev_unlock(mddev); 3758 return err ?: len; 3759 } 3760 static struct md_sysfs_entry md_chunk_size = 3761 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3762 3763 static ssize_t 3764 resync_start_show(struct mddev *mddev, char *page) 3765 { 3766 if (mddev->recovery_cp == MaxSector) 3767 return sprintf(page, "none\n"); 3768 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3769 } 3770 3771 static ssize_t 3772 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3773 { 3774 unsigned long long n; 3775 int err; 3776 3777 if (cmd_match(buf, "none")) 3778 n = MaxSector; 3779 else { 3780 err = kstrtoull(buf, 10, &n); 3781 if (err < 0) 3782 return err; 3783 if (n != (sector_t)n) 3784 return -EINVAL; 3785 } 3786 3787 err = mddev_lock(mddev); 3788 if (err) 3789 return err; 3790 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3791 err = -EBUSY; 3792 3793 if (!err) { 3794 mddev->recovery_cp = n; 3795 if (mddev->pers) 3796 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3797 } 3798 mddev_unlock(mddev); 
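	/* On success mddev->recovery_cp now records the sector at which the
	 * next resync will start; "none" maps to MaxSector, meaning no
	 * resync is pending.
	 */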
3799 return err ?: len; 3800 } 3801 static struct md_sysfs_entry md_resync_start = 3802 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3803 resync_start_show, resync_start_store); 3804 3805 /* 3806 * The array state can be: 3807 * 3808 * clear 3809 * No devices, no size, no level 3810 * Equivalent to STOP_ARRAY ioctl 3811 * inactive 3812 * May have some settings, but array is not active 3813 * all IO results in error 3814 * When written, doesn't tear down array, but just stops it 3815 * suspended (not supported yet) 3816 * All IO requests will block. The array can be reconfigured. 3817 * Writing this, if accepted, will block until array is quiescent 3818 * readonly 3819 * no resync can happen. no superblocks get written. 3820 * write requests fail 3821 * read-auto 3822 * like readonly, but behaves like 'clean' on a write request. 3823 * 3824 * clean - no pending writes, but otherwise active. 3825 * When written to inactive array, starts without resync 3826 * If a write request arrives then 3827 * if metadata is known, mark 'dirty' and switch to 'active'. 3828 * if not known, block and switch to write-pending 3829 * If written to an active array that has pending writes, then fails. 3830 * active 3831 * fully active: IO and resync can be happening. 3832 * When written to inactive array, starts with resync 3833 * 3834 * write-pending 3835 * clean, but writes are blocked waiting for 'active' to be written. 3836 * 3837 * active-idle 3838 * like active, but no writes have been seen for a while (100msec). 3839 * 3840 */ 3841 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3842 write_pending, active_idle, bad_word}; 3843 static char *array_states[] = { 3844 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3845 "write-pending", "active-idle", NULL }; 3846 3847 static int match_word(const char *word, char **list) 3848 { 3849 int n; 3850 for (n=0; list[n]; n++) 3851 if (cmd_match(word, list[n])) 3852 break; 3853 return n; 3854 } 3855 3856 static ssize_t 3857 array_state_show(struct mddev *mddev, char *page) 3858 { 3859 enum array_state st = inactive; 3860 3861 if (mddev->pers) 3862 switch(mddev->ro) { 3863 case 1: 3864 st = readonly; 3865 break; 3866 case 2: 3867 st = read_auto; 3868 break; 3869 case 0: 3870 if (mddev->in_sync) 3871 st = clean; 3872 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3873 st = write_pending; 3874 else if (mddev->safemode) 3875 st = active_idle; 3876 else 3877 st = active; 3878 } 3879 else { 3880 if (list_empty(&mddev->disks) && 3881 mddev->raid_disks == 0 && 3882 mddev->dev_sectors == 0) 3883 st = clear; 3884 else 3885 st = inactive; 3886 } 3887 return sprintf(page, "%s\n", array_states[st]); 3888 } 3889 3890 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 3891 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 3892 static int do_md_run(struct mddev *mddev); 3893 static int restart_array(struct mddev *mddev); 3894 3895 static ssize_t 3896 array_state_store(struct mddev *mddev, const char *buf, size_t len) 3897 { 3898 int err; 3899 enum array_state st = match_word(buf, array_states); 3900 3901 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 3902 /* don't take reconfig_mutex when toggling between 3903 * clean and active 3904 */ 3905 spin_lock(&mddev->lock); 3906 if (st == active) { 3907 restart_array(mddev); 3908 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3909 wake_up(&mddev->sb_wait); 3910 err = 0; 3911 } else /* st == clean 
*/ { 3912 restart_array(mddev); 3913 if (atomic_read(&mddev->writes_pending) == 0) { 3914 if (mddev->in_sync == 0) { 3915 mddev->in_sync = 1; 3916 if (mddev->safemode == 1) 3917 mddev->safemode = 0; 3918 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3919 } 3920 err = 0; 3921 } else 3922 err = -EBUSY; 3923 } 3924 spin_unlock(&mddev->lock); 3925 return err ?: len; 3926 } 3927 err = mddev_lock(mddev); 3928 if (err) 3929 return err; 3930 err = -EINVAL; 3931 switch(st) { 3932 case bad_word: 3933 break; 3934 case clear: 3935 /* stopping an active array */ 3936 err = do_md_stop(mddev, 0, NULL); 3937 break; 3938 case inactive: 3939 /* stopping an active array */ 3940 if (mddev->pers) 3941 err = do_md_stop(mddev, 2, NULL); 3942 else 3943 err = 0; /* already inactive */ 3944 break; 3945 case suspended: 3946 break; /* not supported yet */ 3947 case readonly: 3948 if (mddev->pers) 3949 err = md_set_readonly(mddev, NULL); 3950 else { 3951 mddev->ro = 1; 3952 set_disk_ro(mddev->gendisk, 1); 3953 err = do_md_run(mddev); 3954 } 3955 break; 3956 case read_auto: 3957 if (mddev->pers) { 3958 if (mddev->ro == 0) 3959 err = md_set_readonly(mddev, NULL); 3960 else if (mddev->ro == 1) 3961 err = restart_array(mddev); 3962 if (err == 0) { 3963 mddev->ro = 2; 3964 set_disk_ro(mddev->gendisk, 0); 3965 } 3966 } else { 3967 mddev->ro = 2; 3968 err = do_md_run(mddev); 3969 } 3970 break; 3971 case clean: 3972 if (mddev->pers) { 3973 err = restart_array(mddev); 3974 if (err) 3975 break; 3976 spin_lock(&mddev->lock); 3977 if (atomic_read(&mddev->writes_pending) == 0) { 3978 if (mddev->in_sync == 0) { 3979 mddev->in_sync = 1; 3980 if (mddev->safemode == 1) 3981 mddev->safemode = 0; 3982 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3983 } 3984 err = 0; 3985 } else 3986 err = -EBUSY; 3987 spin_unlock(&mddev->lock); 3988 } else 3989 err = -EINVAL; 3990 break; 3991 case active: 3992 if (mddev->pers) { 3993 err = restart_array(mddev); 3994 if (err) 3995 break; 3996 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3997 wake_up(&mddev->sb_wait); 3998 err = 0; 3999 } else { 4000 mddev->ro = 0; 4001 set_disk_ro(mddev->gendisk, 0); 4002 err = do_md_run(mddev); 4003 } 4004 break; 4005 case write_pending: 4006 case active_idle: 4007 /* these cannot be set */ 4008 break; 4009 } 4010 4011 if (!err) { 4012 if (mddev->hold_active == UNTIL_IOCTL) 4013 mddev->hold_active = 0; 4014 sysfs_notify_dirent_safe(mddev->sysfs_state); 4015 } 4016 mddev_unlock(mddev); 4017 return err ?: len; 4018 } 4019 static struct md_sysfs_entry md_array_state = 4020 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4021 4022 static ssize_t 4023 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4024 return sprintf(page, "%d\n", 4025 atomic_read(&mddev->max_corr_read_errors)); 4026 } 4027 4028 static ssize_t 4029 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4030 { 4031 unsigned int n; 4032 int rv; 4033 4034 rv = kstrtouint(buf, 10, &n); 4035 if (rv < 0) 4036 return rv; 4037 atomic_set(&mddev->max_corr_read_errors, n); 4038 return len; 4039 } 4040 4041 static struct md_sysfs_entry max_corr_read_errors = 4042 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4043 max_corrected_read_errors_store); 4044 4045 static ssize_t 4046 null_show(struct mddev *mddev, char *page) 4047 { 4048 return -EINVAL; 4049 } 4050 4051 static ssize_t 4052 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4053 { 4054 /* buf must be %d:%d\n? 
giving major and minor numbers */ 4055 /* The new device is added to the array. 4056 * If the array has a persistent superblock, we read the 4057 * superblock to initialise info and check validity. 4058 * Otherwise, only checking done is that in bind_rdev_to_array, 4059 * which mainly checks size. 4060 */ 4061 char *e; 4062 int major = simple_strtoul(buf, &e, 10); 4063 int minor; 4064 dev_t dev; 4065 struct md_rdev *rdev; 4066 int err; 4067 4068 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4069 return -EINVAL; 4070 minor = simple_strtoul(e+1, &e, 10); 4071 if (*e && *e != '\n') 4072 return -EINVAL; 4073 dev = MKDEV(major, minor); 4074 if (major != MAJOR(dev) || 4075 minor != MINOR(dev)) 4076 return -EOVERFLOW; 4077 4078 flush_workqueue(md_misc_wq); 4079 4080 err = mddev_lock(mddev); 4081 if (err) 4082 return err; 4083 if (mddev->persistent) { 4084 rdev = md_import_device(dev, mddev->major_version, 4085 mddev->minor_version); 4086 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4087 struct md_rdev *rdev0 4088 = list_entry(mddev->disks.next, 4089 struct md_rdev, same_set); 4090 err = super_types[mddev->major_version] 4091 .load_super(rdev, rdev0, mddev->minor_version); 4092 if (err < 0) 4093 goto out; 4094 } 4095 } else if (mddev->external) 4096 rdev = md_import_device(dev, -2, -1); 4097 else 4098 rdev = md_import_device(dev, -1, -1); 4099 4100 if (IS_ERR(rdev)) { 4101 mddev_unlock(mddev); 4102 return PTR_ERR(rdev); 4103 } 4104 err = bind_rdev_to_array(rdev, mddev); 4105 out: 4106 if (err) 4107 export_rdev(rdev); 4108 mddev_unlock(mddev); 4109 return err ? err : len; 4110 } 4111 4112 static struct md_sysfs_entry md_new_device = 4113 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4114 4115 static ssize_t 4116 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4117 { 4118 char *end; 4119 unsigned long chunk, end_chunk; 4120 int err; 4121 4122 err = mddev_lock(mddev); 4123 if (err) 4124 return err; 4125 if (!mddev->bitmap) 4126 goto out; 4127 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4128 while (*buf) { 4129 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4130 if (buf == end) break; 4131 if (*end == '-') { /* range */ 4132 buf = end + 1; 4133 end_chunk = simple_strtoul(buf, &end, 0); 4134 if (buf == end) break; 4135 } 4136 if (*end && !isspace(*end)) break; 4137 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4138 buf = skip_spaces(end); 4139 } 4140 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4141 out: 4142 mddev_unlock(mddev); 4143 return len; 4144 } 4145 4146 static struct md_sysfs_entry md_bitmap = 4147 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4148 4149 static ssize_t 4150 size_show(struct mddev *mddev, char *page) 4151 { 4152 return sprintf(page, "%llu\n", 4153 (unsigned long long)mddev->dev_sectors / 2); 4154 } 4155 4156 static int update_size(struct mddev *mddev, sector_t num_sectors); 4157 4158 static ssize_t 4159 size_store(struct mddev *mddev, const char *buf, size_t len) 4160 { 4161 /* If array is inactive, we can reduce the component size, but 4162 * not increase it (except from 0). 
4163 * If array is active, we can try an on-line resize
4164 */
4165 sector_t sectors;
4166 int err = strict_blocks_to_sectors(buf, &sectors);
4167
4168 if (err < 0)
4169 return err;
4170 err = mddev_lock(mddev);
4171 if (err)
4172 return err;
4173 if (mddev->pers) {
4174 err = update_size(mddev, sectors);
4175 md_update_sb(mddev, 1);
4176 } else {
4177 if (mddev->dev_sectors == 0 ||
4178 mddev->dev_sectors > sectors)
4179 mddev->dev_sectors = sectors;
4180 else
4181 err = -ENOSPC;
4182 }
4183 mddev_unlock(mddev);
4184 return err ? err : len;
4185 }
4186
4187 static struct md_sysfs_entry md_size =
4188 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4189
4190 /* Metadata version.
4191 * This is one of
4192 * 'none' for arrays with no metadata (good luck...)
4193 * 'external' for arrays with externally managed metadata,
4194 * or N.M for internally known formats
4195 */
4196 static ssize_t
4197 metadata_show(struct mddev *mddev, char *page)
4198 {
4199 if (mddev->persistent)
4200 return sprintf(page, "%d.%d\n",
4201 mddev->major_version, mddev->minor_version);
4202 else if (mddev->external)
4203 return sprintf(page, "external:%s\n", mddev->metadata_type);
4204 else
4205 return sprintf(page, "none\n");
4206 }
4207
4208 static ssize_t
4209 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4210 {
4211 int major, minor;
4212 char *e;
4213 int err;
4214 /* Changing the details of 'external' metadata is
4215 * always permitted. Otherwise there must be
4216 * no devices attached to the array.
4217 */
4218
4219 err = mddev_lock(mddev);
4220 if (err)
4221 return err;
4222 err = -EBUSY;
4223 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4224 ;
4225 else if (!list_empty(&mddev->disks))
4226 goto out_unlock;
4227
4228 err = 0;
4229 if (cmd_match(buf, "none")) {
4230 mddev->persistent = 0;
4231 mddev->external = 0;
4232 mddev->major_version = 0;
4233 mddev->minor_version = 90;
4234 goto out_unlock;
4235 }
4236 if (strncmp(buf, "external:", 9) == 0) {
4237 size_t namelen = len-9;
4238 if (namelen >= sizeof(mddev->metadata_type))
4239 namelen = sizeof(mddev->metadata_type)-1;
4240 strncpy(mddev->metadata_type, buf+9, namelen);
4241 mddev->metadata_type[namelen] = 0;
4242 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4243 mddev->metadata_type[--namelen] = 0;
4244 mddev->persistent = 0;
4245 mddev->external = 1;
4246 mddev->major_version = 0;
4247 mddev->minor_version = 90;
4248 goto out_unlock;
4249 }
4250 major = simple_strtoul(buf, &e, 10);
4251 err = -EINVAL;
4252 if (e==buf || *e != '.')
4253 goto out_unlock;
4254 buf = e+1;
4255 minor = simple_strtoul(buf, &e, 10);
4256 if (e==buf || (*e && *e != '\n') )
4257 goto out_unlock;
4258 err = -ENOENT;
4259 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4260 goto out_unlock;
4261 mddev->major_version = major;
4262 mddev->minor_version = minor;
4263 mddev->persistent = 1;
4264 mddev->external = 0;
4265 err = 0;
4266 out_unlock:
4267 mddev_unlock(mddev);
4268 return err ?: len;
4269 }
4270
4271 static struct md_sysfs_entry md_metadata =
4272 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4273
4274 static ssize_t
4275 action_show(struct mddev *mddev, char *page)
4276 {
4277 char *type = "idle";
4278 unsigned long recovery = mddev->recovery;
4279 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4280 type = "frozen";
4281 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4282 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4283 if
(test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4284 type = "reshape"; 4285 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4286 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4287 type = "resync"; 4288 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4289 type = "check"; 4290 else 4291 type = "repair"; 4292 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4293 type = "recover"; 4294 else if (mddev->reshape_position != MaxSector) 4295 type = "reshape"; 4296 } 4297 return sprintf(page, "%s\n", type); 4298 } 4299 4300 static ssize_t 4301 action_store(struct mddev *mddev, const char *page, size_t len) 4302 { 4303 if (!mddev->pers || !mddev->pers->sync_request) 4304 return -EINVAL; 4305 4306 4307 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4308 if (cmd_match(page, "frozen")) 4309 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4310 else 4311 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4312 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4313 mddev_lock(mddev) == 0) { 4314 flush_workqueue(md_misc_wq); 4315 if (mddev->sync_thread) { 4316 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4317 md_reap_sync_thread(mddev); 4318 } 4319 mddev_unlock(mddev); 4320 } 4321 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4322 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4323 return -EBUSY; 4324 else if (cmd_match(page, "resync")) 4325 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4326 else if (cmd_match(page, "recover")) { 4327 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4328 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4329 } else if (cmd_match(page, "reshape")) { 4330 int err; 4331 if (mddev->pers->start_reshape == NULL) 4332 return -EINVAL; 4333 err = mddev_lock(mddev); 4334 if (!err) { 4335 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4336 err = mddev->pers->start_reshape(mddev); 4337 mddev_unlock(mddev); 4338 } 4339 if (err) 4340 return err; 4341 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4342 } else { 4343 if (cmd_match(page, "check")) 4344 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4345 else if (!cmd_match(page, "repair")) 4346 return -EINVAL; 4347 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4348 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4349 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4350 } 4351 if (mddev->ro == 2) { 4352 /* A write to sync_action is enough to justify 4353 * canceling read-auto mode 4354 */ 4355 mddev->ro = 0; 4356 md_wakeup_thread(mddev->sync_thread); 4357 } 4358 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4359 md_wakeup_thread(mddev->thread); 4360 sysfs_notify_dirent_safe(mddev->sysfs_action); 4361 return len; 4362 } 4363 4364 static struct md_sysfs_entry md_scan_mode = 4365 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4366 4367 static ssize_t 4368 last_sync_action_show(struct mddev *mddev, char *page) 4369 { 4370 return sprintf(page, "%s\n", mddev->last_sync_action); 4371 } 4372 4373 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4374 4375 static ssize_t 4376 mismatch_cnt_show(struct mddev *mddev, char *page) 4377 { 4378 return sprintf(page, "%llu\n", 4379 (unsigned long long) 4380 atomic64_read(&mddev->resync_mismatches)); 4381 } 4382 4383 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4384 4385 static ssize_t 4386 sync_min_show(struct mddev *mddev, char *page) 4387 { 4388 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4389 mddev->sync_speed_min ? 
"local": "system"); 4390 } 4391 4392 static ssize_t 4393 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4394 { 4395 unsigned int min; 4396 int rv; 4397 4398 if (strncmp(buf, "system", 6)==0) { 4399 min = 0; 4400 } else { 4401 rv = kstrtouint(buf, 10, &min); 4402 if (rv < 0) 4403 return rv; 4404 if (min == 0) 4405 return -EINVAL; 4406 } 4407 mddev->sync_speed_min = min; 4408 return len; 4409 } 4410 4411 static struct md_sysfs_entry md_sync_min = 4412 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4413 4414 static ssize_t 4415 sync_max_show(struct mddev *mddev, char *page) 4416 { 4417 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4418 mddev->sync_speed_max ? "local": "system"); 4419 } 4420 4421 static ssize_t 4422 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4423 { 4424 unsigned int max; 4425 int rv; 4426 4427 if (strncmp(buf, "system", 6)==0) { 4428 max = 0; 4429 } else { 4430 rv = kstrtouint(buf, 10, &max); 4431 if (rv < 0) 4432 return rv; 4433 if (max == 0) 4434 return -EINVAL; 4435 } 4436 mddev->sync_speed_max = max; 4437 return len; 4438 } 4439 4440 static struct md_sysfs_entry md_sync_max = 4441 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4442 4443 static ssize_t 4444 degraded_show(struct mddev *mddev, char *page) 4445 { 4446 return sprintf(page, "%d\n", mddev->degraded); 4447 } 4448 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4449 4450 static ssize_t 4451 sync_force_parallel_show(struct mddev *mddev, char *page) 4452 { 4453 return sprintf(page, "%d\n", mddev->parallel_resync); 4454 } 4455 4456 static ssize_t 4457 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4458 { 4459 long n; 4460 4461 if (kstrtol(buf, 10, &n)) 4462 return -EINVAL; 4463 4464 if (n != 0 && n != 1) 4465 return -EINVAL; 4466 4467 mddev->parallel_resync = n; 4468 4469 if (mddev->sync_thread) 4470 wake_up(&resync_wait); 4471 4472 return len; 4473 } 4474 4475 /* force parallel resync, even with shared block devices */ 4476 static struct md_sysfs_entry md_sync_force_parallel = 4477 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4478 sync_force_parallel_show, sync_force_parallel_store); 4479 4480 static ssize_t 4481 sync_speed_show(struct mddev *mddev, char *page) 4482 { 4483 unsigned long resync, dt, db; 4484 if (mddev->curr_resync == 0) 4485 return sprintf(page, "none\n"); 4486 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4487 dt = (jiffies - mddev->resync_mark) / HZ; 4488 if (!dt) dt++; 4489 db = resync - mddev->resync_mark_cnt; 4490 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4491 } 4492 4493 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4494 4495 static ssize_t 4496 sync_completed_show(struct mddev *mddev, char *page) 4497 { 4498 unsigned long long max_sectors, resync; 4499 4500 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4501 return sprintf(page, "none\n"); 4502 4503 if (mddev->curr_resync == 1 || 4504 mddev->curr_resync == 2) 4505 return sprintf(page, "delayed\n"); 4506 4507 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4508 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4509 max_sectors = mddev->resync_max_sectors; 4510 else 4511 max_sectors = mddev->dev_sectors; 4512 4513 resync = mddev->curr_resync_completed; 4514 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4515 } 4516 4517 static struct md_sysfs_entry md_sync_completed = 4518 __ATTR_PREALLOC(sync_completed, S_IRUGO, 
sync_completed_show, NULL); 4519 4520 static ssize_t 4521 min_sync_show(struct mddev *mddev, char *page) 4522 { 4523 return sprintf(page, "%llu\n", 4524 (unsigned long long)mddev->resync_min); 4525 } 4526 static ssize_t 4527 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4528 { 4529 unsigned long long min; 4530 int err; 4531 4532 if (kstrtoull(buf, 10, &min)) 4533 return -EINVAL; 4534 4535 spin_lock(&mddev->lock); 4536 err = -EINVAL; 4537 if (min > mddev->resync_max) 4538 goto out_unlock; 4539 4540 err = -EBUSY; 4541 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4542 goto out_unlock; 4543 4544 /* Round down to multiple of 4K for safety */ 4545 mddev->resync_min = round_down(min, 8); 4546 err = 0; 4547 4548 out_unlock: 4549 spin_unlock(&mddev->lock); 4550 return err ?: len; 4551 } 4552 4553 static struct md_sysfs_entry md_min_sync = 4554 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4555 4556 static ssize_t 4557 max_sync_show(struct mddev *mddev, char *page) 4558 { 4559 if (mddev->resync_max == MaxSector) 4560 return sprintf(page, "max\n"); 4561 else 4562 return sprintf(page, "%llu\n", 4563 (unsigned long long)mddev->resync_max); 4564 } 4565 static ssize_t 4566 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4567 { 4568 int err; 4569 spin_lock(&mddev->lock); 4570 if (strncmp(buf, "max", 3) == 0) 4571 mddev->resync_max = MaxSector; 4572 else { 4573 unsigned long long max; 4574 int chunk; 4575 4576 err = -EINVAL; 4577 if (kstrtoull(buf, 10, &max)) 4578 goto out_unlock; 4579 if (max < mddev->resync_min) 4580 goto out_unlock; 4581 4582 err = -EBUSY; 4583 if (max < mddev->resync_max && 4584 mddev->ro == 0 && 4585 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4586 goto out_unlock; 4587 4588 /* Must be a multiple of chunk_size */ 4589 chunk = mddev->chunk_sectors; 4590 if (chunk) { 4591 sector_t temp = max; 4592 4593 err = -EINVAL; 4594 if (sector_div(temp, chunk)) 4595 goto out_unlock; 4596 } 4597 mddev->resync_max = max; 4598 } 4599 wake_up(&mddev->recovery_wait); 4600 err = 0; 4601 out_unlock: 4602 spin_unlock(&mddev->lock); 4603 return err ?: len; 4604 } 4605 4606 static struct md_sysfs_entry md_max_sync = 4607 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4608 4609 static ssize_t 4610 suspend_lo_show(struct mddev *mddev, char *page) 4611 { 4612 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4613 } 4614 4615 static ssize_t 4616 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4617 { 4618 unsigned long long old, new; 4619 int err; 4620 4621 err = kstrtoull(buf, 10, &new); 4622 if (err < 0) 4623 return err; 4624 if (new != (sector_t)new) 4625 return -EINVAL; 4626 4627 err = mddev_lock(mddev); 4628 if (err) 4629 return err; 4630 err = -EINVAL; 4631 if (mddev->pers == NULL || 4632 mddev->pers->quiesce == NULL) 4633 goto unlock; 4634 old = mddev->suspend_lo; 4635 mddev->suspend_lo = new; 4636 if (new >= old) 4637 /* Shrinking suspended region */ 4638 mddev->pers->quiesce(mddev, 2); 4639 else { 4640 /* Expanding suspended region - need to wait */ 4641 mddev->pers->quiesce(mddev, 1); 4642 mddev->pers->quiesce(mddev, 0); 4643 } 4644 err = 0; 4645 unlock: 4646 mddev_unlock(mddev); 4647 return err ?: len; 4648 } 4649 static struct md_sysfs_entry md_suspend_lo = 4650 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4651 4652 static ssize_t 4653 suspend_hi_show(struct mddev *mddev, char *page) 4654 { 4655 return sprintf(page, "%llu\n", (unsigned long 
long)mddev->suspend_hi); 4656 } 4657 4658 static ssize_t 4659 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4660 { 4661 unsigned long long old, new; 4662 int err; 4663 4664 err = kstrtoull(buf, 10, &new); 4665 if (err < 0) 4666 return err; 4667 if (new != (sector_t)new) 4668 return -EINVAL; 4669 4670 err = mddev_lock(mddev); 4671 if (err) 4672 return err; 4673 err = -EINVAL; 4674 if (mddev->pers == NULL || 4675 mddev->pers->quiesce == NULL) 4676 goto unlock; 4677 old = mddev->suspend_hi; 4678 mddev->suspend_hi = new; 4679 if (new <= old) 4680 /* Shrinking suspended region */ 4681 mddev->pers->quiesce(mddev, 2); 4682 else { 4683 /* Expanding suspended region - need to wait */ 4684 mddev->pers->quiesce(mddev, 1); 4685 mddev->pers->quiesce(mddev, 0); 4686 } 4687 err = 0; 4688 unlock: 4689 mddev_unlock(mddev); 4690 return err ?: len; 4691 } 4692 static struct md_sysfs_entry md_suspend_hi = 4693 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4694 4695 static ssize_t 4696 reshape_position_show(struct mddev *mddev, char *page) 4697 { 4698 if (mddev->reshape_position != MaxSector) 4699 return sprintf(page, "%llu\n", 4700 (unsigned long long)mddev->reshape_position); 4701 strcpy(page, "none\n"); 4702 return 5; 4703 } 4704 4705 static ssize_t 4706 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4707 { 4708 struct md_rdev *rdev; 4709 unsigned long long new; 4710 int err; 4711 4712 err = kstrtoull(buf, 10, &new); 4713 if (err < 0) 4714 return err; 4715 if (new != (sector_t)new) 4716 return -EINVAL; 4717 err = mddev_lock(mddev); 4718 if (err) 4719 return err; 4720 err = -EBUSY; 4721 if (mddev->pers) 4722 goto unlock; 4723 mddev->reshape_position = new; 4724 mddev->delta_disks = 0; 4725 mddev->reshape_backwards = 0; 4726 mddev->new_level = mddev->level; 4727 mddev->new_layout = mddev->layout; 4728 mddev->new_chunk_sectors = mddev->chunk_sectors; 4729 rdev_for_each(rdev, mddev) 4730 rdev->new_data_offset = rdev->data_offset; 4731 err = 0; 4732 unlock: 4733 mddev_unlock(mddev); 4734 return err ?: len; 4735 } 4736 4737 static struct md_sysfs_entry md_reshape_position = 4738 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4739 reshape_position_store); 4740 4741 static ssize_t 4742 reshape_direction_show(struct mddev *mddev, char *page) 4743 { 4744 return sprintf(page, "%s\n", 4745 mddev->reshape_backwards ? 
"backwards" : "forwards"); 4746 } 4747 4748 static ssize_t 4749 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4750 { 4751 int backwards = 0; 4752 int err; 4753 4754 if (cmd_match(buf, "forwards")) 4755 backwards = 0; 4756 else if (cmd_match(buf, "backwards")) 4757 backwards = 1; 4758 else 4759 return -EINVAL; 4760 if (mddev->reshape_backwards == backwards) 4761 return len; 4762 4763 err = mddev_lock(mddev); 4764 if (err) 4765 return err; 4766 /* check if we are allowed to change */ 4767 if (mddev->delta_disks) 4768 err = -EBUSY; 4769 else if (mddev->persistent && 4770 mddev->major_version == 0) 4771 err = -EINVAL; 4772 else 4773 mddev->reshape_backwards = backwards; 4774 mddev_unlock(mddev); 4775 return err ?: len; 4776 } 4777 4778 static struct md_sysfs_entry md_reshape_direction = 4779 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4780 reshape_direction_store); 4781 4782 static ssize_t 4783 array_size_show(struct mddev *mddev, char *page) 4784 { 4785 if (mddev->external_size) 4786 return sprintf(page, "%llu\n", 4787 (unsigned long long)mddev->array_sectors/2); 4788 else 4789 return sprintf(page, "default\n"); 4790 } 4791 4792 static ssize_t 4793 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4794 { 4795 sector_t sectors; 4796 int err; 4797 4798 err = mddev_lock(mddev); 4799 if (err) 4800 return err; 4801 4802 if (strncmp(buf, "default", 7) == 0) { 4803 if (mddev->pers) 4804 sectors = mddev->pers->size(mddev, 0, 0); 4805 else 4806 sectors = mddev->array_sectors; 4807 4808 mddev->external_size = 0; 4809 } else { 4810 if (strict_blocks_to_sectors(buf, §ors) < 0) 4811 err = -EINVAL; 4812 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4813 err = -E2BIG; 4814 else 4815 mddev->external_size = 1; 4816 } 4817 4818 if (!err) { 4819 mddev->array_sectors = sectors; 4820 if (mddev->pers) { 4821 set_capacity(mddev->gendisk, mddev->array_sectors); 4822 revalidate_disk(mddev->gendisk); 4823 } 4824 } 4825 mddev_unlock(mddev); 4826 return err ?: len; 4827 } 4828 4829 static struct md_sysfs_entry md_array_size = 4830 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4831 array_size_store); 4832 4833 static struct attribute *md_default_attrs[] = { 4834 &md_level.attr, 4835 &md_layout.attr, 4836 &md_raid_disks.attr, 4837 &md_chunk_size.attr, 4838 &md_size.attr, 4839 &md_resync_start.attr, 4840 &md_metadata.attr, 4841 &md_new_device.attr, 4842 &md_safe_delay.attr, 4843 &md_array_state.attr, 4844 &md_reshape_position.attr, 4845 &md_reshape_direction.attr, 4846 &md_array_size.attr, 4847 &max_corr_read_errors.attr, 4848 NULL, 4849 }; 4850 4851 static struct attribute *md_redundancy_attrs[] = { 4852 &md_scan_mode.attr, 4853 &md_last_scan_mode.attr, 4854 &md_mismatches.attr, 4855 &md_sync_min.attr, 4856 &md_sync_max.attr, 4857 &md_sync_speed.attr, 4858 &md_sync_force_parallel.attr, 4859 &md_sync_completed.attr, 4860 &md_min_sync.attr, 4861 &md_max_sync.attr, 4862 &md_suspend_lo.attr, 4863 &md_suspend_hi.attr, 4864 &md_bitmap.attr, 4865 &md_degraded.attr, 4866 NULL, 4867 }; 4868 static struct attribute_group md_redundancy_group = { 4869 .name = NULL, 4870 .attrs = md_redundancy_attrs, 4871 }; 4872 4873 static ssize_t 4874 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4875 { 4876 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4877 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4878 ssize_t rv; 4879 4880 if (!entry->show) 4881 return -EIO; 4882 
spin_lock(&all_mddevs_lock); 4883 if (list_empty(&mddev->all_mddevs)) { 4884 spin_unlock(&all_mddevs_lock); 4885 return -EBUSY; 4886 } 4887 mddev_get(mddev); 4888 spin_unlock(&all_mddevs_lock); 4889 4890 rv = entry->show(mddev, page); 4891 mddev_put(mddev); 4892 return rv; 4893 } 4894 4895 static ssize_t 4896 md_attr_store(struct kobject *kobj, struct attribute *attr, 4897 const char *page, size_t length) 4898 { 4899 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4900 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4901 ssize_t rv; 4902 4903 if (!entry->store) 4904 return -EIO; 4905 if (!capable(CAP_SYS_ADMIN)) 4906 return -EACCES; 4907 spin_lock(&all_mddevs_lock); 4908 if (list_empty(&mddev->all_mddevs)) { 4909 spin_unlock(&all_mddevs_lock); 4910 return -EBUSY; 4911 } 4912 mddev_get(mddev); 4913 spin_unlock(&all_mddevs_lock); 4914 rv = entry->store(mddev, page, length); 4915 mddev_put(mddev); 4916 return rv; 4917 } 4918 4919 static void md_free(struct kobject *ko) 4920 { 4921 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4922 4923 if (mddev->sysfs_state) 4924 sysfs_put(mddev->sysfs_state); 4925 4926 if (mddev->queue) 4927 blk_cleanup_queue(mddev->queue); 4928 if (mddev->gendisk) { 4929 del_gendisk(mddev->gendisk); 4930 put_disk(mddev->gendisk); 4931 } 4932 4933 kfree(mddev); 4934 } 4935 4936 static const struct sysfs_ops md_sysfs_ops = { 4937 .show = md_attr_show, 4938 .store = md_attr_store, 4939 }; 4940 static struct kobj_type md_ktype = { 4941 .release = md_free, 4942 .sysfs_ops = &md_sysfs_ops, 4943 .default_attrs = md_default_attrs, 4944 }; 4945 4946 int mdp_major = 0; 4947 4948 static void mddev_delayed_delete(struct work_struct *ws) 4949 { 4950 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4951 4952 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4953 kobject_del(&mddev->kobj); 4954 kobject_put(&mddev->kobj); 4955 } 4956 4957 static int md_alloc(dev_t dev, char *name) 4958 { 4959 static DEFINE_MUTEX(disks_mutex); 4960 struct mddev *mddev = mddev_find(dev); 4961 struct gendisk *disk; 4962 int partitioned; 4963 int shift; 4964 int unit; 4965 int error; 4966 4967 if (!mddev) 4968 return -ENODEV; 4969 4970 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4971 shift = partitioned ? MdpMinorShift : 0; 4972 unit = MINOR(mddev->unit) >> shift; 4973 4974 /* wait for any previous instance of this device to be 4975 * completely removed (mddev_delayed_delete). 4976 */ 4977 flush_workqueue(md_misc_wq); 4978 4979 mutex_lock(&disks_mutex); 4980 error = -EEXIST; 4981 if (mddev->gendisk) 4982 goto abort; 4983 4984 if (name) { 4985 /* Need to ensure that 'name' is not a duplicate. 
4986 */ 4987 struct mddev *mddev2; 4988 spin_lock(&all_mddevs_lock); 4989 4990 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4991 if (mddev2->gendisk && 4992 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4993 spin_unlock(&all_mddevs_lock); 4994 goto abort; 4995 } 4996 spin_unlock(&all_mddevs_lock); 4997 } 4998 4999 error = -ENOMEM; 5000 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5001 if (!mddev->queue) 5002 goto abort; 5003 mddev->queue->queuedata = mddev; 5004 5005 blk_queue_make_request(mddev->queue, md_make_request); 5006 blk_set_stacking_limits(&mddev->queue->limits); 5007 5008 disk = alloc_disk(1 << shift); 5009 if (!disk) { 5010 blk_cleanup_queue(mddev->queue); 5011 mddev->queue = NULL; 5012 goto abort; 5013 } 5014 disk->major = MAJOR(mddev->unit); 5015 disk->first_minor = unit << shift; 5016 if (name) 5017 strcpy(disk->disk_name, name); 5018 else if (partitioned) 5019 sprintf(disk->disk_name, "md_d%d", unit); 5020 else 5021 sprintf(disk->disk_name, "md%d", unit); 5022 disk->fops = &md_fops; 5023 disk->private_data = mddev; 5024 disk->queue = mddev->queue; 5025 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 5026 /* Allow extended partitions. This makes the 5027 * 'mdp' device redundant, but we can't really 5028 * remove it now. 5029 */ 5030 disk->flags |= GENHD_FL_EXT_DEVT; 5031 mddev->gendisk = disk; 5032 /* As soon as we call add_disk(), another thread could get 5033 * through to md_open, so make sure it doesn't get too far 5034 */ 5035 mutex_lock(&mddev->open_mutex); 5036 add_disk(disk); 5037 5038 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5039 &disk_to_dev(disk)->kobj, "%s", "md"); 5040 if (error) { 5041 /* This isn't possible, but as kobject_init_and_add is marked 5042 * __must_check, we must do something with the result 5043 */ 5044 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 5045 disk->disk_name); 5046 error = 0; 5047 } 5048 if (mddev->kobj.sd && 5049 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5050 printk(KERN_DEBUG "pointless warning\n"); 5051 mutex_unlock(&mddev->open_mutex); 5052 abort: 5053 mutex_unlock(&disks_mutex); 5054 if (!error && mddev->kobj.sd) { 5055 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5056 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5057 } 5058 mddev_put(mddev); 5059 return error; 5060 } 5061 5062 static struct kobject *md_probe(dev_t dev, int *part, void *data) 5063 { 5064 md_alloc(dev, NULL); 5065 return NULL; 5066 } 5067 5068 static int add_named_array(const char *val, struct kernel_param *kp) 5069 { 5070 /* val must be "md_*" where * is not all digits. 5071 * We allocate an array with a large free minor number, and 5072 * set the name to val. val must not already be an active name. 
5073 */ 5074 int len = strlen(val); 5075 char buf[DISK_NAME_LEN]; 5076 5077 while (len && val[len-1] == '\n') 5078 len--; 5079 if (len >= DISK_NAME_LEN) 5080 return -E2BIG; 5081 strlcpy(buf, val, len+1); 5082 if (strncmp(buf, "md_", 3) != 0) 5083 return -EINVAL; 5084 return md_alloc(0, buf); 5085 } 5086 5087 static void md_safemode_timeout(unsigned long data) 5088 { 5089 struct mddev *mddev = (struct mddev *) data; 5090 5091 if (!atomic_read(&mddev->writes_pending)) { 5092 mddev->safemode = 1; 5093 if (mddev->external) 5094 sysfs_notify_dirent_safe(mddev->sysfs_state); 5095 } 5096 md_wakeup_thread(mddev->thread); 5097 } 5098 5099 static int start_dirty_degraded; 5100 5101 int md_run(struct mddev *mddev) 5102 { 5103 int err; 5104 struct md_rdev *rdev; 5105 struct md_personality *pers; 5106 5107 if (list_empty(&mddev->disks)) 5108 /* cannot run an array with no devices.. */ 5109 return -EINVAL; 5110 5111 if (mddev->pers) 5112 return -EBUSY; 5113 /* Cannot run until previous stop completes properly */ 5114 if (mddev->sysfs_active) 5115 return -EBUSY; 5116 5117 /* 5118 * Analyze all RAID superblock(s) 5119 */ 5120 if (!mddev->raid_disks) { 5121 if (!mddev->persistent) 5122 return -EINVAL; 5123 analyze_sbs(mddev); 5124 } 5125 5126 if (mddev->level != LEVEL_NONE) 5127 request_module("md-level-%d", mddev->level); 5128 else if (mddev->clevel[0]) 5129 request_module("md-%s", mddev->clevel); 5130 5131 /* 5132 * Drop all container device buffers, from now on 5133 * the only valid external interface is through the md 5134 * device. 5135 */ 5136 rdev_for_each(rdev, mddev) { 5137 if (test_bit(Faulty, &rdev->flags)) 5138 continue; 5139 sync_blockdev(rdev->bdev); 5140 invalidate_bdev(rdev->bdev); 5141 5142 /* perform some consistency tests on the device. 5143 * We don't want the data to overlap the metadata, 5144 * Internal Bitmap issues have been handled elsewhere. 5145 */ 5146 if (rdev->meta_bdev) { 5147 /* Nothing to check */; 5148 } else if (rdev->data_offset < rdev->sb_start) { 5149 if (mddev->dev_sectors && 5150 rdev->data_offset + mddev->dev_sectors 5151 > rdev->sb_start) { 5152 printk("md: %s: data overlaps metadata\n", 5153 mdname(mddev)); 5154 return -EINVAL; 5155 } 5156 } else { 5157 if (rdev->sb_start + rdev->sb_size/512 5158 > rdev->data_offset) { 5159 printk("md: %s: metadata overlaps data\n", 5160 mdname(mddev)); 5161 return -EINVAL; 5162 } 5163 } 5164 sysfs_notify_dirent_safe(rdev->sysfs_state); 5165 } 5166 5167 if (mddev->bio_set == NULL) 5168 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5169 5170 spin_lock(&pers_lock); 5171 pers = find_pers(mddev->level, mddev->clevel); 5172 if (!pers || !try_module_get(pers->owner)) { 5173 spin_unlock(&pers_lock); 5174 if (mddev->level != LEVEL_NONE) 5175 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5176 mddev->level); 5177 else 5178 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5179 mddev->clevel); 5180 return -EINVAL; 5181 } 5182 spin_unlock(&pers_lock); 5183 if (mddev->level != pers->level) { 5184 mddev->level = pers->level; 5185 mddev->new_level = pers->level; 5186 } 5187 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5188 5189 if (mddev->reshape_position != MaxSector && 5190 pers->start_reshape == NULL) { 5191 /* This personality cannot handle reshaping... */ 5192 module_put(pers->owner); 5193 return -EINVAL; 5194 } 5195 5196 if (pers->sync_request) { 5197 /* Warn if this is a potentially silly 5198 * configuration. 
5199 */ 5200 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5201 struct md_rdev *rdev2; 5202 int warned = 0; 5203 5204 rdev_for_each(rdev, mddev) 5205 rdev_for_each(rdev2, mddev) { 5206 if (rdev < rdev2 && 5207 rdev->bdev->bd_contains == 5208 rdev2->bdev->bd_contains) { 5209 printk(KERN_WARNING 5210 "%s: WARNING: %s appears to be" 5211 " on the same physical disk as" 5212 " %s.\n", 5213 mdname(mddev), 5214 bdevname(rdev->bdev,b), 5215 bdevname(rdev2->bdev,b2)); 5216 warned = 1; 5217 } 5218 } 5219 5220 if (warned) 5221 printk(KERN_WARNING 5222 "True protection against single-disk" 5223 " failure might be compromised.\n"); 5224 } 5225 5226 mddev->recovery = 0; 5227 /* may be over-ridden by personality */ 5228 mddev->resync_max_sectors = mddev->dev_sectors; 5229 5230 mddev->ok_start_degraded = start_dirty_degraded; 5231 5232 if (start_readonly && mddev->ro == 0) 5233 mddev->ro = 2; /* read-only, but switch on first write */ 5234 5235 err = pers->run(mddev); 5236 if (err) 5237 printk(KERN_ERR "md: pers->run() failed ...\n"); 5238 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5239 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5240 " but 'external_size' not in effect?\n", __func__); 5241 printk(KERN_ERR 5242 "md: invalid array_size %llu > default size %llu\n", 5243 (unsigned long long)mddev->array_sectors / 2, 5244 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5245 err = -EINVAL; 5246 } 5247 if (err == 0 && pers->sync_request && 5248 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5249 struct bitmap *bitmap; 5250 5251 bitmap = bitmap_create(mddev, -1); 5252 if (IS_ERR(bitmap)) { 5253 err = PTR_ERR(bitmap); 5254 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5255 mdname(mddev), err); 5256 } else 5257 mddev->bitmap = bitmap; 5258 5259 } 5260 if (err) { 5261 mddev_detach(mddev); 5262 if (mddev->private) 5263 pers->free(mddev, mddev->private); 5264 mddev->private = NULL; 5265 module_put(pers->owner); 5266 bitmap_destroy(mddev); 5267 return err; 5268 } 5269 if (mddev->queue) { 5270 mddev->queue->backing_dev_info.congested_data = mddev; 5271 mddev->queue->backing_dev_info.congested_fn = md_congested; 5272 } 5273 if (pers->sync_request) { 5274 if (mddev->kobj.sd && 5275 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5276 printk(KERN_WARNING 5277 "md: cannot register extra attributes for %s\n", 5278 mdname(mddev)); 5279 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5280 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5281 mddev->ro = 0; 5282 5283 atomic_set(&mddev->writes_pending,0); 5284 atomic_set(&mddev->max_corr_read_errors, 5285 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5286 mddev->safemode = 0; 5287 if (mddev_is_clustered(mddev)) 5288 mddev->safemode_delay = 0; 5289 else 5290 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5291 mddev->in_sync = 1; 5292 smp_wmb(); 5293 spin_lock(&mddev->lock); 5294 mddev->pers = pers; 5295 mddev->ready = 1; 5296 spin_unlock(&mddev->lock); 5297 rdev_for_each(rdev, mddev) 5298 if (rdev->raid_disk >= 0) 5299 if (sysfs_link_rdev(mddev, rdev)) 5300 /* failure here is OK */; 5301 5302 if (mddev->degraded && !mddev->ro) 5303 /* This ensures that recovering status is reported immediately 5304 * via sysfs - until a lack of spares is confirmed. 
5305 */ 5306 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5307 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5308 5309 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5310 md_update_sb(mddev, 0); 5311 5312 md_new_event(mddev); 5313 sysfs_notify_dirent_safe(mddev->sysfs_state); 5314 sysfs_notify_dirent_safe(mddev->sysfs_action); 5315 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5316 return 0; 5317 } 5318 EXPORT_SYMBOL_GPL(md_run); 5319 5320 static int do_md_run(struct mddev *mddev) 5321 { 5322 int err; 5323 5324 err = md_run(mddev); 5325 if (err) 5326 goto out; 5327 err = bitmap_load(mddev); 5328 if (err) { 5329 bitmap_destroy(mddev); 5330 goto out; 5331 } 5332 5333 if (mddev_is_clustered(mddev)) 5334 md_allow_write(mddev); 5335 5336 md_wakeup_thread(mddev->thread); 5337 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5338 5339 set_capacity(mddev->gendisk, mddev->array_sectors); 5340 revalidate_disk(mddev->gendisk); 5341 mddev->changed = 1; 5342 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5343 out: 5344 return err; 5345 } 5346 5347 static int restart_array(struct mddev *mddev) 5348 { 5349 struct gendisk *disk = mddev->gendisk; 5350 5351 /* Complain if it has no devices */ 5352 if (list_empty(&mddev->disks)) 5353 return -ENXIO; 5354 if (!mddev->pers) 5355 return -EINVAL; 5356 if (!mddev->ro) 5357 return -EBUSY; 5358 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5359 struct md_rdev *rdev; 5360 bool has_journal = false; 5361 5362 rcu_read_lock(); 5363 rdev_for_each_rcu(rdev, mddev) { 5364 if (test_bit(Journal, &rdev->flags) && 5365 !test_bit(Faulty, &rdev->flags)) { 5366 has_journal = true; 5367 break; 5368 } 5369 } 5370 rcu_read_unlock(); 5371 5372 /* Don't restart rw with journal missing/faulty */ 5373 if (!has_journal) 5374 return -EINVAL; 5375 } 5376 5377 mddev->safemode = 0; 5378 mddev->ro = 0; 5379 set_disk_ro(disk, 0); 5380 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5381 mdname(mddev)); 5382 /* Kick recovery or resync if necessary */ 5383 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5384 md_wakeup_thread(mddev->thread); 5385 md_wakeup_thread(mddev->sync_thread); 5386 sysfs_notify_dirent_safe(mddev->sysfs_state); 5387 return 0; 5388 } 5389 5390 static void md_clean(struct mddev *mddev) 5391 { 5392 mddev->array_sectors = 0; 5393 mddev->external_size = 0; 5394 mddev->dev_sectors = 0; 5395 mddev->raid_disks = 0; 5396 mddev->recovery_cp = 0; 5397 mddev->resync_min = 0; 5398 mddev->resync_max = MaxSector; 5399 mddev->reshape_position = MaxSector; 5400 mddev->external = 0; 5401 mddev->persistent = 0; 5402 mddev->level = LEVEL_NONE; 5403 mddev->clevel[0] = 0; 5404 mddev->flags = 0; 5405 mddev->ro = 0; 5406 mddev->metadata_type[0] = 0; 5407 mddev->chunk_sectors = 0; 5408 mddev->ctime = mddev->utime = 0; 5409 mddev->layout = 0; 5410 mddev->max_disks = 0; 5411 mddev->events = 0; 5412 mddev->can_decrease_events = 0; 5413 mddev->delta_disks = 0; 5414 mddev->reshape_backwards = 0; 5415 mddev->new_level = LEVEL_NONE; 5416 mddev->new_layout = 0; 5417 mddev->new_chunk_sectors = 0; 5418 mddev->curr_resync = 0; 5419 atomic64_set(&mddev->resync_mismatches, 0); 5420 mddev->suspend_lo = mddev->suspend_hi = 0; 5421 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5422 mddev->recovery = 0; 5423 mddev->in_sync = 0; 5424 mddev->changed = 0; 5425 mddev->degraded = 0; 5426 mddev->safemode = 0; 5427 mddev->private = NULL; 5428 mddev->bitmap_info.offset = 0; 5429 mddev->bitmap_info.default_offset = 0; 5430 mddev->bitmap_info.default_space = 0; 5431 
mddev->bitmap_info.chunksize = 0; 5432 mddev->bitmap_info.daemon_sleep = 0; 5433 mddev->bitmap_info.max_write_behind = 0; 5434 } 5435 5436 static void __md_stop_writes(struct mddev *mddev) 5437 { 5438 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5439 flush_workqueue(md_misc_wq); 5440 if (mddev->sync_thread) { 5441 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5442 md_reap_sync_thread(mddev); 5443 } 5444 5445 del_timer_sync(&mddev->safemode_timer); 5446 5447 bitmap_flush(mddev); 5448 md_super_wait(mddev); 5449 5450 if (mddev->ro == 0 && 5451 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5452 (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5453 /* mark array as shutdown cleanly */ 5454 if (!mddev_is_clustered(mddev)) 5455 mddev->in_sync = 1; 5456 md_update_sb(mddev, 1); 5457 } 5458 } 5459 5460 void md_stop_writes(struct mddev *mddev) 5461 { 5462 mddev_lock_nointr(mddev); 5463 __md_stop_writes(mddev); 5464 mddev_unlock(mddev); 5465 } 5466 EXPORT_SYMBOL_GPL(md_stop_writes); 5467 5468 static void mddev_detach(struct mddev *mddev) 5469 { 5470 struct bitmap *bitmap = mddev->bitmap; 5471 /* wait for behind writes to complete */ 5472 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5473 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", 5474 mdname(mddev)); 5475 /* need to kick something here to make sure I/O goes? */ 5476 wait_event(bitmap->behind_wait, 5477 atomic_read(&bitmap->behind_writes) == 0); 5478 } 5479 if (mddev->pers && mddev->pers->quiesce) { 5480 mddev->pers->quiesce(mddev, 1); 5481 mddev->pers->quiesce(mddev, 0); 5482 } 5483 md_unregister_thread(&mddev->thread); 5484 if (mddev->queue) 5485 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5486 } 5487 5488 static void __md_stop(struct mddev *mddev) 5489 { 5490 struct md_personality *pers = mddev->pers; 5491 mddev_detach(mddev); 5492 /* Ensure ->event_work is done */ 5493 flush_workqueue(md_misc_wq); 5494 spin_lock(&mddev->lock); 5495 mddev->ready = 0; 5496 mddev->pers = NULL; 5497 spin_unlock(&mddev->lock); 5498 pers->free(mddev, mddev->private); 5499 mddev->private = NULL; 5500 if (pers->sync_request && mddev->to_remove == NULL) 5501 mddev->to_remove = &md_redundancy_group; 5502 module_put(pers->owner); 5503 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5504 } 5505 5506 void md_stop(struct mddev *mddev) 5507 { 5508 /* stop the array and free an attached data structures. 
5509 * This is called from dm-raid 5510 */ 5511 __md_stop(mddev); 5512 bitmap_destroy(mddev); 5513 if (mddev->bio_set) 5514 bioset_free(mddev->bio_set); 5515 } 5516 5517 EXPORT_SYMBOL_GPL(md_stop); 5518 5519 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5520 { 5521 int err = 0; 5522 int did_freeze = 0; 5523 5524 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5525 did_freeze = 1; 5526 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5527 md_wakeup_thread(mddev->thread); 5528 } 5529 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5530 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5531 if (mddev->sync_thread) 5532 /* Thread might be blocked waiting for metadata update 5533 * which will now never happen */ 5534 wake_up_process(mddev->sync_thread->tsk); 5535 5536 if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) 5537 return -EBUSY; 5538 mddev_unlock(mddev); 5539 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5540 &mddev->recovery)); 5541 wait_event(mddev->sb_wait, 5542 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5543 mddev_lock_nointr(mddev); 5544 5545 mutex_lock(&mddev->open_mutex); 5546 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5547 mddev->sync_thread || 5548 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5549 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5550 printk("md: %s still in use.\n",mdname(mddev)); 5551 if (did_freeze) { 5552 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5553 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5554 md_wakeup_thread(mddev->thread); 5555 } 5556 err = -EBUSY; 5557 goto out; 5558 } 5559 if (mddev->pers) { 5560 __md_stop_writes(mddev); 5561 5562 err = -ENXIO; 5563 if (mddev->ro==1) 5564 goto out; 5565 mddev->ro = 1; 5566 set_disk_ro(mddev->gendisk, 1); 5567 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5568 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5569 md_wakeup_thread(mddev->thread); 5570 sysfs_notify_dirent_safe(mddev->sysfs_state); 5571 err = 0; 5572 } 5573 out: 5574 mutex_unlock(&mddev->open_mutex); 5575 return err; 5576 } 5577 5578 /* mode: 5579 * 0 - completely stop and dis-assemble array 5580 * 2 - stop but do not disassemble array 5581 */ 5582 static int do_md_stop(struct mddev *mddev, int mode, 5583 struct block_device *bdev) 5584 { 5585 struct gendisk *disk = mddev->gendisk; 5586 struct md_rdev *rdev; 5587 int did_freeze = 0; 5588 5589 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5590 did_freeze = 1; 5591 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5592 md_wakeup_thread(mddev->thread); 5593 } 5594 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5595 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5596 if (mddev->sync_thread) 5597 /* Thread might be blocked waiting for metadata update 5598 * which will now never happen */ 5599 wake_up_process(mddev->sync_thread->tsk); 5600 5601 mddev_unlock(mddev); 5602 wait_event(resync_wait, (mddev->sync_thread == NULL && 5603 !test_bit(MD_RECOVERY_RUNNING, 5604 &mddev->recovery))); 5605 mddev_lock_nointr(mddev); 5606 5607 mutex_lock(&mddev->open_mutex); 5608 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5609 mddev->sysfs_active || 5610 mddev->sync_thread || 5611 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5612 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5613 printk("md: %s still in use.\n",mdname(mddev)); 5614 mutex_unlock(&mddev->open_mutex); 5615 if (did_freeze) { 5616 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5617 set_bit(MD_RECOVERY_NEEDED, 
&mddev->recovery); 5618 md_wakeup_thread(mddev->thread); 5619 } 5620 return -EBUSY; 5621 } 5622 if (mddev->pers) { 5623 if (mddev->ro) 5624 set_disk_ro(disk, 0); 5625 5626 __md_stop_writes(mddev); 5627 __md_stop(mddev); 5628 mddev->queue->backing_dev_info.congested_fn = NULL; 5629 5630 /* tell userspace to handle 'inactive' */ 5631 sysfs_notify_dirent_safe(mddev->sysfs_state); 5632 5633 rdev_for_each(rdev, mddev) 5634 if (rdev->raid_disk >= 0) 5635 sysfs_unlink_rdev(mddev, rdev); 5636 5637 set_capacity(disk, 0); 5638 mutex_unlock(&mddev->open_mutex); 5639 mddev->changed = 1; 5640 revalidate_disk(disk); 5641 5642 if (mddev->ro) 5643 mddev->ro = 0; 5644 } else 5645 mutex_unlock(&mddev->open_mutex); 5646 /* 5647 * Free resources if final stop 5648 */ 5649 if (mode == 0) { 5650 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5651 5652 bitmap_destroy(mddev); 5653 if (mddev->bitmap_info.file) { 5654 struct file *f = mddev->bitmap_info.file; 5655 spin_lock(&mddev->lock); 5656 mddev->bitmap_info.file = NULL; 5657 spin_unlock(&mddev->lock); 5658 fput(f); 5659 } 5660 mddev->bitmap_info.offset = 0; 5661 5662 export_array(mddev); 5663 5664 md_clean(mddev); 5665 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5666 if (mddev->hold_active == UNTIL_STOP) 5667 mddev->hold_active = 0; 5668 } 5669 md_new_event(mddev); 5670 sysfs_notify_dirent_safe(mddev->sysfs_state); 5671 return 0; 5672 } 5673 5674 #ifndef MODULE 5675 static void autorun_array(struct mddev *mddev) 5676 { 5677 struct md_rdev *rdev; 5678 int err; 5679 5680 if (list_empty(&mddev->disks)) 5681 return; 5682 5683 printk(KERN_INFO "md: running: "); 5684 5685 rdev_for_each(rdev, mddev) { 5686 char b[BDEVNAME_SIZE]; 5687 printk("<%s>", bdevname(rdev->bdev,b)); 5688 } 5689 printk("\n"); 5690 5691 err = do_md_run(mddev); 5692 if (err) { 5693 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5694 do_md_stop(mddev, 0, NULL); 5695 } 5696 } 5697 5698 /* 5699 * lets try to run arrays based on all disks that have arrived 5700 * until now. (those are in pending_raid_disks) 5701 * 5702 * the method: pick the first pending disk, collect all disks with 5703 * the same UUID, remove all from the pending list and put them into 5704 * the 'same_array' list. Then order this list based on superblock 5705 * update time (freshest comes first), kick out 'old' disks and 5706 * compare superblocks. If everything's fine then run it. 5707 * 5708 * If "unit" is allocated, then bump its reference count 5709 */ 5710 static void autorun_devices(int part) 5711 { 5712 struct md_rdev *rdev0, *rdev, *tmp; 5713 struct mddev *mddev; 5714 char b[BDEVNAME_SIZE]; 5715 5716 printk(KERN_INFO "md: autorun ...\n"); 5717 while (!list_empty(&pending_raid_disks)) { 5718 int unit; 5719 dev_t dev; 5720 LIST_HEAD(candidates); 5721 rdev0 = list_entry(pending_raid_disks.next, 5722 struct md_rdev, same_set); 5723 5724 printk(KERN_INFO "md: considering %s ...\n", 5725 bdevname(rdev0->bdev,b)); 5726 INIT_LIST_HEAD(&candidates); 5727 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5728 if (super_90_load(rdev, rdev0, 0) >= 0) { 5729 printk(KERN_INFO "md: adding %s ...\n", 5730 bdevname(rdev->bdev,b)); 5731 list_move(&rdev->same_set, &candidates); 5732 } 5733 /* 5734 * now we have a set of devices, with all of them having 5735 * mostly sane superblocks. It's time to allocate the 5736 * mddev. 
5737 */ 5738 if (part) { 5739 dev = MKDEV(mdp_major, 5740 rdev0->preferred_minor << MdpMinorShift); 5741 unit = MINOR(dev) >> MdpMinorShift; 5742 } else { 5743 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5744 unit = MINOR(dev); 5745 } 5746 if (rdev0->preferred_minor != unit) { 5747 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5748 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5749 break; 5750 } 5751 5752 md_probe(dev, NULL, NULL); 5753 mddev = mddev_find(dev); 5754 if (!mddev || !mddev->gendisk) { 5755 if (mddev) 5756 mddev_put(mddev); 5757 printk(KERN_ERR 5758 "md: cannot allocate memory for md drive.\n"); 5759 break; 5760 } 5761 if (mddev_lock(mddev)) 5762 printk(KERN_WARNING "md: %s locked, cannot run\n", 5763 mdname(mddev)); 5764 else if (mddev->raid_disks || mddev->major_version 5765 || !list_empty(&mddev->disks)) { 5766 printk(KERN_WARNING 5767 "md: %s already running, cannot run %s\n", 5768 mdname(mddev), bdevname(rdev0->bdev,b)); 5769 mddev_unlock(mddev); 5770 } else { 5771 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5772 mddev->persistent = 1; 5773 rdev_for_each_list(rdev, tmp, &candidates) { 5774 list_del_init(&rdev->same_set); 5775 if (bind_rdev_to_array(rdev, mddev)) 5776 export_rdev(rdev); 5777 } 5778 autorun_array(mddev); 5779 mddev_unlock(mddev); 5780 } 5781 /* on success, candidates will be empty, on error 5782 * it won't... 5783 */ 5784 rdev_for_each_list(rdev, tmp, &candidates) { 5785 list_del_init(&rdev->same_set); 5786 export_rdev(rdev); 5787 } 5788 mddev_put(mddev); 5789 } 5790 printk(KERN_INFO "md: ... autorun DONE.\n"); 5791 } 5792 #endif /* !MODULE */ 5793 5794 static int get_version(void __user *arg) 5795 { 5796 mdu_version_t ver; 5797 5798 ver.major = MD_MAJOR_VERSION; 5799 ver.minor = MD_MINOR_VERSION; 5800 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5801 5802 if (copy_to_user(arg, &ver, sizeof(ver))) 5803 return -EFAULT; 5804 5805 return 0; 5806 } 5807 5808 static int get_array_info(struct mddev *mddev, void __user *arg) 5809 { 5810 mdu_array_info_t info; 5811 int nr,working,insync,failed,spare; 5812 struct md_rdev *rdev; 5813 5814 nr = working = insync = failed = spare = 0; 5815 rcu_read_lock(); 5816 rdev_for_each_rcu(rdev, mddev) { 5817 nr++; 5818 if (test_bit(Faulty, &rdev->flags)) 5819 failed++; 5820 else { 5821 working++; 5822 if (test_bit(In_sync, &rdev->flags)) 5823 insync++; 5824 else 5825 spare++; 5826 } 5827 } 5828 rcu_read_unlock(); 5829 5830 info.major_version = mddev->major_version; 5831 info.minor_version = mddev->minor_version; 5832 info.patch_version = MD_PATCHLEVEL_VERSION; 5833 info.ctime = mddev->ctime; 5834 info.level = mddev->level; 5835 info.size = mddev->dev_sectors / 2; 5836 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5837 info.size = -1; 5838 info.nr_disks = nr; 5839 info.raid_disks = mddev->raid_disks; 5840 info.md_minor = mddev->md_minor; 5841 info.not_persistent= !mddev->persistent; 5842 5843 info.utime = mddev->utime; 5844 info.state = 0; 5845 if (mddev->in_sync) 5846 info.state = (1<<MD_SB_CLEAN); 5847 if (mddev->bitmap && mddev->bitmap_info.offset) 5848 info.state |= (1<<MD_SB_BITMAP_PRESENT); 5849 if (mddev_is_clustered(mddev)) 5850 info.state |= (1<<MD_SB_CLUSTERED); 5851 info.active_disks = insync; 5852 info.working_disks = working; 5853 info.failed_disks = failed; 5854 info.spare_disks = spare; 5855 5856 info.layout = mddev->layout; 5857 info.chunk_size = mddev->chunk_sectors << 9; 5858 5859 if (copy_to_user(arg, &info, sizeof(info))) 5860 return -EFAULT; 5861 5862 return 0; 5863 } 5864 5865 
static int get_bitmap_file(struct mddev *mddev, void __user * arg) 5866 { 5867 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5868 char *ptr; 5869 int err; 5870 5871 file = kzalloc(sizeof(*file), GFP_NOIO); 5872 if (!file) 5873 return -ENOMEM; 5874 5875 err = 0; 5876 spin_lock(&mddev->lock); 5877 /* bitmap enabled */ 5878 if (mddev->bitmap_info.file) { 5879 ptr = file_path(mddev->bitmap_info.file, file->pathname, 5880 sizeof(file->pathname)); 5881 if (IS_ERR(ptr)) 5882 err = PTR_ERR(ptr); 5883 else 5884 memmove(file->pathname, ptr, 5885 sizeof(file->pathname)-(ptr-file->pathname)); 5886 } 5887 spin_unlock(&mddev->lock); 5888 5889 if (err == 0 && 5890 copy_to_user(arg, file, sizeof(*file))) 5891 err = -EFAULT; 5892 5893 kfree(file); 5894 return err; 5895 } 5896 5897 static int get_disk_info(struct mddev *mddev, void __user * arg) 5898 { 5899 mdu_disk_info_t info; 5900 struct md_rdev *rdev; 5901 5902 if (copy_from_user(&info, arg, sizeof(info))) 5903 return -EFAULT; 5904 5905 rcu_read_lock(); 5906 rdev = md_find_rdev_nr_rcu(mddev, info.number); 5907 if (rdev) { 5908 info.major = MAJOR(rdev->bdev->bd_dev); 5909 info.minor = MINOR(rdev->bdev->bd_dev); 5910 info.raid_disk = rdev->raid_disk; 5911 info.state = 0; 5912 if (test_bit(Faulty, &rdev->flags)) 5913 info.state |= (1<<MD_DISK_FAULTY); 5914 else if (test_bit(In_sync, &rdev->flags)) { 5915 info.state |= (1<<MD_DISK_ACTIVE); 5916 info.state |= (1<<MD_DISK_SYNC); 5917 } 5918 if (test_bit(Journal, &rdev->flags)) 5919 info.state |= (1<<MD_DISK_JOURNAL); 5920 if (test_bit(WriteMostly, &rdev->flags)) 5921 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5922 } else { 5923 info.major = info.minor = 0; 5924 info.raid_disk = -1; 5925 info.state = (1<<MD_DISK_REMOVED); 5926 } 5927 rcu_read_unlock(); 5928 5929 if (copy_to_user(arg, &info, sizeof(info))) 5930 return -EFAULT; 5931 5932 return 0; 5933 } 5934 5935 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 5936 { 5937 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5938 struct md_rdev *rdev; 5939 dev_t dev = MKDEV(info->major,info->minor); 5940 5941 if (mddev_is_clustered(mddev) && 5942 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 5943 pr_err("%s: Cannot add to clustered mddev.\n", 5944 mdname(mddev)); 5945 return -EINVAL; 5946 } 5947 5948 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5949 return -EOVERFLOW; 5950 5951 if (!mddev->raid_disks) { 5952 int err; 5953 /* expecting a device which has a superblock */ 5954 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5955 if (IS_ERR(rdev)) { 5956 printk(KERN_WARNING 5957 "md: md_import_device returned %ld\n", 5958 PTR_ERR(rdev)); 5959 return PTR_ERR(rdev); 5960 } 5961 if (!list_empty(&mddev->disks)) { 5962 struct md_rdev *rdev0 5963 = list_entry(mddev->disks.next, 5964 struct md_rdev, same_set); 5965 err = super_types[mddev->major_version] 5966 .load_super(rdev, rdev0, mddev->minor_version); 5967 if (err < 0) { 5968 printk(KERN_WARNING 5969 "md: %s has different UUID to %s\n", 5970 bdevname(rdev->bdev,b), 5971 bdevname(rdev0->bdev,b2)); 5972 export_rdev(rdev); 5973 return -EINVAL; 5974 } 5975 } 5976 err = bind_rdev_to_array(rdev, mddev); 5977 if (err) 5978 export_rdev(rdev); 5979 return err; 5980 } 5981 5982 /* 5983 * add_new_disk can be used once the array is assembled 5984 * to add "hot spares". 
They must already have a superblock 5985 * written 5986 */ 5987 if (mddev->pers) { 5988 int err; 5989 if (!mddev->pers->hot_add_disk) { 5990 printk(KERN_WARNING 5991 "%s: personality does not support diskops!\n", 5992 mdname(mddev)); 5993 return -EINVAL; 5994 } 5995 if (mddev->persistent) 5996 rdev = md_import_device(dev, mddev->major_version, 5997 mddev->minor_version); 5998 else 5999 rdev = md_import_device(dev, -1, -1); 6000 if (IS_ERR(rdev)) { 6001 printk(KERN_WARNING 6002 "md: md_import_device returned %ld\n", 6003 PTR_ERR(rdev)); 6004 return PTR_ERR(rdev); 6005 } 6006 /* set saved_raid_disk if appropriate */ 6007 if (!mddev->persistent) { 6008 if (info->state & (1<<MD_DISK_SYNC) && 6009 info->raid_disk < mddev->raid_disks) { 6010 rdev->raid_disk = info->raid_disk; 6011 set_bit(In_sync, &rdev->flags); 6012 clear_bit(Bitmap_sync, &rdev->flags); 6013 } else 6014 rdev->raid_disk = -1; 6015 rdev->saved_raid_disk = rdev->raid_disk; 6016 } else 6017 super_types[mddev->major_version]. 6018 validate_super(mddev, rdev); 6019 if ((info->state & (1<<MD_DISK_SYNC)) && 6020 rdev->raid_disk != info->raid_disk) { 6021 /* This was a hot-add request, but events doesn't 6022 * match, so reject it. 6023 */ 6024 export_rdev(rdev); 6025 return -EINVAL; 6026 } 6027 6028 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6029 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6030 set_bit(WriteMostly, &rdev->flags); 6031 else 6032 clear_bit(WriteMostly, &rdev->flags); 6033 6034 if (info->state & (1<<MD_DISK_JOURNAL)) 6035 set_bit(Journal, &rdev->flags); 6036 /* 6037 * check whether the device shows up in other nodes 6038 */ 6039 if (mddev_is_clustered(mddev)) { 6040 if (info->state & (1 << MD_DISK_CANDIDATE)) 6041 set_bit(Candidate, &rdev->flags); 6042 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6043 /* --add initiated by this node */ 6044 err = md_cluster_ops->add_new_disk(mddev, rdev); 6045 if (err) { 6046 export_rdev(rdev); 6047 return err; 6048 } 6049 } 6050 } 6051 6052 rdev->raid_disk = -1; 6053 err = bind_rdev_to_array(rdev, mddev); 6054 6055 if (err) 6056 export_rdev(rdev); 6057 6058 if (mddev_is_clustered(mddev)) { 6059 if (info->state & (1 << MD_DISK_CANDIDATE)) 6060 md_cluster_ops->new_disk_ack(mddev, (err == 0)); 6061 else { 6062 if (err) 6063 md_cluster_ops->add_new_disk_cancel(mddev); 6064 else 6065 err = add_bound_rdev(rdev); 6066 } 6067 6068 } else if (!err) 6069 err = add_bound_rdev(rdev); 6070 6071 return err; 6072 } 6073 6074 /* otherwise, add_new_disk is only allowed 6075 * for major_version==0 superblocks 6076 */ 6077 if (mddev->major_version != 0) { 6078 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 6079 mdname(mddev)); 6080 return -EINVAL; 6081 } 6082 6083 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6084 int err; 6085 rdev = md_import_device(dev, -1, 0); 6086 if (IS_ERR(rdev)) { 6087 printk(KERN_WARNING 6088 "md: error, md_import_device() returned %ld\n", 6089 PTR_ERR(rdev)); 6090 return PTR_ERR(rdev); 6091 } 6092 rdev->desc_nr = info->number; 6093 if (info->raid_disk < mddev->raid_disks) 6094 rdev->raid_disk = info->raid_disk; 6095 else 6096 rdev->raid_disk = -1; 6097 6098 if (rdev->raid_disk < mddev->raid_disks) 6099 if (info->state & (1<<MD_DISK_SYNC)) 6100 set_bit(In_sync, &rdev->flags); 6101 6102 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6103 set_bit(WriteMostly, &rdev->flags); 6104 6105 if (!mddev->persistent) { 6106 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 6107 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6108 } else 6109 
rdev->sb_start = calc_dev_sboffset(rdev); 6110 rdev->sectors = rdev->sb_start; 6111 6112 err = bind_rdev_to_array(rdev, mddev); 6113 if (err) { 6114 export_rdev(rdev); 6115 return err; 6116 } 6117 } 6118 6119 return 0; 6120 } 6121 6122 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6123 { 6124 char b[BDEVNAME_SIZE]; 6125 struct md_rdev *rdev; 6126 int ret = -1; 6127 6128 rdev = find_rdev(mddev, dev); 6129 if (!rdev) 6130 return -ENXIO; 6131 6132 if (mddev_is_clustered(mddev)) 6133 ret = md_cluster_ops->metadata_update_start(mddev); 6134 6135 if (rdev->raid_disk < 0) 6136 goto kick_rdev; 6137 6138 clear_bit(Blocked, &rdev->flags); 6139 remove_and_add_spares(mddev, rdev); 6140 6141 if (rdev->raid_disk >= 0) 6142 goto busy; 6143 6144 kick_rdev: 6145 if (mddev_is_clustered(mddev) && ret == 0) 6146 md_cluster_ops->remove_disk(mddev, rdev); 6147 6148 md_kick_rdev_from_array(rdev); 6149 md_update_sb(mddev, 1); 6150 md_new_event(mddev); 6151 6152 return 0; 6153 busy: 6154 if (mddev_is_clustered(mddev) && ret == 0) 6155 md_cluster_ops->metadata_update_cancel(mddev); 6156 6157 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6158 bdevname(rdev->bdev,b), mdname(mddev)); 6159 return -EBUSY; 6160 } 6161 6162 static int hot_add_disk(struct mddev *mddev, dev_t dev) 6163 { 6164 char b[BDEVNAME_SIZE]; 6165 int err; 6166 struct md_rdev *rdev; 6167 6168 if (!mddev->pers) 6169 return -ENODEV; 6170 6171 if (mddev->major_version != 0) { 6172 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 6173 " version-0 superblocks.\n", 6174 mdname(mddev)); 6175 return -EINVAL; 6176 } 6177 if (!mddev->pers->hot_add_disk) { 6178 printk(KERN_WARNING 6179 "%s: personality does not support diskops!\n", 6180 mdname(mddev)); 6181 return -EINVAL; 6182 } 6183 6184 rdev = md_import_device(dev, -1, 0); 6185 if (IS_ERR(rdev)) { 6186 printk(KERN_WARNING 6187 "md: error, md_import_device() returned %ld\n", 6188 PTR_ERR(rdev)); 6189 return -EINVAL; 6190 } 6191 6192 if (mddev->persistent) 6193 rdev->sb_start = calc_dev_sboffset(rdev); 6194 else 6195 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6196 6197 rdev->sectors = rdev->sb_start; 6198 6199 if (test_bit(Faulty, &rdev->flags)) { 6200 printk(KERN_WARNING 6201 "md: can not hot-add faulty %s disk to %s!\n", 6202 bdevname(rdev->bdev,b), mdname(mddev)); 6203 err = -EINVAL; 6204 goto abort_export; 6205 } 6206 6207 clear_bit(In_sync, &rdev->flags); 6208 rdev->desc_nr = -1; 6209 rdev->saved_raid_disk = -1; 6210 err = bind_rdev_to_array(rdev, mddev); 6211 if (err) 6212 goto abort_export; 6213 6214 /* 6215 * The rest should better be atomic, we can have disk failures 6216 * noticed in interrupt contexts ... 6217 */ 6218 6219 rdev->raid_disk = -1; 6220 6221 md_update_sb(mddev, 1); 6222 /* 6223 * Kick recovery, maybe this spare has to be added to the 6224 * array immediately. 6225 */ 6226 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6227 md_wakeup_thread(mddev->thread); 6228 md_new_event(mddev); 6229 return 0; 6230 6231 abort_export: 6232 export_rdev(rdev); 6233 return err; 6234 } 6235 6236 static int set_bitmap_file(struct mddev *mddev, int fd) 6237 { 6238 int err = 0; 6239 6240 if (mddev->pers) { 6241 if (!mddev->pers->quiesce || !mddev->thread) 6242 return -EBUSY; 6243 if (mddev->recovery || mddev->sync_thread) 6244 return -EBUSY; 6245 /* we should be able to change the bitmap.. 
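* A non-negative fd attaches the regular file fetched below as the
* new bitmap (quiesce, bitmap_create(), bitmap_load(), un-quiesce);
* fd < 0 destroys the existing bitmap and drops its file reference.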
*/ 6246 } 6247 6248 if (fd >= 0) { 6249 struct inode *inode; 6250 struct file *f; 6251 6252 if (mddev->bitmap || mddev->bitmap_info.file) 6253 return -EEXIST; /* cannot add when bitmap is present */ 6254 f = fget(fd); 6255 6256 if (f == NULL) { 6257 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6258 mdname(mddev)); 6259 return -EBADF; 6260 } 6261 6262 inode = f->f_mapping->host; 6263 if (!S_ISREG(inode->i_mode)) { 6264 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6265 mdname(mddev)); 6266 err = -EBADF; 6267 } else if (!(f->f_mode & FMODE_WRITE)) { 6268 printk(KERN_ERR "%s: error: bitmap file must open for write\n", 6269 mdname(mddev)); 6270 err = -EBADF; 6271 } else if (atomic_read(&inode->i_writecount) != 1) { 6272 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6273 mdname(mddev)); 6274 err = -EBUSY; 6275 } 6276 if (err) { 6277 fput(f); 6278 return err; 6279 } 6280 mddev->bitmap_info.file = f; 6281 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6282 } else if (mddev->bitmap == NULL) 6283 return -ENOENT; /* cannot remove what isn't there */ 6284 err = 0; 6285 if (mddev->pers) { 6286 mddev->pers->quiesce(mddev, 1); 6287 if (fd >= 0) { 6288 struct bitmap *bitmap; 6289 6290 bitmap = bitmap_create(mddev, -1); 6291 if (!IS_ERR(bitmap)) { 6292 mddev->bitmap = bitmap; 6293 err = bitmap_load(mddev); 6294 } else 6295 err = PTR_ERR(bitmap); 6296 } 6297 if (fd < 0 || err) { 6298 bitmap_destroy(mddev); 6299 fd = -1; /* make sure to put the file */ 6300 } 6301 mddev->pers->quiesce(mddev, 0); 6302 } 6303 if (fd < 0) { 6304 struct file *f = mddev->bitmap_info.file; 6305 if (f) { 6306 spin_lock(&mddev->lock); 6307 mddev->bitmap_info.file = NULL; 6308 spin_unlock(&mddev->lock); 6309 fput(f); 6310 } 6311 } 6312 6313 return err; 6314 } 6315 6316 /* 6317 * set_array_info is used two different ways 6318 * The original usage is when creating a new array. 6319 * In this usage, raid_disks is > 0 and it together with 6320 * level, size, not_persistent,layout,chunksize determine the 6321 * shape of the array. 6322 * This will always create an array with a type-0.90.0 superblock. 6323 * The newer usage is when assembling an array. 6324 * In this case raid_disks will be 0, and the major_version field is 6325 * use to determine which style super-blocks are to be found on the devices. 6326 * The minor and patch _version numbers are also kept incase the 6327 * super_block handler wishes to interpret them. 6328 */ 6329 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6330 { 6331 6332 if (info->raid_disks == 0) { 6333 /* just setting version number for superblock loading */ 6334 if (info->major_version < 0 || 6335 info->major_version >= ARRAY_SIZE(super_types) || 6336 super_types[info->major_version].name == NULL) { 6337 /* maybe try to auto-load a module? */ 6338 printk(KERN_INFO 6339 "md: superblock version %d not known\n", 6340 info->major_version); 6341 return -EINVAL; 6342 } 6343 mddev->major_version = info->major_version; 6344 mddev->minor_version = info->minor_version; 6345 mddev->patch_version = info->patch_version; 6346 mddev->persistent = !info->not_persistent; 6347 /* ensure mddev_put doesn't delete this now that there 6348 * is some minimal configuration. 
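* (Setting ctime below is that minimal configuration: mddev_put()
* leaves an mddev alone once ctime is non-zero.)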
6349 */
6350 mddev->ctime = get_seconds();
6351 return 0;
6352 }
6353 mddev->major_version = MD_MAJOR_VERSION;
6354 mddev->minor_version = MD_MINOR_VERSION;
6355 mddev->patch_version = MD_PATCHLEVEL_VERSION;
6356 mddev->ctime = get_seconds();
6357
6358 mddev->level = info->level;
6359 mddev->clevel[0] = 0;
6360 mddev->dev_sectors = 2 * (sector_t)info->size;
6361 mddev->raid_disks = info->raid_disks;
6362 /* don't set md_minor, it is determined by which /dev/md* was
6363 * opened
6364 */
6365 if (info->state & (1<<MD_SB_CLEAN))
6366 mddev->recovery_cp = MaxSector;
6367 else
6368 mddev->recovery_cp = 0;
6369 mddev->persistent = ! info->not_persistent;
6370 mddev->external = 0;
6371
6372 mddev->layout = info->layout;
6373 mddev->chunk_sectors = info->chunk_size >> 9;
6374
6375 mddev->max_disks = MD_SB_DISKS;
6376
6377 if (mddev->persistent)
6378 mddev->flags = 0;
6379 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6380
6381 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6382 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6383 mddev->bitmap_info.offset = 0;
6384
6385 mddev->reshape_position = MaxSector;
6386
6387 /*
6388 * Generate a 128-bit UUID
6389 */
6390 get_random_bytes(mddev->uuid, 16);
6391
6392 mddev->new_level = mddev->level;
6393 mddev->new_chunk_sectors = mddev->chunk_sectors;
6394 mddev->new_layout = mddev->layout;
6395 mddev->delta_disks = 0;
6396 mddev->reshape_backwards = 0;
6397
6398 return 0;
6399 }
6400
6401 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6402 {
6403 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6404
6405 if (mddev->external_size)
6406 return;
6407
6408 mddev->array_sectors = array_sectors;
6409 }
6410 EXPORT_SYMBOL(md_set_array_sectors);
6411
6412 static int update_size(struct mddev *mddev, sector_t num_sectors)
6413 {
6414 struct md_rdev *rdev;
6415 int rv;
6416 int fit = (num_sectors == 0);
6417
6418 if (mddev->pers->resize == NULL)
6419 return -EINVAL;
6420 /* The "num_sectors" is the number of sectors of each device that
6421 * is used. This can only make sense for arrays with redundancy.
6422 * linear and raid0 always use whatever space is available. We can only
6423 * consider changing this number if no resync or reconstruction is
6424 * happening, and if the new size is acceptable. It must fit before the
6425 * sb_start or, if that is <data_offset, it must fit before the size
6426 * of each device. If num_sectors is zero, we find the largest size
6427 * that fits.
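* Illustrative numbers: num_sectors is in 512-byte sectors, so capping
* each member at 100GiB would be update_size(mddev, 209715200), while
* update_size(mddev, 0) grows to the largest size every member can hold.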
6428 */ 6429 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6430 mddev->sync_thread) 6431 return -EBUSY; 6432 if (mddev->ro) 6433 return -EROFS; 6434 6435 rdev_for_each(rdev, mddev) { 6436 sector_t avail = rdev->sectors; 6437 6438 if (fit && (num_sectors == 0 || num_sectors > avail)) 6439 num_sectors = avail; 6440 if (avail < num_sectors) 6441 return -ENOSPC; 6442 } 6443 rv = mddev->pers->resize(mddev, num_sectors); 6444 if (!rv) 6445 revalidate_disk(mddev->gendisk); 6446 return rv; 6447 } 6448 6449 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6450 { 6451 int rv; 6452 struct md_rdev *rdev; 6453 /* change the number of raid disks */ 6454 if (mddev->pers->check_reshape == NULL) 6455 return -EINVAL; 6456 if (mddev->ro) 6457 return -EROFS; 6458 if (raid_disks <= 0 || 6459 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6460 return -EINVAL; 6461 if (mddev->sync_thread || 6462 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6463 mddev->reshape_position != MaxSector) 6464 return -EBUSY; 6465 6466 rdev_for_each(rdev, mddev) { 6467 if (mddev->raid_disks < raid_disks && 6468 rdev->data_offset < rdev->new_data_offset) 6469 return -EINVAL; 6470 if (mddev->raid_disks > raid_disks && 6471 rdev->data_offset > rdev->new_data_offset) 6472 return -EINVAL; 6473 } 6474 6475 mddev->delta_disks = raid_disks - mddev->raid_disks; 6476 if (mddev->delta_disks < 0) 6477 mddev->reshape_backwards = 1; 6478 else if (mddev->delta_disks > 0) 6479 mddev->reshape_backwards = 0; 6480 6481 rv = mddev->pers->check_reshape(mddev); 6482 if (rv < 0) { 6483 mddev->delta_disks = 0; 6484 mddev->reshape_backwards = 0; 6485 } 6486 return rv; 6487 } 6488 6489 /* 6490 * update_array_info is used to change the configuration of an 6491 * on-line array. 6492 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6493 * fields in the info are checked against the array. 6494 * Any differences that cannot be handled will cause an error. 6495 * Normally, only one change can be managed at a time. 6496 */ 6497 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6498 { 6499 int rv = 0; 6500 int cnt = 0; 6501 int state = 0; 6502 6503 /* calculate expected state,ignoring low bits */ 6504 if (mddev->bitmap && mddev->bitmap_info.offset) 6505 state |= (1 << MD_SB_BITMAP_PRESENT); 6506 6507 if (mddev->major_version != info->major_version || 6508 mddev->minor_version != info->minor_version || 6509 /* mddev->patch_version != info->patch_version || */ 6510 mddev->ctime != info->ctime || 6511 mddev->level != info->level || 6512 /* mddev->layout != info->layout || */ 6513 mddev->persistent != !info->not_persistent || 6514 mddev->chunk_sectors != info->chunk_size >> 9 || 6515 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6516 ((state^info->state) & 0xfffffe00) 6517 ) 6518 return -EINVAL; 6519 /* Check there is only one change */ 6520 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6521 cnt++; 6522 if (mddev->raid_disks != info->raid_disks) 6523 cnt++; 6524 if (mddev->layout != info->layout) 6525 cnt++; 6526 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6527 cnt++; 6528 if (cnt == 0) 6529 return 0; 6530 if (cnt > 1) 6531 return -EINVAL; 6532 6533 if (mddev->layout != info->layout) { 6534 /* Change layout 6535 * we don't need to do anything at the md level, the 6536 * personality will take care of it all. 
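* (We set new_layout below and call pers->check_reshape(); if that
* fails, new_layout is rolled back to the current layout.)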
6537 */ 6538 if (mddev->pers->check_reshape == NULL) 6539 return -EINVAL; 6540 else { 6541 mddev->new_layout = info->layout; 6542 rv = mddev->pers->check_reshape(mddev); 6543 if (rv) 6544 mddev->new_layout = mddev->layout; 6545 return rv; 6546 } 6547 } 6548 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6549 rv = update_size(mddev, (sector_t)info->size * 2); 6550 6551 if (mddev->raid_disks != info->raid_disks) 6552 rv = update_raid_disks(mddev, info->raid_disks); 6553 6554 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6555 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6556 rv = -EINVAL; 6557 goto err; 6558 } 6559 if (mddev->recovery || mddev->sync_thread) { 6560 rv = -EBUSY; 6561 goto err; 6562 } 6563 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6564 struct bitmap *bitmap; 6565 /* add the bitmap */ 6566 if (mddev->bitmap) { 6567 rv = -EEXIST; 6568 goto err; 6569 } 6570 if (mddev->bitmap_info.default_offset == 0) { 6571 rv = -EINVAL; 6572 goto err; 6573 } 6574 mddev->bitmap_info.offset = 6575 mddev->bitmap_info.default_offset; 6576 mddev->bitmap_info.space = 6577 mddev->bitmap_info.default_space; 6578 mddev->pers->quiesce(mddev, 1); 6579 bitmap = bitmap_create(mddev, -1); 6580 if (!IS_ERR(bitmap)) { 6581 mddev->bitmap = bitmap; 6582 rv = bitmap_load(mddev); 6583 } else 6584 rv = PTR_ERR(bitmap); 6585 if (rv) 6586 bitmap_destroy(mddev); 6587 mddev->pers->quiesce(mddev, 0); 6588 } else { 6589 /* remove the bitmap */ 6590 if (!mddev->bitmap) { 6591 rv = -ENOENT; 6592 goto err; 6593 } 6594 if (mddev->bitmap->storage.file) { 6595 rv = -EINVAL; 6596 goto err; 6597 } 6598 mddev->pers->quiesce(mddev, 1); 6599 bitmap_destroy(mddev); 6600 mddev->pers->quiesce(mddev, 0); 6601 mddev->bitmap_info.offset = 0; 6602 } 6603 } 6604 md_update_sb(mddev, 1); 6605 return rv; 6606 err: 6607 return rv; 6608 } 6609 6610 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6611 { 6612 struct md_rdev *rdev; 6613 int err = 0; 6614 6615 if (mddev->pers == NULL) 6616 return -ENODEV; 6617 6618 rcu_read_lock(); 6619 rdev = find_rdev_rcu(mddev, dev); 6620 if (!rdev) 6621 err = -ENODEV; 6622 else { 6623 md_error(mddev, rdev); 6624 if (!test_bit(Faulty, &rdev->flags)) 6625 err = -EBUSY; 6626 } 6627 rcu_read_unlock(); 6628 return err; 6629 } 6630 6631 /* 6632 * We have a problem here : there is no easy way to give a CHS 6633 * virtual geometry. We currently pretend that we have a 2 heads 6634 * 4 sectors (with a BIG number of cylinders...). This drives 6635 * dosfs just mad... 
;-) 6636 */ 6637 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6638 { 6639 struct mddev *mddev = bdev->bd_disk->private_data; 6640 6641 geo->heads = 2; 6642 geo->sectors = 4; 6643 geo->cylinders = mddev->array_sectors / 8; 6644 return 0; 6645 } 6646 6647 static inline bool md_ioctl_valid(unsigned int cmd) 6648 { 6649 switch (cmd) { 6650 case ADD_NEW_DISK: 6651 case BLKROSET: 6652 case GET_ARRAY_INFO: 6653 case GET_BITMAP_FILE: 6654 case GET_DISK_INFO: 6655 case HOT_ADD_DISK: 6656 case HOT_REMOVE_DISK: 6657 case RAID_AUTORUN: 6658 case RAID_VERSION: 6659 case RESTART_ARRAY_RW: 6660 case RUN_ARRAY: 6661 case SET_ARRAY_INFO: 6662 case SET_BITMAP_FILE: 6663 case SET_DISK_FAULTY: 6664 case STOP_ARRAY: 6665 case STOP_ARRAY_RO: 6666 case CLUSTERED_DISK_NACK: 6667 return true; 6668 default: 6669 return false; 6670 } 6671 } 6672 6673 static int md_ioctl(struct block_device *bdev, fmode_t mode, 6674 unsigned int cmd, unsigned long arg) 6675 { 6676 int err = 0; 6677 void __user *argp = (void __user *)arg; 6678 struct mddev *mddev = NULL; 6679 int ro; 6680 6681 if (!md_ioctl_valid(cmd)) 6682 return -ENOTTY; 6683 6684 switch (cmd) { 6685 case RAID_VERSION: 6686 case GET_ARRAY_INFO: 6687 case GET_DISK_INFO: 6688 break; 6689 default: 6690 if (!capable(CAP_SYS_ADMIN)) 6691 return -EACCES; 6692 } 6693 6694 /* 6695 * Commands dealing with the RAID driver but not any 6696 * particular array: 6697 */ 6698 switch (cmd) { 6699 case RAID_VERSION: 6700 err = get_version(argp); 6701 goto out; 6702 6703 #ifndef MODULE 6704 case RAID_AUTORUN: 6705 err = 0; 6706 autostart_arrays(arg); 6707 goto out; 6708 #endif 6709 default:; 6710 } 6711 6712 /* 6713 * Commands creating/starting a new array: 6714 */ 6715 6716 mddev = bdev->bd_disk->private_data; 6717 6718 if (!mddev) { 6719 BUG(); 6720 goto out; 6721 } 6722 6723 /* Some actions do not requires the mutex */ 6724 switch (cmd) { 6725 case GET_ARRAY_INFO: 6726 if (!mddev->raid_disks && !mddev->external) 6727 err = -ENODEV; 6728 else 6729 err = get_array_info(mddev, argp); 6730 goto out; 6731 6732 case GET_DISK_INFO: 6733 if (!mddev->raid_disks && !mddev->external) 6734 err = -ENODEV; 6735 else 6736 err = get_disk_info(mddev, argp); 6737 goto out; 6738 6739 case SET_DISK_FAULTY: 6740 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6741 goto out; 6742 6743 case GET_BITMAP_FILE: 6744 err = get_bitmap_file(mddev, argp); 6745 goto out; 6746 6747 } 6748 6749 if (cmd == ADD_NEW_DISK) 6750 /* need to ensure md_delayed_delete() has completed */ 6751 flush_workqueue(md_misc_wq); 6752 6753 if (cmd == HOT_REMOVE_DISK) 6754 /* need to ensure recovery thread has run */ 6755 wait_event_interruptible_timeout(mddev->sb_wait, 6756 !test_bit(MD_RECOVERY_NEEDED, 6757 &mddev->flags), 6758 msecs_to_jiffies(5000)); 6759 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6760 /* Need to flush page cache, and ensure no-one else opens 6761 * and writes 6762 */ 6763 mutex_lock(&mddev->open_mutex); 6764 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 6765 mutex_unlock(&mddev->open_mutex); 6766 err = -EBUSY; 6767 goto out; 6768 } 6769 set_bit(MD_STILL_CLOSED, &mddev->flags); 6770 mutex_unlock(&mddev->open_mutex); 6771 sync_blockdev(bdev); 6772 } 6773 err = mddev_lock(mddev); 6774 if (err) { 6775 printk(KERN_INFO 6776 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6777 err, cmd); 6778 goto out; 6779 } 6780 6781 if (cmd == SET_ARRAY_INFO) { 6782 mdu_array_info_t info; 6783 if (!arg) 6784 memset(&info, 0, sizeof(info)); 6785 else if (copy_from_user(&info, argp, 
sizeof(info))) { 6786 err = -EFAULT; 6787 goto unlock; 6788 } 6789 if (mddev->pers) { 6790 err = update_array_info(mddev, &info); 6791 if (err) { 6792 printk(KERN_WARNING "md: couldn't update" 6793 " array info. %d\n", err); 6794 goto unlock; 6795 } 6796 goto unlock; 6797 } 6798 if (!list_empty(&mddev->disks)) { 6799 printk(KERN_WARNING 6800 "md: array %s already has disks!\n", 6801 mdname(mddev)); 6802 err = -EBUSY; 6803 goto unlock; 6804 } 6805 if (mddev->raid_disks) { 6806 printk(KERN_WARNING 6807 "md: array %s already initialised!\n", 6808 mdname(mddev)); 6809 err = -EBUSY; 6810 goto unlock; 6811 } 6812 err = set_array_info(mddev, &info); 6813 if (err) { 6814 printk(KERN_WARNING "md: couldn't set" 6815 " array info. %d\n", err); 6816 goto unlock; 6817 } 6818 goto unlock; 6819 } 6820 6821 /* 6822 * Commands querying/configuring an existing array: 6823 */ 6824 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6825 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6826 if ((!mddev->raid_disks && !mddev->external) 6827 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6828 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6829 && cmd != GET_BITMAP_FILE) { 6830 err = -ENODEV; 6831 goto unlock; 6832 } 6833 6834 /* 6835 * Commands even a read-only array can execute: 6836 */ 6837 switch (cmd) { 6838 case RESTART_ARRAY_RW: 6839 err = restart_array(mddev); 6840 goto unlock; 6841 6842 case STOP_ARRAY: 6843 err = do_md_stop(mddev, 0, bdev); 6844 goto unlock; 6845 6846 case STOP_ARRAY_RO: 6847 err = md_set_readonly(mddev, bdev); 6848 goto unlock; 6849 6850 case HOT_REMOVE_DISK: 6851 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6852 goto unlock; 6853 6854 case ADD_NEW_DISK: 6855 /* We can support ADD_NEW_DISK on read-only arrays 6856 * on if we are re-adding a preexisting device. 6857 * So require mddev->pers and MD_DISK_SYNC. 6858 */ 6859 if (mddev->pers) { 6860 mdu_disk_info_t info; 6861 if (copy_from_user(&info, argp, sizeof(info))) 6862 err = -EFAULT; 6863 else if (!(info.state & (1<<MD_DISK_SYNC))) 6864 /* Need to clear read-only for this */ 6865 break; 6866 else 6867 err = add_new_disk(mddev, &info); 6868 goto unlock; 6869 } 6870 break; 6871 6872 case BLKROSET: 6873 if (get_user(ro, (int __user *)(arg))) { 6874 err = -EFAULT; 6875 goto unlock; 6876 } 6877 err = -EINVAL; 6878 6879 /* if the bdev is going readonly the value of mddev->ro 6880 * does not matter, no writes are coming 6881 */ 6882 if (ro) 6883 goto unlock; 6884 6885 /* are we are already prepared for writes? */ 6886 if (mddev->ro != 1) 6887 goto unlock; 6888 6889 /* transitioning to readauto need only happen for 6890 * arrays that call md_write_start 6891 */ 6892 if (mddev->pers) { 6893 err = restart_array(mddev); 6894 if (err == 0) { 6895 mddev->ro = 2; 6896 set_disk_ro(mddev->gendisk, 0); 6897 } 6898 } 6899 goto unlock; 6900 } 6901 6902 /* 6903 * The remaining ioctls are changing the state of the 6904 * superblock, so we do not allow them on read-only arrays. 6905 */ 6906 if (mddev->ro && mddev->pers) { 6907 if (mddev->ro == 2) { 6908 mddev->ro = 0; 6909 sysfs_notify_dirent_safe(mddev->sysfs_state); 6910 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6911 /* mddev_unlock will wake thread */ 6912 /* If a device failed while we were read-only, we 6913 * need to make sure the metadata is updated now. 
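* We drop the reconfig mutex, wait for MD_CHANGE_DEVS and
* MD_CHANGE_PENDING to clear (i.e. for the superblocks to be
* written out) and then re-take the mutex before going on.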
6914 */ 6915 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 6916 mddev_unlock(mddev); 6917 wait_event(mddev->sb_wait, 6918 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6919 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6920 mddev_lock_nointr(mddev); 6921 } 6922 } else { 6923 err = -EROFS; 6924 goto unlock; 6925 } 6926 } 6927 6928 switch (cmd) { 6929 case ADD_NEW_DISK: 6930 { 6931 mdu_disk_info_t info; 6932 if (copy_from_user(&info, argp, sizeof(info))) 6933 err = -EFAULT; 6934 else 6935 err = add_new_disk(mddev, &info); 6936 goto unlock; 6937 } 6938 6939 case CLUSTERED_DISK_NACK: 6940 if (mddev_is_clustered(mddev)) 6941 md_cluster_ops->new_disk_ack(mddev, false); 6942 else 6943 err = -EINVAL; 6944 goto unlock; 6945 6946 case HOT_ADD_DISK: 6947 err = hot_add_disk(mddev, new_decode_dev(arg)); 6948 goto unlock; 6949 6950 case RUN_ARRAY: 6951 err = do_md_run(mddev); 6952 goto unlock; 6953 6954 case SET_BITMAP_FILE: 6955 err = set_bitmap_file(mddev, (int)arg); 6956 goto unlock; 6957 6958 default: 6959 err = -EINVAL; 6960 goto unlock; 6961 } 6962 6963 unlock: 6964 if (mddev->hold_active == UNTIL_IOCTL && 6965 err != -EINVAL) 6966 mddev->hold_active = 0; 6967 mddev_unlock(mddev); 6968 out: 6969 return err; 6970 } 6971 #ifdef CONFIG_COMPAT 6972 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 6973 unsigned int cmd, unsigned long arg) 6974 { 6975 switch (cmd) { 6976 case HOT_REMOVE_DISK: 6977 case HOT_ADD_DISK: 6978 case SET_DISK_FAULTY: 6979 case SET_BITMAP_FILE: 6980 /* These take in integer arg, do not convert */ 6981 break; 6982 default: 6983 arg = (unsigned long)compat_ptr(arg); 6984 break; 6985 } 6986 6987 return md_ioctl(bdev, mode, cmd, arg); 6988 } 6989 #endif /* CONFIG_COMPAT */ 6990 6991 static int md_open(struct block_device *bdev, fmode_t mode) 6992 { 6993 /* 6994 * Succeed if we can lock the mddev, which confirms that 6995 * it isn't being stopped right now. 6996 */ 6997 struct mddev *mddev = mddev_find(bdev->bd_dev); 6998 int err; 6999 7000 if (!mddev) 7001 return -ENODEV; 7002 7003 if (mddev->gendisk != bdev->bd_disk) { 7004 /* we are racing with mddev_put which is discarding this 7005 * bd_disk. 
7006 */ 7007 mddev_put(mddev); 7008 /* Wait until bdev->bd_disk is definitely gone */ 7009 flush_workqueue(md_misc_wq); 7010 /* Then retry the open from the top */ 7011 return -ERESTARTSYS; 7012 } 7013 BUG_ON(mddev != bdev->bd_disk->private_data); 7014 7015 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7016 goto out; 7017 7018 err = 0; 7019 atomic_inc(&mddev->openers); 7020 clear_bit(MD_STILL_CLOSED, &mddev->flags); 7021 mutex_unlock(&mddev->open_mutex); 7022 7023 check_disk_change(bdev); 7024 out: 7025 return err; 7026 } 7027 7028 static void md_release(struct gendisk *disk, fmode_t mode) 7029 { 7030 struct mddev *mddev = disk->private_data; 7031 7032 BUG_ON(!mddev); 7033 atomic_dec(&mddev->openers); 7034 mddev_put(mddev); 7035 } 7036 7037 static int md_media_changed(struct gendisk *disk) 7038 { 7039 struct mddev *mddev = disk->private_data; 7040 7041 return mddev->changed; 7042 } 7043 7044 static int md_revalidate(struct gendisk *disk) 7045 { 7046 struct mddev *mddev = disk->private_data; 7047 7048 mddev->changed = 0; 7049 return 0; 7050 } 7051 static const struct block_device_operations md_fops = 7052 { 7053 .owner = THIS_MODULE, 7054 .open = md_open, 7055 .release = md_release, 7056 .ioctl = md_ioctl, 7057 #ifdef CONFIG_COMPAT 7058 .compat_ioctl = md_compat_ioctl, 7059 #endif 7060 .getgeo = md_getgeo, 7061 .media_changed = md_media_changed, 7062 .revalidate_disk= md_revalidate, 7063 }; 7064 7065 static int md_thread(void *arg) 7066 { 7067 struct md_thread *thread = arg; 7068 7069 /* 7070 * md_thread is a 'system-thread', it's priority should be very 7071 * high. We avoid resource deadlocks individually in each 7072 * raid personality. (RAID5 does preallocation) We also use RR and 7073 * the very same RT priority as kswapd, thus we will never get 7074 * into a priority inversion deadlock. 7075 * 7076 * we definitely have to have equal or higher priority than 7077 * bdflush, otherwise bdflush will deadlock if there are too 7078 * many dirty RAID5 blocks. 7079 */ 7080 7081 allow_signal(SIGKILL); 7082 while (!kthread_should_stop()) { 7083 7084 /* We need to wait INTERRUPTIBLE so that 7085 * we don't add to the load-average. 
7086 * That means we need to be sure no signals are 7087 * pending 7088 */ 7089 if (signal_pending(current)) 7090 flush_signals(current); 7091 7092 wait_event_interruptible_timeout 7093 (thread->wqueue, 7094 test_bit(THREAD_WAKEUP, &thread->flags) 7095 || kthread_should_stop(), 7096 thread->timeout); 7097 7098 clear_bit(THREAD_WAKEUP, &thread->flags); 7099 if (!kthread_should_stop()) 7100 thread->run(thread); 7101 } 7102 7103 return 0; 7104 } 7105 7106 void md_wakeup_thread(struct md_thread *thread) 7107 { 7108 if (thread) { 7109 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7110 set_bit(THREAD_WAKEUP, &thread->flags); 7111 wake_up(&thread->wqueue); 7112 } 7113 } 7114 EXPORT_SYMBOL(md_wakeup_thread); 7115 7116 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7117 struct mddev *mddev, const char *name) 7118 { 7119 struct md_thread *thread; 7120 7121 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7122 if (!thread) 7123 return NULL; 7124 7125 init_waitqueue_head(&thread->wqueue); 7126 7127 thread->run = run; 7128 thread->mddev = mddev; 7129 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7130 thread->tsk = kthread_run(md_thread, thread, 7131 "%s_%s", 7132 mdname(thread->mddev), 7133 name); 7134 if (IS_ERR(thread->tsk)) { 7135 kfree(thread); 7136 return NULL; 7137 } 7138 return thread; 7139 } 7140 EXPORT_SYMBOL(md_register_thread); 7141 7142 void md_unregister_thread(struct md_thread **threadp) 7143 { 7144 struct md_thread *thread = *threadp; 7145 if (!thread) 7146 return; 7147 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7148 /* Locking ensures that mddev_unlock does not wake_up a 7149 * non-existent thread 7150 */ 7151 spin_lock(&pers_lock); 7152 *threadp = NULL; 7153 spin_unlock(&pers_lock); 7154 7155 kthread_stop(thread->tsk); 7156 kfree(thread); 7157 } 7158 EXPORT_SYMBOL(md_unregister_thread); 7159 7160 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7161 { 7162 if (!rdev || test_bit(Faulty, &rdev->flags)) 7163 return; 7164 7165 if (!mddev->pers || !mddev->pers->error_handler) 7166 return; 7167 mddev->pers->error_handler(mddev,rdev); 7168 if (mddev->degraded) 7169 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7170 sysfs_notify_dirent_safe(rdev->sysfs_state); 7171 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7172 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7173 md_wakeup_thread(mddev->thread); 7174 if (mddev->event_work.func) 7175 queue_work(md_misc_wq, &mddev->event_work); 7176 md_new_event_inintr(mddev); 7177 } 7178 EXPORT_SYMBOL(md_error); 7179 7180 /* seq_file implementation /proc/mdstat */ 7181 7182 static void status_unused(struct seq_file *seq) 7183 { 7184 int i = 0; 7185 struct md_rdev *rdev; 7186 7187 seq_printf(seq, "unused devices: "); 7188 7189 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7190 char b[BDEVNAME_SIZE]; 7191 i++; 7192 seq_printf(seq, "%s ", 7193 bdevname(rdev->bdev,b)); 7194 } 7195 if (!i) 7196 seq_printf(seq, "<none>"); 7197 7198 seq_printf(seq, "\n"); 7199 } 7200 7201 static int status_resync(struct seq_file *seq, struct mddev *mddev) 7202 { 7203 sector_t max_sectors, resync, res; 7204 unsigned long dt, db; 7205 sector_t rt; 7206 int scale; 7207 unsigned int per_milli; 7208 7209 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7210 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7211 max_sectors = mddev->resync_max_sectors; 7212 else 7213 max_sectors = mddev->dev_sectors; 7214 7215 resync = mddev->curr_resync; 7216 if (resync <= 3) { 7217 if 
(test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7218 /* Still cleaning up */ 7219 resync = max_sectors; 7220 } else 7221 resync -= atomic_read(&mddev->recovery_active); 7222 7223 if (resync == 0) { 7224 if (mddev->recovery_cp < MaxSector) { 7225 seq_printf(seq, "\tresync=PENDING"); 7226 return 1; 7227 } 7228 return 0; 7229 } 7230 if (resync < 3) { 7231 seq_printf(seq, "\tresync=DELAYED"); 7232 return 1; 7233 } 7234 7235 WARN_ON(max_sectors == 0); 7236 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7237 * in a sector_t, and (max_sectors>>scale) will fit in a 7238 * u32, as those are the requirements for sector_div. 7239 * Thus 'scale' must be at least 10 7240 */ 7241 scale = 10; 7242 if (sizeof(sector_t) > sizeof(unsigned long)) { 7243 while ( max_sectors/2 > (1ULL<<(scale+32))) 7244 scale++; 7245 } 7246 res = (resync>>scale)*1000; 7247 sector_div(res, (u32)((max_sectors>>scale)+1)); 7248 7249 per_milli = res; 7250 { 7251 int i, x = per_milli/50, y = 20-x; 7252 seq_printf(seq, "["); 7253 for (i = 0; i < x; i++) 7254 seq_printf(seq, "="); 7255 seq_printf(seq, ">"); 7256 for (i = 0; i < y; i++) 7257 seq_printf(seq, "."); 7258 seq_printf(seq, "] "); 7259 } 7260 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7261 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7262 "reshape" : 7263 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7264 "check" : 7265 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7266 "resync" : "recovery"))), 7267 per_milli/10, per_milli % 10, 7268 (unsigned long long) resync/2, 7269 (unsigned long long) max_sectors/2); 7270 7271 /* 7272 * dt: time from mark until now 7273 * db: blocks written from mark until now 7274 * rt: remaining time 7275 * 7276 * rt is a sector_t, so could be 32bit or 64bit. 7277 * So we divide before multiply in case it is 32bit and close 7278 * to the limit. 7279 * We scale the divisor (db) by 32 to avoid losing precision 7280 * near the end of resync when the number of remaining sectors 7281 * is close to 'db'. 7282 * We then divide rt by 32 after multiplying by db to compensate. 7283 * The '+1' avoids division by zero if db is very small. 
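* Illustrative numbers: dt = 10s, db = 20480 sectors since the mark and
* 2,000,000 sectors remaining give rt = (2000000/641)*10 >> 5 = 975s,
* printed as "finish=16.2min", with "speed=1024K/sec" (db/2/dt) below.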
7284 */ 7285 dt = ((jiffies - mddev->resync_mark) / HZ); 7286 if (!dt) dt++; 7287 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7288 - mddev->resync_mark_cnt; 7289 7290 rt = max_sectors - resync; /* number of remaining sectors */ 7291 sector_div(rt, db/32+1); 7292 rt *= dt; 7293 rt >>= 5; 7294 7295 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7296 ((unsigned long)rt % 60)/6); 7297 7298 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7299 return 1; 7300 } 7301 7302 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7303 { 7304 struct list_head *tmp; 7305 loff_t l = *pos; 7306 struct mddev *mddev; 7307 7308 if (l >= 0x10000) 7309 return NULL; 7310 if (!l--) 7311 /* header */ 7312 return (void*)1; 7313 7314 spin_lock(&all_mddevs_lock); 7315 list_for_each(tmp,&all_mddevs) 7316 if (!l--) { 7317 mddev = list_entry(tmp, struct mddev, all_mddevs); 7318 mddev_get(mddev); 7319 spin_unlock(&all_mddevs_lock); 7320 return mddev; 7321 } 7322 spin_unlock(&all_mddevs_lock); 7323 if (!l--) 7324 return (void*)2;/* tail */ 7325 return NULL; 7326 } 7327 7328 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7329 { 7330 struct list_head *tmp; 7331 struct mddev *next_mddev, *mddev = v; 7332 7333 ++*pos; 7334 if (v == (void*)2) 7335 return NULL; 7336 7337 spin_lock(&all_mddevs_lock); 7338 if (v == (void*)1) 7339 tmp = all_mddevs.next; 7340 else 7341 tmp = mddev->all_mddevs.next; 7342 if (tmp != &all_mddevs) 7343 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7344 else { 7345 next_mddev = (void*)2; 7346 *pos = 0x10000; 7347 } 7348 spin_unlock(&all_mddevs_lock); 7349 7350 if (v != (void*)1) 7351 mddev_put(mddev); 7352 return next_mddev; 7353 7354 } 7355 7356 static void md_seq_stop(struct seq_file *seq, void *v) 7357 { 7358 struct mddev *mddev = v; 7359 7360 if (mddev && v != (void*)1 && v != (void*)2) 7361 mddev_put(mddev); 7362 } 7363 7364 static int md_seq_show(struct seq_file *seq, void *v) 7365 { 7366 struct mddev *mddev = v; 7367 sector_t sectors; 7368 struct md_rdev *rdev; 7369 7370 if (v == (void*)1) { 7371 struct md_personality *pers; 7372 seq_printf(seq, "Personalities : "); 7373 spin_lock(&pers_lock); 7374 list_for_each_entry(pers, &pers_list, list) 7375 seq_printf(seq, "[%s] ", pers->name); 7376 7377 spin_unlock(&pers_lock); 7378 seq_printf(seq, "\n"); 7379 seq->poll_event = atomic_read(&md_event_count); 7380 return 0; 7381 } 7382 if (v == (void*)2) { 7383 status_unused(seq); 7384 return 0; 7385 } 7386 7387 spin_lock(&mddev->lock); 7388 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7389 seq_printf(seq, "%s : %sactive", mdname(mddev), 7390 mddev->pers ? 
"" : "in"); 7391 if (mddev->pers) { 7392 if (mddev->ro==1) 7393 seq_printf(seq, " (read-only)"); 7394 if (mddev->ro==2) 7395 seq_printf(seq, " (auto-read-only)"); 7396 seq_printf(seq, " %s", mddev->pers->name); 7397 } 7398 7399 sectors = 0; 7400 rcu_read_lock(); 7401 rdev_for_each_rcu(rdev, mddev) { 7402 char b[BDEVNAME_SIZE]; 7403 seq_printf(seq, " %s[%d]", 7404 bdevname(rdev->bdev,b), rdev->desc_nr); 7405 if (test_bit(WriteMostly, &rdev->flags)) 7406 seq_printf(seq, "(W)"); 7407 if (test_bit(Journal, &rdev->flags)) 7408 seq_printf(seq, "(J)"); 7409 if (test_bit(Faulty, &rdev->flags)) { 7410 seq_printf(seq, "(F)"); 7411 continue; 7412 } 7413 if (rdev->raid_disk < 0) 7414 seq_printf(seq, "(S)"); /* spare */ 7415 if (test_bit(Replacement, &rdev->flags)) 7416 seq_printf(seq, "(R)"); 7417 sectors += rdev->sectors; 7418 } 7419 rcu_read_unlock(); 7420 7421 if (!list_empty(&mddev->disks)) { 7422 if (mddev->pers) 7423 seq_printf(seq, "\n %llu blocks", 7424 (unsigned long long) 7425 mddev->array_sectors / 2); 7426 else 7427 seq_printf(seq, "\n %llu blocks", 7428 (unsigned long long)sectors / 2); 7429 } 7430 if (mddev->persistent) { 7431 if (mddev->major_version != 0 || 7432 mddev->minor_version != 90) { 7433 seq_printf(seq," super %d.%d", 7434 mddev->major_version, 7435 mddev->minor_version); 7436 } 7437 } else if (mddev->external) 7438 seq_printf(seq, " super external:%s", 7439 mddev->metadata_type); 7440 else 7441 seq_printf(seq, " super non-persistent"); 7442 7443 if (mddev->pers) { 7444 mddev->pers->status(seq, mddev); 7445 seq_printf(seq, "\n "); 7446 if (mddev->pers->sync_request) { 7447 if (status_resync(seq, mddev)) 7448 seq_printf(seq, "\n "); 7449 } 7450 } else 7451 seq_printf(seq, "\n "); 7452 7453 bitmap_status(seq, mddev->bitmap); 7454 7455 seq_printf(seq, "\n"); 7456 } 7457 spin_unlock(&mddev->lock); 7458 7459 return 0; 7460 } 7461 7462 static const struct seq_operations md_seq_ops = { 7463 .start = md_seq_start, 7464 .next = md_seq_next, 7465 .stop = md_seq_stop, 7466 .show = md_seq_show, 7467 }; 7468 7469 static int md_seq_open(struct inode *inode, struct file *file) 7470 { 7471 struct seq_file *seq; 7472 int error; 7473 7474 error = seq_open(file, &md_seq_ops); 7475 if (error) 7476 return error; 7477 7478 seq = file->private_data; 7479 seq->poll_event = atomic_read(&md_event_count); 7480 return error; 7481 } 7482 7483 static int md_unloading; 7484 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7485 { 7486 struct seq_file *seq = filp->private_data; 7487 int mask; 7488 7489 if (md_unloading) 7490 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7491 poll_wait(filp, &md_event_waiters, wait); 7492 7493 /* always allow read */ 7494 mask = POLLIN | POLLRDNORM; 7495 7496 if (seq->poll_event != atomic_read(&md_event_count)) 7497 mask |= POLLERR | POLLPRI; 7498 return mask; 7499 } 7500 7501 static const struct file_operations md_seq_fops = { 7502 .owner = THIS_MODULE, 7503 .open = md_seq_open, 7504 .read = seq_read, 7505 .llseek = seq_lseek, 7506 .release = seq_release_private, 7507 .poll = mdstat_poll, 7508 }; 7509 7510 int register_md_personality(struct md_personality *p) 7511 { 7512 printk(KERN_INFO "md: %s personality registered for level %d\n", 7513 p->name, p->level); 7514 spin_lock(&pers_lock); 7515 list_add_tail(&p->list, &pers_list); 7516 spin_unlock(&pers_lock); 7517 return 0; 7518 } 7519 EXPORT_SYMBOL(register_md_personality); 7520 7521 int unregister_md_personality(struct md_personality *p) 7522 { 7523 printk(KERN_INFO "md: %s personality unregistered\n", 
p->name); 7524 spin_lock(&pers_lock); 7525 list_del_init(&p->list); 7526 spin_unlock(&pers_lock); 7527 return 0; 7528 } 7529 EXPORT_SYMBOL(unregister_md_personality); 7530 7531 int register_md_cluster_operations(struct md_cluster_operations *ops, 7532 struct module *module) 7533 { 7534 int ret = 0; 7535 spin_lock(&pers_lock); 7536 if (md_cluster_ops != NULL) 7537 ret = -EALREADY; 7538 else { 7539 md_cluster_ops = ops; 7540 md_cluster_mod = module; 7541 } 7542 spin_unlock(&pers_lock); 7543 return ret; 7544 } 7545 EXPORT_SYMBOL(register_md_cluster_operations); 7546 7547 int unregister_md_cluster_operations(void) 7548 { 7549 spin_lock(&pers_lock); 7550 md_cluster_ops = NULL; 7551 spin_unlock(&pers_lock); 7552 return 0; 7553 } 7554 EXPORT_SYMBOL(unregister_md_cluster_operations); 7555 7556 int md_setup_cluster(struct mddev *mddev, int nodes) 7557 { 7558 int err; 7559 7560 err = request_module("md-cluster"); 7561 if (err) { 7562 pr_err("md-cluster module not found.\n"); 7563 return -ENOENT; 7564 } 7565 7566 spin_lock(&pers_lock); 7567 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7568 spin_unlock(&pers_lock); 7569 return -ENOENT; 7570 } 7571 spin_unlock(&pers_lock); 7572 7573 return md_cluster_ops->join(mddev, nodes); 7574 } 7575 7576 void md_cluster_stop(struct mddev *mddev) 7577 { 7578 if (!md_cluster_ops) 7579 return; 7580 md_cluster_ops->leave(mddev); 7581 module_put(md_cluster_mod); 7582 } 7583 7584 static int is_mddev_idle(struct mddev *mddev, int init) 7585 { 7586 struct md_rdev *rdev; 7587 int idle; 7588 int curr_events; 7589 7590 idle = 1; 7591 rcu_read_lock(); 7592 rdev_for_each_rcu(rdev, mddev) { 7593 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7594 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7595 (int)part_stat_read(&disk->part0, sectors[1]) - 7596 atomic_read(&disk->sync_io); 7597 /* sync IO will cause sync_io to increase before the disk_stats 7598 * as sync_io is counted when a request starts, and 7599 * disk_stats is counted when it completes. 7600 * So resync activity will cause curr_events to be smaller than 7601 * when there was no such activity. 7602 * non-sync IO will cause disk_stat to increase without 7603 * increasing sync_io so curr_events will (eventually) 7604 * be larger than it was before. Once it becomes 7605 * substantially larger, the test below will cause 7606 * the array to appear non-idle, and resync will slow 7607 * down. 7608 * If there is a lot of outstanding resync activity when 7609 * we set last_event to curr_events, then all that activity 7610 * completing might cause the array to appear non-idle 7611 * and resync will be slowed down even though there might 7612 * not have been non-resync activity. This will only 7613 * happen once though. 'last_events' will soon reflect 7614 * the state where there is little or no outstanding 7615 * resync requests, and further resync activity will 7616 * always make curr_events less than last_events. 
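* Illustrative: pure resync traffic leaves curr_events roughly where
* last_events was (the submit-side sync_io count cancels the completed
* sectors), whereas 128 sectors of ordinary I/O push the difference
* past the 64-sector threshold below and the array is treated as busy.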
7617 * 7618 */ 7619 if (init || curr_events - rdev->last_events > 64) { 7620 rdev->last_events = curr_events; 7621 idle = 0; 7622 } 7623 } 7624 rcu_read_unlock(); 7625 return idle; 7626 } 7627 7628 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7629 { 7630 /* another "blocks" (512byte) blocks have been synced */ 7631 atomic_sub(blocks, &mddev->recovery_active); 7632 wake_up(&mddev->recovery_wait); 7633 if (!ok) { 7634 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7635 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7636 md_wakeup_thread(mddev->thread); 7637 // stop recovery, signal do_sync .... 7638 } 7639 } 7640 EXPORT_SYMBOL(md_done_sync); 7641 7642 /* md_write_start(mddev, bi) 7643 * If we need to update some array metadata (e.g. 'active' flag 7644 * in superblock) before writing, schedule a superblock update 7645 * and wait for it to complete. 7646 */ 7647 void md_write_start(struct mddev *mddev, struct bio *bi) 7648 { 7649 int did_change = 0; 7650 if (bio_data_dir(bi) != WRITE) 7651 return; 7652 7653 BUG_ON(mddev->ro == 1); 7654 if (mddev->ro == 2) { 7655 /* need to switch to read/write */ 7656 mddev->ro = 0; 7657 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7658 md_wakeup_thread(mddev->thread); 7659 md_wakeup_thread(mddev->sync_thread); 7660 did_change = 1; 7661 } 7662 atomic_inc(&mddev->writes_pending); 7663 if (mddev->safemode == 1) 7664 mddev->safemode = 0; 7665 if (mddev->in_sync) { 7666 spin_lock(&mddev->lock); 7667 if (mddev->in_sync) { 7668 mddev->in_sync = 0; 7669 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7670 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7671 md_wakeup_thread(mddev->thread); 7672 did_change = 1; 7673 } 7674 spin_unlock(&mddev->lock); 7675 } 7676 if (did_change) 7677 sysfs_notify_dirent_safe(mddev->sysfs_state); 7678 wait_event(mddev->sb_wait, 7679 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7680 } 7681 EXPORT_SYMBOL(md_write_start); 7682 7683 void md_write_end(struct mddev *mddev) 7684 { 7685 if (atomic_dec_and_test(&mddev->writes_pending)) { 7686 if (mddev->safemode == 2) 7687 md_wakeup_thread(mddev->thread); 7688 else if (mddev->safemode_delay) 7689 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7690 } 7691 } 7692 EXPORT_SYMBOL(md_write_end); 7693 7694 /* md_allow_write(mddev) 7695 * Calling this ensures that the array is marked 'active' so that writes 7696 * may proceed without blocking. It is important to call this before 7697 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7698 * Must be called with mddev_lock held. 7699 * 7700 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 7701 * is dropped, so return -EAGAIN after notifying userspace. 
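*
* A sketch of the intended call pattern (illustrative, not taken from
* a specific caller):
*
*	err = md_allow_write(mddev);	// mddev_lock already held
*	if (err)			// -EAGAIN: external metadata
*		return err;		// caller decides how to retry
*	ptr = kmalloc(size, GFP_KERNEL);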
7702 */
7703 int md_allow_write(struct mddev *mddev)
7704 {
7705 if (!mddev->pers)
7706 return 0;
7707 if (mddev->ro)
7708 return 0;
7709 if (!mddev->pers->sync_request)
7710 return 0;
7711
7712 spin_lock(&mddev->lock);
7713 if (mddev->in_sync) {
7714 mddev->in_sync = 0;
7715 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7716 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7717 if (mddev->safemode_delay &&
7718 mddev->safemode == 0)
7719 mddev->safemode = 1;
7720 spin_unlock(&mddev->lock);
7721 md_update_sb(mddev, 0);
7722 sysfs_notify_dirent_safe(mddev->sysfs_state);
7723 } else
7724 spin_unlock(&mddev->lock);
7725
7726 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7727 return -EAGAIN;
7728 else
7729 return 0;
7730 }
7731 EXPORT_SYMBOL_GPL(md_allow_write);
7732
7733 #define SYNC_MARKS 10
7734 #define SYNC_MARK_STEP (3*HZ)
7735 #define UPDATE_FREQUENCY (5*60*HZ)
7736 void md_do_sync(struct md_thread *thread)
7737 {
7738 struct mddev *mddev = thread->mddev;
7739 struct mddev *mddev2;
7740 unsigned int currspeed = 0,
7741 window;
7742 sector_t max_sectors,j, io_sectors, recovery_done;
7743 unsigned long mark[SYNC_MARKS];
7744 unsigned long update_time;
7745 sector_t mark_cnt[SYNC_MARKS];
7746 int last_mark,m;
7747 struct list_head *tmp;
7748 sector_t last_check;
7749 int skipped = 0;
7750 struct md_rdev *rdev;
7751 char *desc, *action = NULL;
7752 struct blk_plug plug;
7753 bool cluster_resync_finished = false;
7754
7755 /* just in case thread restarts... */
7756 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7757 return;
7758 if (mddev->ro) {/* never try to sync a read-only array */
7759 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7760 return;
7761 }
7762
7763 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7764 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7765 desc = "data-check";
7766 action = "check";
7767 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7768 desc = "requested-resync";
7769 action = "repair";
7770 } else
7771 desc = "resync";
7772 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7773 desc = "reshape";
7774 else
7775 desc = "recovery";
7776
7777 mddev->last_sync_action = action ?: desc;
7778
7779 /* we overload curr_resync somewhat here.
7780 * 0 == not engaged in resync at all
7781 * 2 == checking that there is no conflict with another sync
7782 * 1 == like 2, but have yielded to allow conflicting resync to
7783 * commence
7784 * other == active in resync - this many blocks
7785 *
7786 * Before starting a resync we must have set curr_resync to
7787 * 2, and then checked that every "conflicting" array has curr_resync
7788 * less than ours. When we find one that is the same or higher
7789 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7790 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
7791 * This will mean we have to start checking from the beginning again.
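* Illustrative: if md0 and md1 share a physical disk and both reach
* curr_resync == 2, the mddev with the lower address yields by dropping
* to 1; whichever array still sees a conflict then prints the "delaying"
* message, sleeps on resync_wait and restarts the check from try_again
* when woken.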
7792 * 7793 */ 7794 7795 do { 7796 mddev->curr_resync = 2; 7797 7798 try_again: 7799 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7800 goto skip; 7801 for_each_mddev(mddev2, tmp) { 7802 if (mddev2 == mddev) 7803 continue; 7804 if (!mddev->parallel_resync 7805 && mddev2->curr_resync 7806 && match_mddev_units(mddev, mddev2)) { 7807 DEFINE_WAIT(wq); 7808 if (mddev < mddev2 && mddev->curr_resync == 2) { 7809 /* arbitrarily yield */ 7810 mddev->curr_resync = 1; 7811 wake_up(&resync_wait); 7812 } 7813 if (mddev > mddev2 && mddev->curr_resync == 1) 7814 /* no need to wait here, we can wait the next 7815 * time 'round when curr_resync == 2 7816 */ 7817 continue; 7818 /* We need to wait 'interruptible' so as not to 7819 * contribute to the load average, and not to 7820 * be caught by 'softlockup' 7821 */ 7822 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7823 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7824 mddev2->curr_resync >= mddev->curr_resync) { 7825 printk(KERN_INFO "md: delaying %s of %s" 7826 " until %s has finished (they" 7827 " share one or more physical units)\n", 7828 desc, mdname(mddev), mdname(mddev2)); 7829 mddev_put(mddev2); 7830 if (signal_pending(current)) 7831 flush_signals(current); 7832 schedule(); 7833 finish_wait(&resync_wait, &wq); 7834 goto try_again; 7835 } 7836 finish_wait(&resync_wait, &wq); 7837 } 7838 } 7839 } while (mddev->curr_resync < 2); 7840 7841 j = 0; 7842 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7843 /* resync follows the size requested by the personality, 7844 * which defaults to physical size, but can be virtual size 7845 */ 7846 max_sectors = mddev->resync_max_sectors; 7847 atomic64_set(&mddev->resync_mismatches, 0); 7848 /* we don't use the checkpoint if there's a bitmap */ 7849 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7850 j = mddev->resync_min; 7851 else if (!mddev->bitmap) 7852 j = mddev->recovery_cp; 7853 7854 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7855 max_sectors = mddev->resync_max_sectors; 7856 else { 7857 /* recovery follows the physical size of devices */ 7858 max_sectors = mddev->dev_sectors; 7859 j = MaxSector; 7860 rcu_read_lock(); 7861 rdev_for_each_rcu(rdev, mddev) 7862 if (rdev->raid_disk >= 0 && 7863 !test_bit(Journal, &rdev->flags) && 7864 !test_bit(Faulty, &rdev->flags) && 7865 !test_bit(In_sync, &rdev->flags) && 7866 rdev->recovery_offset < j) 7867 j = rdev->recovery_offset; 7868 rcu_read_unlock(); 7869 7870 /* If there is a bitmap, we need to make sure all 7871 * writes that started before we added a spare 7872 * complete before we start doing a recovery. 7873 * Otherwise the write might complete and (via 7874 * bitmap_endwrite) set a bit in the bitmap after the 7875 * recovery has checked that bit and skipped that 7876 * region. 
7877 */ 7878 if (mddev->bitmap) { 7879 mddev->pers->quiesce(mddev, 1); 7880 mddev->pers->quiesce(mddev, 0); 7881 } 7882 } 7883 7884 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 7885 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 7886 " %d KB/sec/disk.\n", speed_min(mddev)); 7887 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 7888 "(but not more than %d KB/sec) for %s.\n", 7889 speed_max(mddev), desc); 7890 7891 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 7892 7893 io_sectors = 0; 7894 for (m = 0; m < SYNC_MARKS; m++) { 7895 mark[m] = jiffies; 7896 mark_cnt[m] = io_sectors; 7897 } 7898 last_mark = 0; 7899 mddev->resync_mark = mark[last_mark]; 7900 mddev->resync_mark_cnt = mark_cnt[last_mark]; 7901 7902 /* 7903 * Tune reconstruction: 7904 */ 7905 window = 32*(PAGE_SIZE/512); 7906 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 7907 window/2, (unsigned long long)max_sectors/2); 7908 7909 atomic_set(&mddev->recovery_active, 0); 7910 last_check = 0; 7911 7912 if (j>2) { 7913 printk(KERN_INFO 7914 "md: resuming %s of %s from checkpoint.\n", 7915 desc, mdname(mddev)); 7916 mddev->curr_resync = j; 7917 } else 7918 mddev->curr_resync = 3; /* no longer delayed */ 7919 mddev->curr_resync_completed = j; 7920 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7921 md_new_event(mddev); 7922 update_time = jiffies; 7923 7924 blk_start_plug(&plug); 7925 while (j < max_sectors) { 7926 sector_t sectors; 7927 7928 skipped = 0; 7929 7930 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7931 ((mddev->curr_resync > mddev->curr_resync_completed && 7932 (mddev->curr_resync - mddev->curr_resync_completed) 7933 > (max_sectors >> 4)) || 7934 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 7935 (j - mddev->curr_resync_completed)*2 7936 >= mddev->resync_max - mddev->curr_resync_completed || 7937 mddev->curr_resync_completed > mddev->resync_max 7938 )) { 7939 /* time to update curr_resync_completed */ 7940 wait_event(mddev->recovery_wait, 7941 atomic_read(&mddev->recovery_active) == 0); 7942 mddev->curr_resync_completed = j; 7943 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 7944 j > mddev->recovery_cp) 7945 mddev->recovery_cp = j; 7946 update_time = jiffies; 7947 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7948 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7949 } 7950 7951 while (j >= mddev->resync_max && 7952 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7953 /* As this condition is controlled by user-space, 7954 * we can block indefinitely, so use '_interruptible' 7955 * to avoid triggering warnings. 7956 */ 7957 flush_signals(current); /* just in case */ 7958 wait_event_interruptible(mddev->recovery_wait, 7959 mddev->resync_max > j 7960 || test_bit(MD_RECOVERY_INTR, 7961 &mddev->recovery)); 7962 } 7963 7964 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7965 break; 7966 7967 sectors = mddev->pers->sync_request(mddev, j, &skipped); 7968 if (sectors == 0) { 7969 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7970 break; 7971 } 7972 7973 if (!skipped) { /* actual IO requested */ 7974 io_sectors += sectors; 7975 atomic_add(sectors, &mddev->recovery_active); 7976 } 7977 7978 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7979 break; 7980 7981 j += sectors; 7982 if (j > max_sectors) 7983 /* when skipping, extra large numbers can be returned. 
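* (a personality may skip a whole already-in-sync region in one
* call, so clamp j back to max_sectors)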
*/ 7984 j = max_sectors; 7985 if (j > 2) 7986 mddev->curr_resync = j; 7987 mddev->curr_mark_cnt = io_sectors; 7988 if (last_check == 0) 7989 /* this is the earliest that rebuild will be 7990 * visible in /proc/mdstat 7991 */ 7992 md_new_event(mddev); 7993 7994 if (last_check + window > io_sectors || j == max_sectors) 7995 continue; 7996 7997 last_check = io_sectors; 7998 repeat: 7999 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 8000 /* step marks */ 8001 int next = (last_mark+1) % SYNC_MARKS; 8002 8003 mddev->resync_mark = mark[next]; 8004 mddev->resync_mark_cnt = mark_cnt[next]; 8005 mark[next] = jiffies; 8006 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 8007 last_mark = next; 8008 } 8009 8010 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8011 break; 8012 8013 /* 8014 * this loop exits only if either when we are slower than 8015 * the 'hard' speed limit, or the system was IO-idle for 8016 * a jiffy. 8017 * the system might be non-idle CPU-wise, but we only care 8018 * about not overloading the IO subsystem. (things like an 8019 * e2fsck being done on the RAID array should execute fast) 8020 */ 8021 cond_resched(); 8022 8023 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 8024 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 8025 /((jiffies-mddev->resync_mark)/HZ +1) +1; 8026 8027 if (currspeed > speed_min(mddev)) { 8028 if (currspeed > speed_max(mddev)) { 8029 msleep(500); 8030 goto repeat; 8031 } 8032 if (!is_mddev_idle(mddev, 0)) { 8033 /* 8034 * Give other IO more of a chance. 8035 * The faster the devices, the less we wait. 8036 */ 8037 wait_event(mddev->recovery_wait, 8038 !atomic_read(&mddev->recovery_active)); 8039 } 8040 } 8041 } 8042 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, 8043 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8044 ? 
"interrupted" : "done"); 8045 /* 8046 * this also signals 'finished resyncing' to md_stop 8047 */ 8048 blk_finish_plug(&plug); 8049 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 8050 8051 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8052 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8053 mddev->curr_resync > 2) { 8054 mddev->curr_resync_completed = mddev->curr_resync; 8055 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8056 } 8057 /* tell personality and other nodes that we are finished */ 8058 if (mddev_is_clustered(mddev)) { 8059 md_cluster_ops->resync_finish(mddev); 8060 cluster_resync_finished = true; 8061 } 8062 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8063 8064 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 8065 mddev->curr_resync > 2) { 8066 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8067 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8068 if (mddev->curr_resync >= mddev->recovery_cp) { 8069 printk(KERN_INFO 8070 "md: checkpointing %s of %s.\n", 8071 desc, mdname(mddev)); 8072 if (test_bit(MD_RECOVERY_ERROR, 8073 &mddev->recovery)) 8074 mddev->recovery_cp = 8075 mddev->curr_resync_completed; 8076 else 8077 mddev->recovery_cp = 8078 mddev->curr_resync; 8079 } 8080 } else 8081 mddev->recovery_cp = MaxSector; 8082 } else { 8083 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8084 mddev->curr_resync = MaxSector; 8085 rcu_read_lock(); 8086 rdev_for_each_rcu(rdev, mddev) 8087 if (rdev->raid_disk >= 0 && 8088 mddev->delta_disks >= 0 && 8089 !test_bit(Journal, &rdev->flags) && 8090 !test_bit(Faulty, &rdev->flags) && 8091 !test_bit(In_sync, &rdev->flags) && 8092 rdev->recovery_offset < mddev->curr_resync) 8093 rdev->recovery_offset = mddev->curr_resync; 8094 rcu_read_unlock(); 8095 } 8096 } 8097 skip: 8098 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8099 8100 if (mddev_is_clustered(mddev) && 8101 test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8102 !cluster_resync_finished) 8103 md_cluster_ops->resync_finish(mddev); 8104 8105 spin_lock(&mddev->lock); 8106 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8107 /* We completed so min/max setting can be forgotten if used. 
*/ 8108 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8109 mddev->resync_min = 0; 8110 mddev->resync_max = MaxSector; 8111 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8112 mddev->resync_min = mddev->curr_resync_completed; 8113 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 8114 mddev->curr_resync = 0; 8115 spin_unlock(&mddev->lock); 8116 8117 wake_up(&resync_wait); 8118 md_wakeup_thread(mddev->thread); 8119 return; 8120 } 8121 EXPORT_SYMBOL_GPL(md_do_sync); 8122 8123 static int remove_and_add_spares(struct mddev *mddev, 8124 struct md_rdev *this) 8125 { 8126 struct md_rdev *rdev; 8127 int spares = 0; 8128 int removed = 0; 8129 8130 rdev_for_each(rdev, mddev) 8131 if ((this == NULL || rdev == this) && 8132 rdev->raid_disk >= 0 && 8133 !test_bit(Blocked, &rdev->flags) && 8134 (test_bit(Faulty, &rdev->flags) || 8135 (!test_bit(In_sync, &rdev->flags) && 8136 !test_bit(Journal, &rdev->flags))) && 8137 atomic_read(&rdev->nr_pending)==0) { 8138 if (mddev->pers->hot_remove_disk( 8139 mddev, rdev) == 0) { 8140 sysfs_unlink_rdev(mddev, rdev); 8141 rdev->raid_disk = -1; 8142 removed++; 8143 } 8144 } 8145 if (removed && mddev->kobj.sd) 8146 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8147 8148 if (this && removed) 8149 goto no_add; 8150 8151 rdev_for_each(rdev, mddev) { 8152 if (this && this != rdev) 8153 continue; 8154 if (test_bit(Candidate, &rdev->flags)) 8155 continue; 8156 if (rdev->raid_disk >= 0 && 8157 !test_bit(In_sync, &rdev->flags) && 8158 !test_bit(Journal, &rdev->flags) && 8159 !test_bit(Faulty, &rdev->flags)) 8160 spares++; 8161 if (rdev->raid_disk >= 0) 8162 continue; 8163 if (test_bit(Faulty, &rdev->flags)) 8164 continue; 8165 if (test_bit(Journal, &rdev->flags)) 8166 continue; 8167 if (mddev->ro && 8168 ! (rdev->saved_raid_disk >= 0 && 8169 !test_bit(Bitmap_sync, &rdev->flags))) 8170 continue; 8171 8172 rdev->recovery_offset = 0; 8173 if (mddev->pers-> 8174 hot_add_disk(mddev, rdev) == 0) { 8175 if (sysfs_link_rdev(mddev, rdev)) 8176 /* failure here is OK */; 8177 spares++; 8178 md_new_event(mddev); 8179 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8180 } 8181 } 8182 no_add: 8183 if (removed) 8184 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8185 return spares; 8186 } 8187 8188 static void md_start_sync(struct work_struct *ws) 8189 { 8190 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8191 int ret = 0; 8192 8193 if (mddev_is_clustered(mddev)) { 8194 ret = md_cluster_ops->resync_start(mddev); 8195 if (ret) { 8196 mddev->sync_thread = NULL; 8197 goto out; 8198 } 8199 } 8200 8201 mddev->sync_thread = md_register_thread(md_do_sync, 8202 mddev, 8203 "resync"); 8204 out: 8205 if (!mddev->sync_thread) { 8206 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) 8207 printk(KERN_ERR "%s: could not start resync" 8208 " thread...\n", 8209 mdname(mddev)); 8210 /* leave the spares where they are, it shouldn't hurt */ 8211 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8212 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8213 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8214 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8215 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8216 wake_up(&resync_wait); 8217 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8218 &mddev->recovery)) 8219 if (mddev->sysfs_action) 8220 sysfs_notify_dirent_safe(mddev->sysfs_action); 8221 } else 8222 md_wakeup_thread(mddev->sync_thread); 8223 sysfs_notify_dirent_safe(mddev->sysfs_action); 8224 md_new_event(mddev); 8225 } 8226 8227 /* 8228 * This routine is regularly called by all 
per-raid-array threads to 8229 * deal with generic issues like resync and super-block update. 8230 * Raid personalities that don't have a thread (linear/raid0) do not 8231 * need this as they never do any recovery or update the superblock. 8232 * 8233 * It does not do any resync itself, but rather "forks" off other threads 8234 * to do that as needed. 8235 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 8236 * "->recovery" and create a thread at ->sync_thread. 8237 * When the thread finishes it sets MD_RECOVERY_DONE 8238 * and wakes up this thread which will reap the thread and finish up. 8239 * This thread also removes any faulty devices (with nr_pending == 0). 8240 * 8241 * The overall approach is: 8242 * 1/ if the superblock needs updating, update it. 8243 * 2/ If a recovery thread is running, don't do anything else. 8244 * 3/ If recovery has finished, clean up, possibly marking spares active. 8245 * 4/ If there are any faulty devices, remove them. 8246 * 5/ If array is degraded, try to add spare devices 8247 * 6/ If array has spares or is not in-sync, start a resync thread. 8248 */ 8249 void md_check_recovery(struct mddev *mddev) 8250 { 8251 if (mddev->suspended) 8252 return; 8253 8254 if (mddev->bitmap) 8255 bitmap_daemon_work(mddev); 8256 8257 if (signal_pending(current)) { 8258 if (mddev->pers->sync_request && !mddev->external) { 8259 printk(KERN_INFO "md: %s in immediate safe mode\n", 8260 mdname(mddev)); 8261 mddev->safemode = 2; 8262 } 8263 flush_signals(current); 8264 } 8265 8266 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 8267 return; 8268 if ( ! ( 8269 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || 8270 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8271 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8272 (mddev->external == 0 && mddev->safemode == 1) || 8273 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 8274 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 8275 )) 8276 return; 8277 8278 if (mddev_trylock(mddev)) { 8279 int spares = 0; 8280 8281 if (mddev->ro) { 8282 struct md_rdev *rdev; 8283 if (!mddev->external && mddev->in_sync) 8284 /* 'Blocked' flag not needed as failed devices 8285 * will be recorded if array switched to read/write. 8286 * Leaving it set will prevent the device 8287 * from being removed. 8288 */ 8289 rdev_for_each(rdev, mddev) 8290 clear_bit(Blocked, &rdev->flags); 8291 /* On a read-only array we can: 8292 * - remove failed devices 8293 * - add already-in_sync devices if the array itself 8294 * is in-sync. 8295 * As we only add devices that are already in-sync, 8296 * we can activate the spares immediately.
8297 */ 8298 remove_and_add_spares(mddev, NULL); 8299 /* There is no thread, but we need to call 8300 * ->spare_active and clear saved_raid_disk 8301 */ 8302 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8303 md_reap_sync_thread(mddev); 8304 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8305 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8306 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 8307 goto unlock; 8308 } 8309 8310 if (!mddev->external) { 8311 int did_change = 0; 8312 spin_lock(&mddev->lock); 8313 if (mddev->safemode && 8314 !atomic_read(&mddev->writes_pending) && 8315 !mddev->in_sync && 8316 mddev->recovery_cp == MaxSector) { 8317 mddev->in_sync = 1; 8318 did_change = 1; 8319 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 8320 } 8321 if (mddev->safemode == 1) 8322 mddev->safemode = 0; 8323 spin_unlock(&mddev->lock); 8324 if (did_change) 8325 sysfs_notify_dirent_safe(mddev->sysfs_state); 8326 } 8327 8328 if (mddev->flags & MD_UPDATE_SB_FLAGS) 8329 md_update_sb(mddev, 0); 8330 8331 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8332 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8333 /* resync/recovery still happening */ 8334 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8335 goto unlock; 8336 } 8337 if (mddev->sync_thread) { 8338 md_reap_sync_thread(mddev); 8339 goto unlock; 8340 } 8341 /* Set RUNNING before clearing NEEDED to avoid 8342 * any transients in the value of "sync_action". 8343 */ 8344 mddev->curr_resync_completed = 0; 8345 spin_lock(&mddev->lock); 8346 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8347 spin_unlock(&mddev->lock); 8348 /* Clear some bits that don't mean anything, but 8349 * might be left set 8350 */ 8351 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 8352 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8353 8354 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8355 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 8356 goto not_running; 8357 /* no recovery is running. 8358 * remove any failed drives, then 8359 * add spares if possible. 8360 * Spares are also removed and re-added, to allow 8361 * the personality to fail the re-add. 8362 */ 8363 8364 if (mddev->reshape_position != MaxSector) { 8365 if (mddev->pers->check_reshape == NULL || 8366 mddev->pers->check_reshape(mddev) != 0) 8367 /* Cannot proceed */ 8368 goto not_running; 8369 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8370 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8371 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 8372 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8373 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8374 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8375 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8376 } else if (mddev->recovery_cp < MaxSector) { 8377 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8378 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8379 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 8380 /* nothing to be done ... */ 8381 goto not_running; 8382 8383 if (mddev->pers->sync_request) { 8384 if (spares) { 8385 /* We are adding a device or devices to an array 8386 * which has the bitmap stored on all devices. 
8387 * So make sure all bitmap pages get written 8388 */ 8389 bitmap_write_all(mddev->bitmap); 8390 } 8391 INIT_WORK(&mddev->del_work, md_start_sync); 8392 queue_work(md_misc_wq, &mddev->del_work); 8393 goto unlock; 8394 } 8395 not_running: 8396 if (!mddev->sync_thread) { 8397 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8398 wake_up(&resync_wait); 8399 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8400 &mddev->recovery)) 8401 if (mddev->sysfs_action) 8402 sysfs_notify_dirent_safe(mddev->sysfs_action); 8403 } 8404 unlock: 8405 wake_up(&mddev->sb_wait); 8406 mddev_unlock(mddev); 8407 } 8408 } 8409 EXPORT_SYMBOL(md_check_recovery); 8410 8411 void md_reap_sync_thread(struct mddev *mddev) 8412 { 8413 struct md_rdev *rdev; 8414 8415 /* resync has finished, collect result */ 8416 md_unregister_thread(&mddev->sync_thread); 8417 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8418 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8419 /* success...*/ 8420 /* activate any spares */ 8421 if (mddev->pers->spare_active(mddev)) { 8422 sysfs_notify(&mddev->kobj, NULL, 8423 "degraded"); 8424 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8425 } 8426 } 8427 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8428 mddev->pers->finish_reshape) 8429 mddev->pers->finish_reshape(mddev); 8430 8431 /* If array is no longer degraded, then any saved_raid_disk 8432 * information must be scrapped. 8433 */ 8434 if (!mddev->degraded) 8435 rdev_for_each(rdev, mddev) 8436 rdev->saved_raid_disk = -1; 8437 8438 md_update_sb(mddev, 1); 8439 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8440 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8441 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8442 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8443 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8444 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8445 wake_up(&resync_wait); 8446 /* flag recovery needed just to double check */ 8447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8448 sysfs_notify_dirent_safe(mddev->sysfs_action); 8449 md_new_event(mddev); 8450 if (mddev->event_work.func) 8451 queue_work(md_misc_wq, &mddev->event_work); 8452 } 8453 EXPORT_SYMBOL(md_reap_sync_thread); 8454 8455 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 8456 { 8457 sysfs_notify_dirent_safe(rdev->sysfs_state); 8458 wait_event_timeout(rdev->blocked_wait, 8459 !test_bit(Blocked, &rdev->flags) && 8460 !test_bit(BlockedBadBlocks, &rdev->flags), 8461 msecs_to_jiffies(5000)); 8462 rdev_dec_pending(rdev, mddev); 8463 } 8464 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 8465 8466 void md_finish_reshape(struct mddev *mddev) 8467 { 8468 /* called by the personality module when reshape completes. */ 8469 struct md_rdev *rdev; 8470 8471 rdev_for_each(rdev, mddev) { 8472 if (rdev->data_offset > rdev->new_data_offset) 8473 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 8474 else 8475 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 8476 rdev->data_offset = rdev->new_data_offset; 8477 } 8478 } 8479 EXPORT_SYMBOL(md_finish_reshape); 8480 8481 /* Bad block management. 8482 * We can record which blocks on each device are 'bad' and so just 8483 * fail those blocks, or that stripe, rather than the whole device. 8484 * Entries in the bad-block table are 64 bits wide.
This comprises: 8485 * Length of bad-range, in sectors: 0-511 for lengths 1-512 8486 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) 8487 * A 'shift' can be set so that larger blocks are tracked and 8488 * consequently larger devices can be covered. 8489 * 'Acknowledged' flag - 1 bit - the most significant bit. 8490 * 8491 * Locking of the bad-block table uses a seqlock so md_is_badblock 8492 * might need to retry if it is very unlucky. 8493 * We will sometimes want to check for bad blocks in a bi_end_io function, 8494 * so we use the write_seqlock_irq variant. 8495 * 8496 * When looking for a bad block we specify a range and want to 8497 * know if any block in the range is bad. So we binary-search 8498 * to the last range that starts at-or-before the given endpoint, 8499 * (or "before the sector after the target range") 8500 * then see if it ends after the given start. 8501 * We return 8502 * 0 if there are no known bad blocks in the range 8503 * 1 if there are known bad blocks which are all acknowledged 8504 * -1 if there are bad blocks which have not yet been acknowledged in metadata. 8505 * plus the start/length of the first bad section we overlap. 8506 */ 8507 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 8508 sector_t *first_bad, int *bad_sectors) 8509 { 8510 int hi; 8511 int lo; 8512 u64 *p = bb->page; 8513 int rv; 8514 sector_t target = s + sectors; 8515 unsigned seq; 8516 8517 if (bb->shift > 0) { 8518 /* round the start down, and the end up */ 8519 s >>= bb->shift; 8520 target += (1<<bb->shift) - 1; 8521 target >>= bb->shift; 8522 sectors = target - s; 8523 } 8524 /* 'target' is now the first block after the bad range */ 8525 8526 retry: 8527 seq = read_seqbegin(&bb->lock); 8528 lo = 0; 8529 rv = 0; 8530 hi = bb->count; 8531 8532 /* Binary search between lo and hi for 'target' 8533 * i.e. for the last range that starts before 'target' 8534 */ 8535 /* INVARIANT: ranges before 'lo' and at-or-after 'hi' 8536 * are known not to be the last range before target. 8537 * VARIANT: hi-lo is the number of possible 8538 * ranges, and decreases until it reaches 1 8539 */ 8540 while (hi - lo > 1) { 8541 int mid = (lo + hi) / 2; 8542 sector_t a = BB_OFFSET(p[mid]); 8543 if (a < target) 8544 /* This could still be the one, earlier ranges 8545 * could not. */ 8546 lo = mid; 8547 else 8548 /* This and later ranges are definitely out. */ 8549 hi = mid; 8550 } 8551 /* 'lo' might be the last that started before target, but 'hi' isn't */ 8552 if (hi > lo) { 8553 /* need to check all ranges that end after 's' to see if 8554 * any are unacknowledged. 8555 */ 8556 while (lo >= 0 && 8557 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8558 if (BB_OFFSET(p[lo]) < target) { 8559 /* starts before the end, and finishes after 8560 * the start, so they must overlap 8561 */ 8562 if (rv != -1 && BB_ACK(p[lo])) 8563 rv = 1; 8564 else 8565 rv = -1; 8566 *first_bad = BB_OFFSET(p[lo]); 8567 *bad_sectors = BB_LEN(p[lo]); 8568 } 8569 lo--; 8570 } 8571 } 8572 8573 if (read_seqretry(&bb->lock, seq)) 8574 goto retry; 8575 8576 return rv; 8577 } 8578 EXPORT_SYMBOL_GPL(md_is_badblock); 8579 8580 /* 8581 * Add a range of bad blocks to the table. 8582 * This might extend the table, or might contract it 8583 * if two adjacent ranges can be merged. 8584 * We binary-search to find the 'insertion' point, then 8585 * decide how best to handle it.
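 *
 * As a worked example of the 64-bit encoding described above (assuming the
 * BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK helpers pack the fields exactly as that
 * layout implies: bit 63 = acknowledged, bits 62..9 = start sector,
 * bits 8..0 = length-1), an acknowledged 8-sector bad range starting at
 * sector 1000 is stored as
 *   (1ULL << 63) | (1000ULL << 9) | (8 - 1) == 0x800000000007d007
 * from which BB_OFFSET() recovers 1000, BB_LEN() recovers 8 and BB_ACK()
 * returns 1.
 *
 * A hypothetical merge example for the code below: with a single entry
 * covering sectors 100-107, a call with s=104, sectors=8 falls into the
 * "merge with a previous range" case, the entry is rewritten with
 * BB_MAKE(100, 12, ack) to cover sectors 100-111, and no new entry is
 * added.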
8586 */ 8587 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 8588 int acknowledged) 8589 { 8590 u64 *p; 8591 int lo, hi; 8592 int rv = 1; 8593 unsigned long flags; 8594 8595 if (bb->shift < 0) 8596 /* badblocks are disabled */ 8597 return 0; 8598 8599 if (bb->shift) { 8600 /* round the start down, and the end up */ 8601 sector_t next = s + sectors; 8602 s >>= bb->shift; 8603 next += (1<<bb->shift) - 1; 8604 next >>= bb->shift; 8605 sectors = next - s; 8606 } 8607 8608 write_seqlock_irqsave(&bb->lock, flags); 8609 8610 p = bb->page; 8611 lo = 0; 8612 hi = bb->count; 8613 /* Find the last range that starts at-or-before 's' */ 8614 while (hi - lo > 1) { 8615 int mid = (lo + hi) / 2; 8616 sector_t a = BB_OFFSET(p[mid]); 8617 if (a <= s) 8618 lo = mid; 8619 else 8620 hi = mid; 8621 } 8622 if (hi > lo && BB_OFFSET(p[lo]) > s) 8623 hi = lo; 8624 8625 if (hi > lo) { 8626 /* we found a range that might merge with the start 8627 * of our new range 8628 */ 8629 sector_t a = BB_OFFSET(p[lo]); 8630 sector_t e = a + BB_LEN(p[lo]); 8631 int ack = BB_ACK(p[lo]); 8632 if (e >= s) { 8633 /* Yes, we can merge with a previous range */ 8634 if (s == a && s + sectors >= e) 8635 /* new range covers old */ 8636 ack = acknowledged; 8637 else 8638 ack = ack && acknowledged; 8639 8640 if (e < s + sectors) 8641 e = s + sectors; 8642 if (e - a <= BB_MAX_LEN) { 8643 p[lo] = BB_MAKE(a, e-a, ack); 8644 s = e; 8645 } else { 8646 /* does not all fit in one range, 8647 * make p[lo] maximal 8648 */ 8649 if (BB_LEN(p[lo]) != BB_MAX_LEN) 8650 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); 8651 s = a + BB_MAX_LEN; 8652 } 8653 sectors = e - s; 8654 } 8655 } 8656 if (sectors && hi < bb->count) { 8657 /* 'hi' points to the first range that starts after 's'. 8658 * Maybe we can merge with the start of that range */ 8659 sector_t a = BB_OFFSET(p[hi]); 8660 sector_t e = a + BB_LEN(p[hi]); 8661 int ack = BB_ACK(p[hi]); 8662 if (a <= s + sectors) { 8663 /* merging is possible */ 8664 if (e <= s + sectors) { 8665 /* full overlap */ 8666 e = s + sectors; 8667 ack = acknowledged; 8668 } else 8669 ack = ack && acknowledged; 8670 8671 a = s; 8672 if (e - a <= BB_MAX_LEN) { 8673 p[hi] = BB_MAKE(a, e-a, ack); 8674 s = e; 8675 } else { 8676 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); 8677 s = a + BB_MAX_LEN; 8678 } 8679 sectors = e - s; 8680 lo = hi; 8681 hi++; 8682 } 8683 } 8684 if (sectors == 0 && hi < bb->count) { 8685 /* we might be able to combine lo and hi */ 8686 /* Note: 's' is at the end of 'lo' */ 8687 sector_t a = BB_OFFSET(p[hi]); 8688 int lolen = BB_LEN(p[lo]); 8689 int hilen = BB_LEN(p[hi]); 8690 int newlen = lolen + hilen - (s - a); 8691 if (s >= a && newlen < BB_MAX_LEN) { 8692 /* yes, we can combine them */ 8693 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); 8694 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); 8695 memmove(p + hi, p + hi + 1, 8696 (bb->count - hi - 1) * 8); 8697 bb->count--; 8698 } 8699 } 8700 while (sectors) { 8701 /* didn't merge (it all). 
8702 * Need to add a range just before 'hi' */ 8703 if (bb->count >= MD_MAX_BADBLOCKS) { 8704 /* No room for more */ 8705 rv = 0; 8706 break; 8707 } else { 8708 int this_sectors = sectors; 8709 memmove(p + hi + 1, p + hi, 8710 (bb->count - hi) * 8); 8711 bb->count++; 8712 8713 if (this_sectors > BB_MAX_LEN) 8714 this_sectors = BB_MAX_LEN; 8715 p[hi] = BB_MAKE(s, this_sectors, acknowledged); 8716 sectors -= this_sectors; 8717 s += this_sectors; 8718 } 8719 } 8720 8721 bb->changed = 1; 8722 if (!acknowledged) 8723 bb->unacked_exist = 1; 8724 write_sequnlock_irqrestore(&bb->lock, flags); 8725 8726 return rv; 8727 } 8728 8729 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8730 int is_new) 8731 { 8732 int rv; 8733 if (is_new) 8734 s += rdev->new_data_offset; 8735 else 8736 s += rdev->data_offset; 8737 rv = md_set_badblocks(&rdev->badblocks, 8738 s, sectors, 0); 8739 if (rv) { 8740 /* Make sure they get written out promptly */ 8741 sysfs_notify_dirent_safe(rdev->sysfs_state); 8742 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8743 set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); 8744 md_wakeup_thread(rdev->mddev->thread); 8745 } 8746 return rv; 8747 } 8748 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8749 8750 /* 8751 * Remove a range of bad blocks from the table. 8752 * This may involve extending the table if we split a region, 8753 * but it must not fail. So if the table becomes full, we just 8754 * drop the remove request. 8755 */ 8756 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) 8757 { 8758 u64 *p; 8759 int lo, hi; 8760 sector_t target = s + sectors; 8761 int rv = 0; 8762 8763 if (bb->shift > 0) { 8764 /* When clearing we round the start up and the end down. 8765 * This should not matter as the shift should align with 8766 * the block size and no rounding should ever be needed. 8767 * However it is better to think a block is bad when it 8768 * isn't than to think a block is not bad when it is. 8769 */ 8770 s += (1<<bb->shift) - 1; 8771 s >>= bb->shift; 8772 target >>= bb->shift; 8773 sectors = target - s; 8774 } 8775 8776 write_seqlock_irq(&bb->lock); 8777 8778 p = bb->page; 8779 lo = 0; 8780 hi = bb->count; 8781 /* Find the last range that starts before 'target' */ 8782 while (hi - lo > 1) { 8783 int mid = (lo + hi) / 2; 8784 sector_t a = BB_OFFSET(p[mid]); 8785 if (a < target) 8786 lo = mid; 8787 else 8788 hi = mid; 8789 } 8790 if (hi > lo) { 8791 /* p[lo] is the last range that could overlap the 8792 * current range. Earlier ranges could also overlap, 8793 * but only this one can overlap the end of the range. 8794 */ 8795 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { 8796 /* Partial overlap, leave the tail of this range */ 8797 int ack = BB_ACK(p[lo]); 8798 sector_t a = BB_OFFSET(p[lo]); 8799 sector_t end = a + BB_LEN(p[lo]); 8800 8801 if (a < s) { 8802 /* we need to split this range */ 8803 if (bb->count >= MD_MAX_BADBLOCKS) { 8804 rv = -ENOSPC; 8805 goto out; 8806 } 8807 memmove(p+lo+1, p+lo, (bb->count - lo) * 8); 8808 bb->count++; 8809 p[lo] = BB_MAKE(a, s-a, ack); 8810 lo++; 8811 } 8812 p[lo] = BB_MAKE(target, end - target, ack); 8813 /* there is no longer an overlap */ 8814 hi = lo; 8815 lo--; 8816 } 8817 while (lo >= 0 && 8818 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8819 /* This range does overlap */ 8820 if (BB_OFFSET(p[lo]) < s) { 8821 /* Keep the early parts of this range. */ 8822 int ack = BB_ACK(p[lo]); 8823 sector_t start = BB_OFFSET(p[lo]); 8824 p[lo] = BB_MAKE(start, s - start, ack); 8825 /* now lo doesn't overlap, so..
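 *
 * (A hypothetical worked example for this function: with one bad range
 * covering sectors 100-199, clearing s=120, sectors=20 takes the
 * partial-overlap branch above, splits the entry into a head covering
 * 100-119 and a tail covering 140-199, and leaves nothing in between for
 * the discard step below.)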
*/ 8826 break; 8827 } 8828 lo--; 8829 } 8830 /* 'lo' is strictly before, 'hi' is strictly after, 8831 * anything between needs to be discarded 8832 */ 8833 if (hi - lo > 1) { 8834 memmove(p+lo+1, p+hi, (bb->count - hi) * 8); 8835 bb->count -= (hi - lo - 1); 8836 } 8837 } 8838 8839 bb->changed = 1; 8840 out: 8841 write_sequnlock_irq(&bb->lock); 8842 return rv; 8843 } 8844 8845 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8846 int is_new) 8847 { 8848 if (is_new) 8849 s += rdev->new_data_offset; 8850 else 8851 s += rdev->data_offset; 8852 return md_clear_badblocks(&rdev->badblocks, 8853 s, sectors); 8854 } 8855 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8856 8857 /* 8858 * Acknowledge all bad blocks in a list. 8859 * This only succeeds if ->changed is clear. It is used by 8860 * in-kernel metadata updates 8861 */ 8862 void md_ack_all_badblocks(struct badblocks *bb) 8863 { 8864 if (bb->page == NULL || bb->changed) 8865 /* no point even trying */ 8866 return; 8867 write_seqlock_irq(&bb->lock); 8868 8869 if (bb->changed == 0 && bb->unacked_exist) { 8870 u64 *p = bb->page; 8871 int i; 8872 for (i = 0; i < bb->count ; i++) { 8873 if (!BB_ACK(p[i])) { 8874 sector_t start = BB_OFFSET(p[i]); 8875 int len = BB_LEN(p[i]); 8876 p[i] = BB_MAKE(start, len, 1); 8877 } 8878 } 8879 bb->unacked_exist = 0; 8880 } 8881 write_sequnlock_irq(&bb->lock); 8882 } 8883 EXPORT_SYMBOL_GPL(md_ack_all_badblocks); 8884 8885 /* sysfs access to bad-blocks list. 8886 * We present two files. 8887 * 'bad-blocks' lists sector numbers and lengths of ranges that 8888 * are recorded as bad. The list is truncated to fit within 8889 * the one-page limit of sysfs. 8890 * Writing "sector length" to this file adds an acknowledged 8891 * bad block list. 8892 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 8893 * been acknowledged. Writing to this file adds bad blocks 8894 * without acknowledging them. This is largely for testing. 8895 */ 8896 8897 static ssize_t 8898 badblocks_show(struct badblocks *bb, char *page, int unack) 8899 { 8900 size_t len; 8901 int i; 8902 u64 *p = bb->page; 8903 unsigned seq; 8904 8905 if (bb->shift < 0) 8906 return 0; 8907 8908 retry: 8909 seq = read_seqbegin(&bb->lock); 8910 8911 len = 0; 8912 i = 0; 8913 8914 while (len < PAGE_SIZE && i < bb->count) { 8915 sector_t s = BB_OFFSET(p[i]); 8916 unsigned int length = BB_LEN(p[i]); 8917 int ack = BB_ACK(p[i]); 8918 i++; 8919 8920 if (unack && ack) 8921 continue; 8922 8923 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", 8924 (unsigned long long)s << bb->shift, 8925 length << bb->shift); 8926 } 8927 if (unack && len == 0) 8928 bb->unacked_exist = 0; 8929 8930 if (read_seqretry(&bb->lock, seq)) 8931 goto retry; 8932 8933 return len; 8934 } 8935 8936 #define DO_DEBUG 1 8937 8938 static ssize_t 8939 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) 8940 { 8941 unsigned long long sector; 8942 int length; 8943 char newline; 8944 #ifdef DO_DEBUG 8945 /* Allow clearing via sysfs *only* for testing/debugging. 
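 * (Hypothetical usage: writing "2097152 8", with or without a trailing
 * newline as accepted by the sscanf() below, records an 8-sector bad
 * range starting at sector 2097152; it is stored acknowledged when
 * reached via the 'bad-blocks' file (unack == 0) and unacknowledged via
 * 'unacknowledged-bad-blocks'.  With DO_DEBUG defined, "-2097152 8"
 * clears that range again.)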
8946 * Normally only a successful write may clear a badblock 8947 */ 8948 int clear = 0; 8949 if (page[0] == '-') { 8950 clear = 1; 8951 page++; 8952 } 8953 #endif /* DO_DEBUG */ 8954 8955 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { 8956 case 3: 8957 if (newline != '\n') 8958 return -EINVAL; 8959 case 2: 8960 if (length <= 0) 8961 return -EINVAL; 8962 break; 8963 default: 8964 return -EINVAL; 8965 } 8966 8967 #ifdef DO_DEBUG 8968 if (clear) { 8969 md_clear_badblocks(bb, sector, length); 8970 return len; 8971 } 8972 #endif /* DO_DEBUG */ 8973 if (md_set_badblocks(bb, sector, length, !unack)) 8974 return len; 8975 else 8976 return -ENOSPC; 8977 } 8978 8979 static int md_notify_reboot(struct notifier_block *this, 8980 unsigned long code, void *x) 8981 { 8982 struct list_head *tmp; 8983 struct mddev *mddev; 8984 int need_delay = 0; 8985 8986 for_each_mddev(mddev, tmp) { 8987 if (mddev_trylock(mddev)) { 8988 if (mddev->pers) 8989 __md_stop_writes(mddev); 8990 if (mddev->persistent) 8991 mddev->safemode = 2; 8992 mddev_unlock(mddev); 8993 } 8994 need_delay = 1; 8995 } 8996 /* 8997 * certain more exotic SCSI devices are known to be 8998 * volatile wrt too early system reboots. While the 8999 * right place to handle this issue is the given 9000 * driver, we do want to have a safe RAID driver ... 9001 */ 9002 if (need_delay) 9003 mdelay(1000*1); 9004 9005 return NOTIFY_DONE; 9006 } 9007 9008 static struct notifier_block md_notifier = { 9009 .notifier_call = md_notify_reboot, 9010 .next = NULL, 9011 .priority = INT_MAX, /* before any real devices */ 9012 }; 9013 9014 static void md_geninit(void) 9015 { 9016 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9017 9018 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 9019 } 9020 9021 static int __init md_init(void) 9022 { 9023 int ret = -ENOMEM; 9024 9025 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9026 if (!md_wq) 9027 goto err_wq; 9028 9029 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9030 if (!md_misc_wq) 9031 goto err_misc_wq; 9032 9033 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 9034 goto err_md; 9035 9036 if ((ret = register_blkdev(0, "mdp")) < 0) 9037 goto err_mdp; 9038 mdp_major = ret; 9039 9040 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, 9041 md_probe, NULL, NULL); 9042 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 9043 md_probe, NULL, NULL); 9044 9045 register_reboot_notifier(&md_notifier); 9046 raid_table_header = register_sysctl_table(raid_root_table); 9047 9048 md_geninit(); 9049 return 0; 9050 9051 err_mdp: 9052 unregister_blkdev(MD_MAJOR, "md"); 9053 err_md: 9054 destroy_workqueue(md_misc_wq); 9055 err_misc_wq: 9056 destroy_workqueue(md_wq); 9057 err_wq: 9058 return ret; 9059 } 9060 9061 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9062 { 9063 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9064 struct md_rdev *rdev2; 9065 int role, ret; 9066 char b[BDEVNAME_SIZE]; 9067 9068 /* Check for change of roles in the active devices */ 9069 rdev_for_each(rdev2, mddev) { 9070 if (test_bit(Faulty, &rdev2->flags)) 9071 continue; 9072 9073 /* Check if the roles changed */ 9074 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9075 9076 if (test_bit(Candidate, &rdev2->flags)) { 9077 if (role == 0xfffe) { 9078 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); 9079 md_kick_rdev_from_array(rdev2); 9080 continue; 9081 } 9082 else 9083 clear_bit(Candidate, &rdev2->flags); 9084 } 9085
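/* 'role' comes from the on-disk dev_roles[] table of the v1.x superblock;
 * the special values checked above and below follow that format's
 * convention (0xffff: spare with no active slot, 0xfffe: faulty,
 * 0xfffd: journal device).
 */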
9086 if (role != rdev2->raid_disk) { 9087 /* got activated */ 9088 if (rdev2->raid_disk == -1 && role != 0xffff) { 9089 rdev2->saved_raid_disk = role; 9090 ret = remove_and_add_spares(mddev, rdev2); 9091 pr_info("Activated spare: %s\n", 9092 bdevname(rdev2->bdev,b)); 9093 continue; 9094 } 9095 /* device faulty 9096 * We just want to do the minimum to mark the disk 9097 * as faulty. The recovery is performed by the 9098 * one who initiated the error. 9099 */ 9100 if ((role == 0xfffe) || (role == 0xfffd)) { 9101 md_error(mddev, rdev2); 9102 clear_bit(Blocked, &rdev2->flags); 9103 } 9104 } 9105 } 9106 9107 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 9108 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9109 9110 /* Finally set the event to be up to date */ 9111 mddev->events = le64_to_cpu(sb->events); 9112 } 9113 9114 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9115 { 9116 int err; 9117 struct page *swapout = rdev->sb_page; 9118 struct mdp_superblock_1 *sb; 9119 9120 /* Store the sb page of the rdev in the swapout temporary 9121 * variable in case we err in the future 9122 */ 9123 rdev->sb_page = NULL; 9124 alloc_disk_sb(rdev); 9125 ClearPageUptodate(rdev->sb_page); 9126 rdev->sb_loaded = 0; 9127 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); 9128 9129 if (err < 0) { 9130 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 9131 __func__, __LINE__, rdev->desc_nr, err); 9132 put_page(rdev->sb_page); 9133 rdev->sb_page = swapout; 9134 rdev->sb_loaded = 1; 9135 return err; 9136 } 9137 9138 sb = page_address(rdev->sb_page); 9139 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 9140 * is not set 9141 */ 9142 9143 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 9144 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 9145 9146 /* The other node finished recovery, call spare_active to set 9147 * device In_sync and mddev->degraded 9148 */ 9149 if (rdev->recovery_offset == MaxSector && 9150 !test_bit(In_sync, &rdev->flags) && 9151 mddev->pers->spare_active(mddev)) 9152 sysfs_notify(&mddev->kobj, NULL, "degraded"); 9153 9154 put_page(swapout); 9155 return 0; 9156 } 9157 9158 void md_reload_sb(struct mddev *mddev, int nr) 9159 { 9160 struct md_rdev *rdev; 9161 int err; 9162 9163 /* Find the rdev */ 9164 rdev_for_each_rcu(rdev, mddev) { 9165 if (rdev->desc_nr == nr) 9166 break; 9167 } 9168 9169 if (!rdev || rdev->desc_nr != nr) { 9170 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 9171 return; 9172 } 9173 9174 err = read_rdev(mddev, rdev); 9175 if (err < 0) 9176 return; 9177 9178 check_sb_changes(mddev, rdev); 9179 9180 /* Read all rdev's to update recovery_offset */ 9181 rdev_for_each_rcu(rdev, mddev) 9182 read_rdev(mddev, rdev); 9183 } 9184 EXPORT_SYMBOL(md_reload_sb); 9185 9186 #ifndef MODULE 9187 9188 /* 9189 * Searches all registered partitions for autorun RAID arrays 9190 * at boot time. 
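 *
 * Devices reach the list below through md_autodetect_dev(), which the
 * partition-scanning code is expected to call for partitions flagged for
 * RAID autodetection (traditionally MS-DOS partition type 0xfd) when md
 * is built in; autostart_arrays() then imports each queued device and
 * hands the resulting pending_raid_disks list to autorun_devices().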
9191 */ 9192 9193 static LIST_HEAD(all_detected_devices); 9194 struct detected_devices_node { 9195 struct list_head list; 9196 dev_t dev; 9197 }; 9198 9199 void md_autodetect_dev(dev_t dev) 9200 { 9201 struct detected_devices_node *node_detected_dev; 9202 9203 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 9204 if (node_detected_dev) { 9205 node_detected_dev->dev = dev; 9206 list_add_tail(&node_detected_dev->list, &all_detected_devices); 9207 } else { 9208 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 9209 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 9210 } 9211 } 9212 9213 static void autostart_arrays(int part) 9214 { 9215 struct md_rdev *rdev; 9216 struct detected_devices_node *node_detected_dev; 9217 dev_t dev; 9218 int i_scanned, i_passed; 9219 9220 i_scanned = 0; 9221 i_passed = 0; 9222 9223 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 9224 9225 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 9226 i_scanned++; 9227 node_detected_dev = list_entry(all_detected_devices.next, 9228 struct detected_devices_node, list); 9229 list_del(&node_detected_dev->list); 9230 dev = node_detected_dev->dev; 9231 kfree(node_detected_dev); 9232 rdev = md_import_device(dev,0, 90); 9233 if (IS_ERR(rdev)) 9234 continue; 9235 9236 if (test_bit(Faulty, &rdev->flags)) 9237 continue; 9238 9239 set_bit(AutoDetected, &rdev->flags); 9240 list_add(&rdev->same_set, &pending_raid_disks); 9241 i_passed++; 9242 } 9243 9244 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 9245 i_scanned, i_passed); 9246 9247 autorun_devices(part); 9248 } 9249 9250 #endif /* !MODULE */ 9251 9252 static __exit void md_exit(void) 9253 { 9254 struct mddev *mddev; 9255 struct list_head *tmp; 9256 int delay = 1; 9257 9258 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 9259 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 9260 9261 unregister_blkdev(MD_MAJOR,"md"); 9262 unregister_blkdev(mdp_major, "mdp"); 9263 unregister_reboot_notifier(&md_notifier); 9264 unregister_sysctl_table(raid_table_header); 9265 9266 /* We cannot unload the modules while some process is 9267 * waiting for us in select() or poll() - wake them up 9268 */ 9269 md_unloading = 1; 9270 while (waitqueue_active(&md_event_waiters)) { 9271 /* not safe to leave yet */ 9272 wake_up(&md_event_waiters); 9273 msleep(delay); 9274 delay += delay; 9275 } 9276 remove_proc_entry("mdstat", NULL); 9277 9278 for_each_mddev(mddev, tmp) { 9279 export_array(mddev); 9280 mddev->hold_active = 0; 9281 } 9282 destroy_workqueue(md_misc_wq); 9283 destroy_workqueue(md_wq); 9284 } 9285 9286 subsys_initcall(md_init); 9287 module_exit(md_exit) 9288 9289 static int get_ro(char *buffer, struct kernel_param *kp) 9290 { 9291 return sprintf(buffer, "%d", start_readonly); 9292 } 9293 static int set_ro(const char *val, struct kernel_param *kp) 9294 { 9295 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 9296 } 9297 9298 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9299 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9300 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9301 9302 MODULE_LICENSE("GPL"); 9303 MODULE_DESCRIPTION("MD RAID framework"); 9304 MODULE_ALIAS("md"); 9305 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9306
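/* Module parameters registered above: 'start_ro' starts newly assembled
 * arrays in auto-read-only mode until they are first written, writing to
 * 'new_array' creates a named array via add_named_array(), and
 * 'start_dirty_degraded' permits starting arrays that are both dirty and
 * degraded.  An illustrative invocation, assuming the driver is built as
 * the usual md_mod module:
 *
 *	modprobe md_mod start_ro=1
 *
 * after which the values remain visible and adjustable under
 * /sys/module/md_mod/parameters/.
 */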