/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for very interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.
*/

#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

/* autostart_arrays() is only declared when md is built into the kernel. */
#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/* Exported hooks for the cluster module; both start out NULL here. */
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/*
 * Workqueues used in this file: md_wq runs the per-array flush work
 * (mddev->flush_work), md_misc_wq runs the delayed mddev deletion
 * (mddev->del_work).
 */
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
109 * Increase it if you want to have more _guaranteed_ speed. Note that 110 * the RAID driver will use the maximum available bandwidth if the IO 111 * subsystem is idle. There is also an 'absolute maximum' reconstruction 112 * speed limit - in case reconstruction slows down your system despite 113 * idle IO detection. 114 * 115 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 116 * or /sys/block/mdX/md/sync_speed_{min,max} 117 */ 118 119 static int sysctl_speed_limit_min = 1000; 120 static int sysctl_speed_limit_max = 200000; 121 static inline int speed_min(struct mddev *mddev) 122 { 123 return mddev->sync_speed_min ? 124 mddev->sync_speed_min : sysctl_speed_limit_min; 125 } 126 127 static inline int speed_max(struct mddev *mddev) 128 { 129 return mddev->sync_speed_max ? 130 mddev->sync_speed_max : sysctl_speed_limit_max; 131 } 132 133 static struct ctl_table_header *raid_table_header; 134 135 static struct ctl_table raid_table[] = { 136 { 137 .procname = "speed_limit_min", 138 .data = &sysctl_speed_limit_min, 139 .maxlen = sizeof(int), 140 .mode = S_IRUGO|S_IWUSR, 141 .proc_handler = proc_dointvec, 142 }, 143 { 144 .procname = "speed_limit_max", 145 .data = &sysctl_speed_limit_max, 146 .maxlen = sizeof(int), 147 .mode = S_IRUGO|S_IWUSR, 148 .proc_handler = proc_dointvec, 149 }, 150 { } 151 }; 152 153 static struct ctl_table raid_dir_table[] = { 154 { 155 .procname = "raid", 156 .maxlen = 0, 157 .mode = S_IRUGO|S_IXUGO, 158 .child = raid_table, 159 }, 160 { } 161 }; 162 163 static struct ctl_table raid_root_table[] = { 164 { 165 .procname = "dev", 166 .maxlen = 0, 167 .mode = 0555, 168 .child = raid_dir_table, 169 }, 170 { } 171 }; 172 173 static const struct block_device_operations md_fops; 174 175 static int start_readonly; 176 177 /* 178 * The original mechanism for creating an md device is to create 179 * a device node in /dev and to open it. This causes races with device-close. 
180 * The preferred method is to write to the "new_array" module parameter. 181 * This can avoid races. 182 * Setting create_on_open to false disables the original mechanism 183 * so all the races disappear. 184 */ 185 static bool create_on_open = true; 186 187 /* bio_clone_mddev 188 * like bio_clone_bioset, but with a local bio set 189 */ 190 191 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 192 struct mddev *mddev) 193 { 194 struct bio *b; 195 196 if (!mddev || !mddev->bio_set) 197 return bio_alloc(gfp_mask, nr_iovecs); 198 199 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); 200 if (!b) 201 return NULL; 202 return b; 203 } 204 EXPORT_SYMBOL_GPL(bio_alloc_mddev); 205 206 static struct bio *md_bio_alloc_sync(struct mddev *mddev) 207 { 208 if (!mddev || !mddev->sync_set) 209 return bio_alloc(GFP_NOIO, 1); 210 211 return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); 212 } 213 214 /* 215 * We have a system wide 'event count' that is incremented 216 * on any 'interesting' event, and readers of /proc/mdstat 217 * can use 'poll' or 'select' to find out when the event 218 * count increases. 219 * 220 * Events are: 221 * start array, stop array, error, add device, remove device, 222 * start build, activate spare 223 */ 224 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 225 static atomic_t md_event_count; 226 void md_new_event(struct mddev *mddev) 227 { 228 atomic_inc(&md_event_count); 229 wake_up(&md_event_waiters); 230 } 231 EXPORT_SYMBOL_GPL(md_new_event); 232 233 /* 234 * Enables to iterate over all existing md arrays 235 * all_mddevs_lock protects this list. 236 */ 237 static LIST_HEAD(all_mddevs); 238 static DEFINE_SPINLOCK(all_mddevs_lock); 239 240 /* 241 * iterates through all used mddevs in the system. 242 * We take care to grab the all_mddevs_lock whenever navigating 243 * the list, and to always hold a refcount when unlocked. 
* Any code which breaks out of this loop still owns
 * a reference to the current mddev and must mddev_put it.
 */
/*
 * Usage: _mddev is the struct mddev * cursor, _tmp a struct list_head *
 * scratch variable.  Each step takes a reference on the next entry before
 * dropping all_mddevs_lock, then puts the reference on the previous entry,
 * so the loop body runs unlocked with a reference held on _mddev.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
268 */ 269 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) 270 { 271 const int rw = bio_data_dir(bio); 272 struct mddev *mddev = q->queuedata; 273 unsigned int sectors; 274 int cpu; 275 276 blk_queue_split(q, &bio); 277 278 if (mddev == NULL || mddev->pers == NULL) { 279 bio_io_error(bio); 280 return BLK_QC_T_NONE; 281 } 282 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 283 if (bio_sectors(bio) != 0) 284 bio->bi_status = BLK_STS_IOERR; 285 bio_endio(bio); 286 return BLK_QC_T_NONE; 287 } 288 check_suspended: 289 rcu_read_lock(); 290 if (mddev->suspended) { 291 DEFINE_WAIT(__wait); 292 for (;;) { 293 prepare_to_wait(&mddev->sb_wait, &__wait, 294 TASK_UNINTERRUPTIBLE); 295 if (!mddev->suspended) 296 break; 297 rcu_read_unlock(); 298 schedule(); 299 rcu_read_lock(); 300 } 301 finish_wait(&mddev->sb_wait, &__wait); 302 } 303 atomic_inc(&mddev->active_io); 304 rcu_read_unlock(); 305 306 /* 307 * save the sectors now since our bio can 308 * go away inside make_request 309 */ 310 sectors = bio_sectors(bio); 311 /* bio could be mergeable after passing to underlayer */ 312 bio->bi_opf &= ~REQ_NOMERGE; 313 if (!mddev->pers->make_request(mddev, bio)) { 314 atomic_dec(&mddev->active_io); 315 wake_up(&mddev->sb_wait); 316 goto check_suspended; 317 } 318 319 cpu = part_stat_lock(); 320 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 321 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); 322 part_stat_unlock(); 323 324 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 325 wake_up(&mddev->sb_wait); 326 327 return BLK_QC_T_NONE; 328 } 329 330 /* mddev_suspend makes sure no new requests are submitted 331 * to the device, and that any requests that have been submitted 332 * are completely handled. 333 * Once mddev_detach() is called and completes, the module will be 334 * completely unused. 
335 */ 336 void mddev_suspend(struct mddev *mddev) 337 { 338 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); 339 if (mddev->suspended++) 340 return; 341 synchronize_rcu(); 342 wake_up(&mddev->sb_wait); 343 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 344 mddev->pers->quiesce(mddev, 1); 345 346 del_timer_sync(&mddev->safemode_timer); 347 } 348 EXPORT_SYMBOL_GPL(mddev_suspend); 349 350 void mddev_resume(struct mddev *mddev) 351 { 352 if (--mddev->suspended) 353 return; 354 wake_up(&mddev->sb_wait); 355 mddev->pers->quiesce(mddev, 0); 356 357 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 358 md_wakeup_thread(mddev->thread); 359 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 360 } 361 EXPORT_SYMBOL_GPL(mddev_resume); 362 363 int mddev_congested(struct mddev *mddev, int bits) 364 { 365 struct md_personality *pers = mddev->pers; 366 int ret = 0; 367 368 rcu_read_lock(); 369 if (mddev->suspended) 370 ret = 1; 371 else if (pers && pers->congested) 372 ret = pers->congested(mddev, bits); 373 rcu_read_unlock(); 374 return ret; 375 } 376 EXPORT_SYMBOL_GPL(mddev_congested); 377 static int md_congested(void *data, int bits) 378 { 379 struct mddev *mddev = data; 380 return mddev_congested(mddev, bits); 381 } 382 383 /* 384 * Generic flush handling for md 385 */ 386 387 static void md_end_flush(struct bio *bio) 388 { 389 struct md_rdev *rdev = bio->bi_private; 390 struct mddev *mddev = rdev->mddev; 391 392 rdev_dec_pending(rdev, mddev); 393 394 if (atomic_dec_and_test(&mddev->flush_pending)) { 395 /* The pre-request flush has finished */ 396 queue_work(md_wq, &mddev->flush_work); 397 } 398 bio_put(bio); 399 } 400 401 static void md_submit_flush_data(struct work_struct *ws); 402 403 static void submit_flushes(struct work_struct *ws) 404 { 405 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 406 struct md_rdev *rdev; 407 408 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 409 
atomic_set(&mddev->flush_pending, 1); 410 rcu_read_lock(); 411 rdev_for_each_rcu(rdev, mddev) 412 if (rdev->raid_disk >= 0 && 413 !test_bit(Faulty, &rdev->flags)) { 414 /* Take two references, one is dropped 415 * when request finishes, one after 416 * we reclaim rcu_read_lock 417 */ 418 struct bio *bi; 419 atomic_inc(&rdev->nr_pending); 420 atomic_inc(&rdev->nr_pending); 421 rcu_read_unlock(); 422 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 423 bi->bi_end_io = md_end_flush; 424 bi->bi_private = rdev; 425 bi->bi_bdev = rdev->bdev; 426 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 427 atomic_inc(&mddev->flush_pending); 428 submit_bio(bi); 429 rcu_read_lock(); 430 rdev_dec_pending(rdev, mddev); 431 } 432 rcu_read_unlock(); 433 if (atomic_dec_and_test(&mddev->flush_pending)) 434 queue_work(md_wq, &mddev->flush_work); 435 } 436 437 static void md_submit_flush_data(struct work_struct *ws) 438 { 439 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 440 struct bio *bio = mddev->flush_bio; 441 442 if (bio->bi_iter.bi_size == 0) 443 /* an empty barrier - all done */ 444 bio_endio(bio); 445 else { 446 bio->bi_opf &= ~REQ_PREFLUSH; 447 mddev->pers->make_request(mddev, bio); 448 } 449 450 mddev->flush_bio = NULL; 451 wake_up(&mddev->sb_wait); 452 } 453 454 void md_flush_request(struct mddev *mddev, struct bio *bio) 455 { 456 spin_lock_irq(&mddev->lock); 457 wait_event_lock_irq(mddev->sb_wait, 458 !mddev->flush_bio, 459 mddev->lock); 460 mddev->flush_bio = bio; 461 spin_unlock_irq(&mddev->lock); 462 463 INIT_WORK(&mddev->flush_work, submit_flushes); 464 queue_work(md_wq, &mddev->flush_work); 465 } 466 EXPORT_SYMBOL(md_flush_request); 467 468 static inline struct mddev *mddev_get(struct mddev *mddev) 469 { 470 atomic_inc(&mddev->active); 471 return mddev; 472 } 473 474 static void mddev_delayed_delete(struct work_struct *ws); 475 476 static void mddev_put(struct mddev *mddev) 477 { 478 struct bio_set *bs = NULL, *sync_bs = NULL; 479 480 if 
(!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 481 return; 482 if (!mddev->raid_disks && list_empty(&mddev->disks) && 483 mddev->ctime == 0 && !mddev->hold_active) { 484 /* Array is not configured at all, and not held active, 485 * so destroy it */ 486 list_del_init(&mddev->all_mddevs); 487 bs = mddev->bio_set; 488 sync_bs = mddev->sync_set; 489 mddev->bio_set = NULL; 490 mddev->sync_set = NULL; 491 if (mddev->gendisk) { 492 /* We did a probe so need to clean up. Call 493 * queue_work inside the spinlock so that 494 * flush_workqueue() after mddev_find will 495 * succeed in waiting for the work to be done. 496 */ 497 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 498 queue_work(md_misc_wq, &mddev->del_work); 499 } else 500 kfree(mddev); 501 } 502 spin_unlock(&all_mddevs_lock); 503 if (bs) 504 bioset_free(bs); 505 if (sync_bs) 506 bioset_free(sync_bs); 507 } 508 509 static void md_safemode_timeout(unsigned long data); 510 511 void mddev_init(struct mddev *mddev) 512 { 513 mutex_init(&mddev->open_mutex); 514 mutex_init(&mddev->reconfig_mutex); 515 mutex_init(&mddev->bitmap_info.mutex); 516 INIT_LIST_HEAD(&mddev->disks); 517 INIT_LIST_HEAD(&mddev->all_mddevs); 518 setup_timer(&mddev->safemode_timer, md_safemode_timeout, 519 (unsigned long) mddev); 520 atomic_set(&mddev->active, 1); 521 atomic_set(&mddev->openers, 0); 522 atomic_set(&mddev->active_io, 0); 523 spin_lock_init(&mddev->lock); 524 atomic_set(&mddev->flush_pending, 0); 525 init_waitqueue_head(&mddev->sb_wait); 526 init_waitqueue_head(&mddev->recovery_wait); 527 mddev->reshape_position = MaxSector; 528 mddev->reshape_backwards = 0; 529 mddev->last_sync_action = "none"; 530 mddev->resync_min = 0; 531 mddev->resync_max = MaxSector; 532 mddev->level = LEVEL_NONE; 533 } 534 EXPORT_SYMBOL_GPL(mddev_init); 535 536 static struct mddev *mddev_find(dev_t unit) 537 { 538 struct mddev *mddev, *new = NULL; 539 540 if (unit && MAJOR(unit) != MD_MAJOR) 541 unit &= ~((1<<MdpMinorShift)-1); 542 543 retry: 544 
spin_lock(&all_mddevs_lock); 545 546 if (unit) { 547 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 548 if (mddev->unit == unit) { 549 mddev_get(mddev); 550 spin_unlock(&all_mddevs_lock); 551 kfree(new); 552 return mddev; 553 } 554 555 if (new) { 556 list_add(&new->all_mddevs, &all_mddevs); 557 spin_unlock(&all_mddevs_lock); 558 new->hold_active = UNTIL_IOCTL; 559 return new; 560 } 561 } else if (new) { 562 /* find an unused unit number */ 563 static int next_minor = 512; 564 int start = next_minor; 565 int is_free = 0; 566 int dev = 0; 567 while (!is_free) { 568 dev = MKDEV(MD_MAJOR, next_minor); 569 next_minor++; 570 if (next_minor > MINORMASK) 571 next_minor = 0; 572 if (next_minor == start) { 573 /* Oh dear, all in use. */ 574 spin_unlock(&all_mddevs_lock); 575 kfree(new); 576 return NULL; 577 } 578 579 is_free = 1; 580 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 581 if (mddev->unit == dev) { 582 is_free = 0; 583 break; 584 } 585 } 586 new->unit = dev; 587 new->md_minor = MINOR(dev); 588 new->hold_active = UNTIL_STOP; 589 list_add(&new->all_mddevs, &all_mddevs); 590 spin_unlock(&all_mddevs_lock); 591 return new; 592 } 593 spin_unlock(&all_mddevs_lock); 594 595 new = kzalloc(sizeof(*new), GFP_KERNEL); 596 if (!new) 597 return NULL; 598 599 new->unit = unit; 600 if (MAJOR(unit) == MD_MAJOR) 601 new->md_minor = MINOR(unit); 602 else 603 new->md_minor = MINOR(unit) >> MdpMinorShift; 604 605 mddev_init(new); 606 607 goto retry; 608 } 609 610 static struct attribute_group md_redundancy_group; 611 612 void mddev_unlock(struct mddev *mddev) 613 { 614 if (mddev->to_remove) { 615 /* These cannot be removed under reconfig_mutex as 616 * an access to the files will try to take reconfig_mutex 617 * while holding the file unremovable, which leads to 618 * a deadlock. 
619 * So hold set sysfs_active while the remove in happeing, 620 * and anything else which might set ->to_remove or my 621 * otherwise change the sysfs namespace will fail with 622 * -EBUSY if sysfs_active is still set. 623 * We set sysfs_active under reconfig_mutex and elsewhere 624 * test it under the same mutex to ensure its correct value 625 * is seen. 626 */ 627 struct attribute_group *to_remove = mddev->to_remove; 628 mddev->to_remove = NULL; 629 mddev->sysfs_active = 1; 630 mutex_unlock(&mddev->reconfig_mutex); 631 632 if (mddev->kobj.sd) { 633 if (to_remove != &md_redundancy_group) 634 sysfs_remove_group(&mddev->kobj, to_remove); 635 if (mddev->pers == NULL || 636 mddev->pers->sync_request == NULL) { 637 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 638 if (mddev->sysfs_action) 639 sysfs_put(mddev->sysfs_action); 640 mddev->sysfs_action = NULL; 641 } 642 } 643 mddev->sysfs_active = 0; 644 } else 645 mutex_unlock(&mddev->reconfig_mutex); 646 647 /* As we've dropped the mutex we need a spinlock to 648 * make sure the thread doesn't disappear 649 */ 650 spin_lock(&pers_lock); 651 md_wakeup_thread(mddev->thread); 652 spin_unlock(&pers_lock); 653 } 654 EXPORT_SYMBOL_GPL(mddev_unlock); 655 656 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 657 { 658 struct md_rdev *rdev; 659 660 rdev_for_each_rcu(rdev, mddev) 661 if (rdev->desc_nr == nr) 662 return rdev; 663 664 return NULL; 665 } 666 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 667 668 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 669 { 670 struct md_rdev *rdev; 671 672 rdev_for_each(rdev, mddev) 673 if (rdev->bdev->bd_dev == dev) 674 return rdev; 675 676 return NULL; 677 } 678 679 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) 680 { 681 struct md_rdev *rdev; 682 683 rdev_for_each_rcu(rdev, mddev) 684 if (rdev->bdev->bd_dev == dev) 685 return rdev; 686 687 return NULL; 688 } 689 690 static struct md_personality *find_pers(int level, char *clevel) 
691 { 692 struct md_personality *pers; 693 list_for_each_entry(pers, &pers_list, list) { 694 if (level != LEVEL_NONE && pers->level == level) 695 return pers; 696 if (strcmp(pers->name, clevel)==0) 697 return pers; 698 } 699 return NULL; 700 } 701 702 /* return the offset of the super block in 512byte sectors */ 703 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 704 { 705 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 706 return MD_NEW_SIZE_SECTORS(num_sectors); 707 } 708 709 static int alloc_disk_sb(struct md_rdev *rdev) 710 { 711 rdev->sb_page = alloc_page(GFP_KERNEL); 712 if (!rdev->sb_page) 713 return -ENOMEM; 714 return 0; 715 } 716 717 void md_rdev_clear(struct md_rdev *rdev) 718 { 719 if (rdev->sb_page) { 720 put_page(rdev->sb_page); 721 rdev->sb_loaded = 0; 722 rdev->sb_page = NULL; 723 rdev->sb_start = 0; 724 rdev->sectors = 0; 725 } 726 if (rdev->bb_page) { 727 put_page(rdev->bb_page); 728 rdev->bb_page = NULL; 729 } 730 badblocks_exit(&rdev->badblocks); 731 } 732 EXPORT_SYMBOL_GPL(md_rdev_clear); 733 734 static void super_written(struct bio *bio) 735 { 736 struct md_rdev *rdev = bio->bi_private; 737 struct mddev *mddev = rdev->mddev; 738 739 if (bio->bi_status) { 740 pr_err("md: super_written gets error=%d\n", bio->bi_status); 741 md_error(mddev, rdev); 742 if (!test_bit(Faulty, &rdev->flags) 743 && (bio->bi_opf & MD_FAILFAST)) { 744 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 745 set_bit(LastDev, &rdev->flags); 746 } 747 } else 748 clear_bit(LastDev, &rdev->flags); 749 750 if (atomic_dec_and_test(&mddev->pending_writes)) 751 wake_up(&mddev->sb_wait); 752 rdev_dec_pending(rdev, mddev); 753 bio_put(bio); 754 } 755 756 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 757 sector_t sector, int size, struct page *page) 758 { 759 /* write first size bytes of page to sector of rdev 760 * Increment mddev->pending_writes before returning 761 * and decrement it on completion, waking up sb_wait 762 * if zero is 
reached. 763 * If an error occurred, call md_error 764 */ 765 struct bio *bio; 766 int ff = 0; 767 768 if (test_bit(Faulty, &rdev->flags)) 769 return; 770 771 bio = md_bio_alloc_sync(mddev); 772 773 atomic_inc(&rdev->nr_pending); 774 775 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 776 bio->bi_iter.bi_sector = sector; 777 bio_add_page(bio, page, size, 0); 778 bio->bi_private = rdev; 779 bio->bi_end_io = super_written; 780 781 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 782 test_bit(FailFast, &rdev->flags) && 783 !test_bit(LastDev, &rdev->flags)) 784 ff = MD_FAILFAST; 785 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff; 786 787 atomic_inc(&mddev->pending_writes); 788 submit_bio(bio); 789 } 790 791 int md_super_wait(struct mddev *mddev) 792 { 793 /* wait for all superblock writes that were scheduled to complete */ 794 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 795 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 796 return -EAGAIN; 797 return 0; 798 } 799 800 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 801 struct page *page, int op, int op_flags, bool metadata_op) 802 { 803 struct bio *bio = md_bio_alloc_sync(rdev->mddev); 804 int ret; 805 806 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
807 rdev->meta_bdev : rdev->bdev; 808 bio_set_op_attrs(bio, op, op_flags); 809 if (metadata_op) 810 bio->bi_iter.bi_sector = sector + rdev->sb_start; 811 else if (rdev->mddev->reshape_position != MaxSector && 812 (rdev->mddev->reshape_backwards == 813 (sector >= rdev->mddev->reshape_position))) 814 bio->bi_iter.bi_sector = sector + rdev->new_data_offset; 815 else 816 bio->bi_iter.bi_sector = sector + rdev->data_offset; 817 bio_add_page(bio, page, size, 0); 818 819 submit_bio_wait(bio); 820 821 ret = !bio->bi_status; 822 bio_put(bio); 823 return ret; 824 } 825 EXPORT_SYMBOL_GPL(sync_page_io); 826 827 static int read_disk_sb(struct md_rdev *rdev, int size) 828 { 829 char b[BDEVNAME_SIZE]; 830 831 if (rdev->sb_loaded) 832 return 0; 833 834 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) 835 goto fail; 836 rdev->sb_loaded = 1; 837 return 0; 838 839 fail: 840 pr_err("md: disabled device %s, could not read superblock.\n", 841 bdevname(rdev->bdev,b)); 842 return -EINVAL; 843 } 844 845 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 846 { 847 return sb1->set_uuid0 == sb2->set_uuid0 && 848 sb1->set_uuid1 == sb2->set_uuid1 && 849 sb1->set_uuid2 == sb2->set_uuid2 && 850 sb1->set_uuid3 == sb2->set_uuid3; 851 } 852 853 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 854 { 855 int ret; 856 mdp_super_t *tmp1, *tmp2; 857 858 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 859 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 860 861 if (!tmp1 || !tmp2) { 862 ret = 0; 863 goto abort; 864 } 865 866 *tmp1 = *sb1; 867 *tmp2 = *sb2; 868 869 /* 870 * nr_disks is not constant 871 */ 872 tmp1->nr_disks = 0; 873 tmp2->nr_disks = 0; 874 875 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 876 abort: 877 kfree(tmp1); 878 kfree(tmp2); 879 return ret; 880 } 881 882 static u32 md_csum_fold(u32 csum) 883 { 884 csum = (csum & 0xffff) + (csum >> 16); 885 return (csum & 0xffff) + (csum >> 16); 886 } 887 888 static unsigned int 
calc_sb_csum(mdp_super_t *sb) 889 { 890 u64 newcsum = 0; 891 u32 *sb32 = (u32*)sb; 892 int i; 893 unsigned int disk_csum, csum; 894 895 disk_csum = sb->sb_csum; 896 sb->sb_csum = 0; 897 898 for (i = 0; i < MD_SB_BYTES/4 ; i++) 899 newcsum += sb32[i]; 900 csum = (newcsum & 0xffffffff) + (newcsum>>32); 901 902 #ifdef CONFIG_ALPHA 903 /* This used to use csum_partial, which was wrong for several 904 * reasons including that different results are returned on 905 * different architectures. It isn't critical that we get exactly 906 * the same return value as before (we always csum_fold before 907 * testing, and that removes any differences). However as we 908 * know that csum_partial always returned a 16bit value on 909 * alphas, do a fold to maximise conformity to previous behaviour. 910 */ 911 sb->sb_csum = md_csum_fold(disk_csum); 912 #else 913 sb->sb_csum = disk_csum; 914 #endif 915 return csum; 916 } 917 918 /* 919 * Handle superblock details. 920 * We want to be able to handle multiple superblock formats 921 * so we have a common interface to them all, and an array of 922 * different handlers. 923 * We rely on user-space to write the initial superblock, and support 924 * reading and updating of superblocks. 925 * Interface methods are: 926 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 927 * loads and validates a superblock on dev. 928 * if refdev != NULL, compare superblocks on both devices 929 * Return: 930 * 0 - dev has a superblock that is compatible with refdev 931 * 1 - dev has a superblock that is compatible and newer than refdev 932 * so dev should be used as the refdev in future 933 * -EINVAL superblock incompatible or invalid 934 * -othererror e.g. -EIO 935 * 936 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 937 * Verify that dev is acceptable into mddev. 938 * The first time, mddev->raid_disks will be 0, and data from 939 * dev should be merged in. Subsequent calls check that dev 940 * is new enough. 
Return 0 or -EINVAL 941 * 942 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 943 * Update the superblock for rdev with data in mddev 944 * This does not write to disc. 945 * 946 */ 947 948 struct super_type { 949 char *name; 950 struct module *owner; 951 int (*load_super)(struct md_rdev *rdev, 952 struct md_rdev *refdev, 953 int minor_version); 954 int (*validate_super)(struct mddev *mddev, 955 struct md_rdev *rdev); 956 void (*sync_super)(struct mddev *mddev, 957 struct md_rdev *rdev); 958 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 959 sector_t num_sectors); 960 int (*allow_new_offset)(struct md_rdev *rdev, 961 unsigned long long new_offset); 962 }; 963 964 /* 965 * Check that the given mddev has no bitmap. 966 * 967 * This function is called from the run method of all personalities that do not 968 * support bitmaps. It prints an error message and returns non-zero if mddev 969 * has a bitmap. Otherwise, it returns 0. 970 * 971 */ 972 int md_check_no_bitmap(struct mddev *mddev) 973 { 974 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 975 return 0; 976 pr_warn("%s: bitmaps are not supported for %s\n", 977 mdname(mddev), mddev->pers->name); 978 return 1; 979 } 980 EXPORT_SYMBOL(md_check_no_bitmap); 981 982 /* 983 * load_super for 0.90.0 984 */ 985 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 986 { 987 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 988 mdp_super_t *sb; 989 int ret; 990 991 /* 992 * Calculate the position of the superblock (512byte sectors), 993 * it's at the end of the disk. 994 * 995 * It also happens to be a multiple of 4Kb. 
996 */ 997 rdev->sb_start = calc_dev_sboffset(rdev); 998 999 ret = read_disk_sb(rdev, MD_SB_BYTES); 1000 if (ret) 1001 return ret; 1002 1003 ret = -EINVAL; 1004 1005 bdevname(rdev->bdev, b); 1006 sb = page_address(rdev->sb_page); 1007 1008 if (sb->md_magic != MD_SB_MAGIC) { 1009 pr_warn("md: invalid raid superblock magic on %s\n", b); 1010 goto abort; 1011 } 1012 1013 if (sb->major_version != 0 || 1014 sb->minor_version < 90 || 1015 sb->minor_version > 91) { 1016 pr_warn("Bad version number %d.%d on %s\n", 1017 sb->major_version, sb->minor_version, b); 1018 goto abort; 1019 } 1020 1021 if (sb->raid_disks <= 0) 1022 goto abort; 1023 1024 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1025 pr_warn("md: invalid superblock checksum on %s\n", b); 1026 goto abort; 1027 } 1028 1029 rdev->preferred_minor = sb->md_minor; 1030 rdev->data_offset = 0; 1031 rdev->new_data_offset = 0; 1032 rdev->sb_size = MD_SB_BYTES; 1033 rdev->badblocks.shift = -1; 1034 1035 if (sb->level == LEVEL_MULTIPATH) 1036 rdev->desc_nr = -1; 1037 else 1038 rdev->desc_nr = sb->this_disk.number; 1039 1040 if (!refdev) { 1041 ret = 1; 1042 } else { 1043 __u64 ev1, ev2; 1044 mdp_super_t *refsb = page_address(refdev->sb_page); 1045 if (!md_uuid_equal(refsb, sb)) { 1046 pr_warn("md: %s has different UUID to %s\n", 1047 b, bdevname(refdev->bdev,b2)); 1048 goto abort; 1049 } 1050 if (!md_sb_equal(refsb, sb)) { 1051 pr_warn("md: %s has same UUID but different superblock to %s\n", 1052 b, bdevname(refdev->bdev, b2)); 1053 goto abort; 1054 } 1055 ev1 = md_event(sb); 1056 ev2 = md_event(refsb); 1057 if (ev1 > ev2) 1058 ret = 1; 1059 else 1060 ret = 0; 1061 } 1062 rdev->sectors = rdev->sb_start; 1063 /* Limit to 4TB as metadata cannot record more than that. 
* (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
	    sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 *
 * Interpret the 0.90 superblock held in rdev->sb_page.  rdev->raid_disk and
 * the Faulty/In_sync/Bitmap_sync/WriteMostly flags are reset first.
 *
 * When mddev->raid_disks is still 0 the array-wide fields of @mddev are
 * filled in from the superblock.  Otherwise only the superblock's event
 * counter is compared with mddev->events; which comparison is made depends
 * on whether a personality is set (mddev->pers) and whether a bitmap exists.
 *
 * Returns 0, or -EINVAL when, with mddev->pers == NULL, a device whose
 * descriptor is marked active or in-sync has an event count more than one
 * behind mddev->events.  A return of 0 with raid_disk left at -1 is how a
 * stale device is accepted without being given a role.
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* nothing recorded in mddev yet: take everything from this sb */
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		/* superblock stores bytes, mddev stores 512-byte sectors */
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		/* sb->size is doubled to get sectors (4TB == 2^32 KB, see above) */
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		/* minor version 91 carries the reshape fields */
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			/* the stored checkpoint is only used when it was
			 * written at the current event count
			 */
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		/* this device's own descriptor in the superblock's disk table */
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 *
 * Rebuild the 0.90 superblock in rdev->sb_page from the current state of
 * @mddev and its member devices, then recompute sb_csum.  The page is
 * zeroed first, so every field is regenerated.
 *
 * Side effects visible below besides the superblock itself: rdev->sb_size is
 * set to MD_SB_BYTES, every member's desc_nr is reassigned, and
 * mddev->minor_version is set to 90 or 91 depending on whether
 * mddev->reshape_position is MaxSector.
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	/* times are clamped into the range 0..U32_MAX before being stored */
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* record the checkpoint together with the event count it
		 * belongs to
		 */
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		/* active devices use their raid_disk as descriptor number,
		 * everything else is numbered after them
		 */
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	/* checksum is computed last, over the finished superblock */
	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1355 { 1356 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1357 return 0; /* component must fit device */ 1358 if (rdev->mddev->bitmap_info.offset) 1359 return 0; /* can't move bitmap */ 1360 rdev->sb_start = calc_dev_sboffset(rdev); 1361 if (!num_sectors || num_sectors > rdev->sb_start) 1362 num_sectors = rdev->sb_start; 1363 /* Limit to 4TB as metadata cannot record more than that. 1364 * 4TB == 2^32 KB, or 2*2^32 sectors. 1365 */ 1366 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && 1367 rdev->mddev->level >= 1) 1368 num_sectors = (sector_t)(2ULL << 32) - 2; 1369 do { 1370 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1371 rdev->sb_page); 1372 } while (md_super_wait(rdev->mddev) < 0); 1373 return num_sectors; 1374 } 1375 1376 static int 1377 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1378 { 1379 /* non-zero offset changes not possible with v0.90 */ 1380 return new_offset == 0; 1381 } 1382 1383 /* 1384 * version 1 superblock 1385 */ 1386 1387 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1388 { 1389 __le32 disk_csum; 1390 u32 csum; 1391 unsigned long long newcsum; 1392 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1393 __le32 *isuper = (__le32*)sb; 1394 1395 disk_csum = sb->sb_csum; 1396 sb->sb_csum = 0; 1397 newcsum = 0; 1398 for (; size >= 4; size -= 4) 1399 newcsum += le32_to_cpu(*isuper++); 1400 1401 if (size == 2) 1402 newcsum += le16_to_cpu(*(__le16*) isuper); 1403 1404 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1405 sb->sb_csum = disk_csum; 1406 return cpu_to_le32(csum); 1407 } 1408 1409 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1410 { 1411 struct mdp_superblock_1 *sb; 1412 int ret; 1413 sector_t sb_start; 1414 sector_t sectors; 1415 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1416 int bmask; 1417 1418 /* 1419 * Calculate the position of the 
superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	/* reject anything that is not a v1 superblock located where we
	 * looked for it, or that carries feature bits outside MD_FEATURE_ALL
	 */
	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	/* pad3[0] == 0 plus "pad3 equals itself shifted by one element"
	 * means every element of pad3 is zero
	 */
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	/* new_offset is applied as a signed delta, and only while a reshape
	 * with a new offset is flagged
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	/* superblock size, rounded up to the queue's logical block size */
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	/* for minor versions 1 and 2 the data must start beyond the
	 * superblock
	 */
	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		/* NOTE(review): this 'sectors' shadows the function-level one */
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, 0, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		/* each entry is 8 bytes: low 10 bits are a length, the rest
		 * a start; an all-ones entry ends the list
		 */
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
		/* ppl.offset is signed and relative to the superblock start */
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	/* ret becomes 1 when there is no reference device or this device's
	 * event count is higher than the reference's, else 0
	 */
	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %s has strangely different superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* space available for data: after data_offset when the superblock
	 * precedes the data, otherwise everything before the superblock
	 */
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

/*
 * validate_super for version-1 superblocks.
 *
 * Interpret the superblock held in rdev->sb_page.  rdev->raid_disk and the
 * Faulty/In_sync/Bitmap_sync/WriteMostly flags are reset first.
 *
 * When mddev->raid_disks is still 0 the array-wide fields of @mddev are
 * filled in from the superblock.  Otherwise only the event counter is
 * compared with mddev->events.  The device's role is then taken from
 * sb->dev_roles[rdev->desc_nr].
 *
 * Returns 0 on success (possibly leaving raid_disk at -1), or -EINVAL when
 * PPL is combined with a bitmap offset or journal, when an active or journal
 * device is too old during assembly, or when a journal role is present
 * without the journal feature.
 */
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* nothing recorded in mddev yet: take everything from this sb */
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			/* with delta_disks == 0 the direction comes from an
			 * explicit feature bit
			 */
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
			/* PPL is rejected together with a bitmap offset or a
			 * journal
			 */
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		/* a desc_nr outside the role table is treated as a spare */
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			/* any other value is the slot number in the array */
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

/*
 * sync_super for version-1 superblocks.
 *
 * Update the superblock in rdev->sb_page from @mddev and @rdev.  Unlike the
 * 0.90 variant the page is not zeroed: feature_map, pad0, recovery_offset
 * and pad3 are reset explicitly and the remaining fields are overwritten
 * one by one.  The bad-block list is copied into rdev->bb_page when it has
 * changed, dev_roles[] is rewritten for every member device, rdev->sb_size
 * grows if max_dev grows, and sb_csum is recomputed last.
 */
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	/* a device with a slot that is neither a journal nor in sync is
	 * still recovering: record how far it has got
	 */
	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			/* stored as the difference from data_offset */
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

			/* copy the list under the seqlock read side and start
			 * over if read_seqretry() reports a change meanwhile
			 */
retry:
			seq = read_seqbegin(&bb->lock);

			/* unused entries are left as all-ones */
			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	/* the role table must cover the highest desc_nr in use */
	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	/* start with every slot marked spare, then fill in the real roles */
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

/*
 * rdev_size_change for version-1 superblocks.
 *
 * Try to set the usable size of @rdev to @num_sectors (0 means "as large as
 * possible"), update data_size/super_offset and the checksum in the
 * superblock, and write it out.
 *
 * Returns the resulting size in sectors, or 0 if the change is refused:
 * the request is smaller than mddev->dev_sectors, a data-offset change is
 * pending, or the superblock follows the data and a bitmap offset is set.
 */
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		/* same placement rule as case 0 in super_1_load() */
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		/* the data area can grow by as much as the superblock moves */
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = cpu_to_le64(rdev->sb_start);
	sb->sb_csum = calc_sb_1_csum(sb);
	/* write the superblock out, repeating while the wait reports < 0 */
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new
>= old have been done */ 1945 struct bitmap *bitmap; 1946 if (new_offset >= rdev->data_offset) 1947 return 1; 1948 1949 /* with 1.0 metadata, there is no metadata to tread on 1950 * so we can always move back */ 1951 if (rdev->mddev->minor_version == 0) 1952 return 1; 1953 1954 /* otherwise we must be sure not to step on 1955 * any metadata, so stay: 1956 * 36K beyond start of superblock 1957 * beyond end of badblocks 1958 * beyond write-intent bitmap 1959 */ 1960 if (rdev->sb_start + (32+4)*2 > new_offset) 1961 return 0; 1962 bitmap = rdev->mddev->bitmap; 1963 if (bitmap && !rdev->mddev->bitmap_info.file && 1964 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1965 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1966 return 0; 1967 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1968 return 0; 1969 1970 return 1; 1971 } 1972 1973 static struct super_type super_types[] = { 1974 [0] = { 1975 .name = "0.90.0", 1976 .owner = THIS_MODULE, 1977 .load_super = super_90_load, 1978 .validate_super = super_90_validate, 1979 .sync_super = super_90_sync, 1980 .rdev_size_change = super_90_rdev_size_change, 1981 .allow_new_offset = super_90_allow_new_offset, 1982 }, 1983 [1] = { 1984 .name = "md-1", 1985 .owner = THIS_MODULE, 1986 .load_super = super_1_load, 1987 .validate_super = super_1_validate, 1988 .sync_super = super_1_sync, 1989 .rdev_size_change = super_1_rdev_size_change, 1990 .allow_new_offset = super_1_allow_new_offset, 1991 }, 1992 }; 1993 1994 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1995 { 1996 if (mddev->sync_super) { 1997 mddev->sync_super(mddev, rdev); 1998 return; 1999 } 2000 2001 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2002 2003 super_types[mddev->major_version].sync_super(mddev, rdev); 2004 } 2005 2006 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2007 { 2008 struct md_rdev *rdev, *rdev2; 2009 2010 rcu_read_lock(); 2011 rdev_for_each_rcu(rdev, mddev1) { 2012 if 
(test_bit(Faulty, &rdev->flags) || 2013 test_bit(Journal, &rdev->flags) || 2014 rdev->raid_disk == -1) 2015 continue; 2016 rdev_for_each_rcu(rdev2, mddev2) { 2017 if (test_bit(Faulty, &rdev2->flags) || 2018 test_bit(Journal, &rdev2->flags) || 2019 rdev2->raid_disk == -1) 2020 continue; 2021 if (rdev->bdev->bd_contains == 2022 rdev2->bdev->bd_contains) { 2023 rcu_read_unlock(); 2024 return 1; 2025 } 2026 } 2027 } 2028 rcu_read_unlock(); 2029 return 0; 2030 } 2031 2032 static LIST_HEAD(pending_raid_disks); 2033 2034 /* 2035 * Try to register data integrity profile for an mddev 2036 * 2037 * This is called when an array is started and after a disk has been kicked 2038 * from the array. It only succeeds if all working and active component devices 2039 * are integrity capable with matching profiles. 2040 */ 2041 int md_integrity_register(struct mddev *mddev) 2042 { 2043 struct md_rdev *rdev, *reference = NULL; 2044 2045 if (list_empty(&mddev->disks)) 2046 return 0; /* nothing to do */ 2047 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2048 return 0; /* shouldn't register, or already is */ 2049 rdev_for_each(rdev, mddev) { 2050 /* skip spares and non-functional disks */ 2051 if (test_bit(Faulty, &rdev->flags)) 2052 continue; 2053 if (rdev->raid_disk < 0) 2054 continue; 2055 if (!reference) { 2056 /* Use the first rdev as the reference */ 2057 reference = rdev; 2058 continue; 2059 } 2060 /* does this rdev's profile match the reference profile? */ 2061 if (blk_integrity_compare(reference->bdev->bd_disk, 2062 rdev->bdev->bd_disk) < 0) 2063 return -EINVAL; 2064 } 2065 if (!reference || !bdev_get_integrity(reference->bdev)) 2066 return 0; 2067 /* 2068 * All component devices are integrity capable and have matching 2069 * profiles, register the common profile for the md device. 
2070 */ 2071 blk_integrity_register(mddev->gendisk, 2072 bdev_get_integrity(reference->bdev)); 2073 2074 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2075 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2076 pr_err("md: failed to create integrity pool for %s\n", 2077 mdname(mddev)); 2078 return -EINVAL; 2079 } 2080 return 0; 2081 } 2082 EXPORT_SYMBOL(md_integrity_register); 2083 2084 /* 2085 * Attempt to add an rdev, but only if it is consistent with the current 2086 * integrity profile 2087 */ 2088 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2089 { 2090 struct blk_integrity *bi_rdev; 2091 struct blk_integrity *bi_mddev; 2092 char name[BDEVNAME_SIZE]; 2093 2094 if (!mddev->gendisk) 2095 return 0; 2096 2097 bi_rdev = bdev_get_integrity(rdev->bdev); 2098 bi_mddev = blk_get_integrity(mddev->gendisk); 2099 2100 if (!bi_mddev) /* nothing to do */ 2101 return 0; 2102 2103 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2104 pr_err("%s: incompatible integrity profile for %s\n", 2105 mdname(mddev), bdevname(rdev->bdev, name)); 2106 return -ENXIO; 2107 } 2108 2109 return 0; 2110 } 2111 EXPORT_SYMBOL(md_integrity_add_rdev); 2112 2113 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2114 { 2115 char b[BDEVNAME_SIZE]; 2116 struct kobject *ko; 2117 int err; 2118 2119 /* prevent duplicates */ 2120 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2121 return -EEXIST; 2122 2123 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && 2124 mddev->pers) 2125 return -EROFS; 2126 2127 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2128 if (!test_bit(Journal, &rdev->flags) && 2129 rdev->sectors && 2130 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2131 if (mddev->pers) { 2132 /* Cannot change size, so fail 2133 * If mddev->level <= 0, then we don't care 2134 * about aligning sizes (e.g. 
linear) 2135 */ 2136 if (mddev->level > 0) 2137 return -ENOSPC; 2138 } else 2139 mddev->dev_sectors = rdev->sectors; 2140 } 2141 2142 /* Verify rdev->desc_nr is unique. 2143 * If it is -1, assign a free number, else 2144 * check number is not in use 2145 */ 2146 rcu_read_lock(); 2147 if (rdev->desc_nr < 0) { 2148 int choice = 0; 2149 if (mddev->pers) 2150 choice = mddev->raid_disks; 2151 while (md_find_rdev_nr_rcu(mddev, choice)) 2152 choice++; 2153 rdev->desc_nr = choice; 2154 } else { 2155 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2156 rcu_read_unlock(); 2157 return -EBUSY; 2158 } 2159 } 2160 rcu_read_unlock(); 2161 if (!test_bit(Journal, &rdev->flags) && 2162 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2163 pr_warn("md: %s: array is limited to %d devices\n", 2164 mdname(mddev), mddev->max_disks); 2165 return -EBUSY; 2166 } 2167 bdevname(rdev->bdev,b); 2168 strreplace(b, '/', '!'); 2169 2170 rdev->mddev = mddev; 2171 pr_debug("md: bind<%s>\n", b); 2172 2173 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2174 goto fail; 2175 2176 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2177 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2178 /* failure here is OK */; 2179 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2180 2181 list_add_rcu(&rdev->same_set, &mddev->disks); 2182 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2183 2184 /* May as well allow recovery to be retried once */ 2185 mddev->recovery_disabled++; 2186 2187 return 0; 2188 2189 fail: 2190 pr_warn("md: failed to register dev-%s for %s\n", 2191 b, mdname(mddev)); 2192 return err; 2193 } 2194 2195 static void md_delayed_delete(struct work_struct *ws) 2196 { 2197 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2198 kobject_del(&rdev->kobj); 2199 kobject_put(&rdev->kobj); 2200 } 2201 2202 static void unbind_rdev_from_array(struct md_rdev *rdev) 2203 { 2204 char b[BDEVNAME_SIZE]; 2205 2206 bd_unlink_disk_holder(rdev->bdev, 
rdev->mddev->gendisk); 2207 list_del_rcu(&rdev->same_set); 2208 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2209 rdev->mddev = NULL; 2210 sysfs_remove_link(&rdev->kobj, "block"); 2211 sysfs_put(rdev->sysfs_state); 2212 rdev->sysfs_state = NULL; 2213 rdev->badblocks.count = 0; 2214 /* We need to delay this, otherwise we can deadlock when 2215 * writing to 'remove' to "dev/state". We also need 2216 * to delay it due to rcu usage. 2217 */ 2218 synchronize_rcu(); 2219 INIT_WORK(&rdev->del_work, md_delayed_delete); 2220 kobject_get(&rdev->kobj); 2221 queue_work(md_misc_wq, &rdev->del_work); 2222 } 2223 2224 /* 2225 * prevent the device from being mounted, repartitioned or 2226 * otherwise reused by a RAID array (or any other kernel 2227 * subsystem), by bd_claiming the device. 2228 */ 2229 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2230 { 2231 int err = 0; 2232 struct block_device *bdev; 2233 char b[BDEVNAME_SIZE]; 2234 2235 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2236 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2237 if (IS_ERR(bdev)) { 2238 pr_warn("md: could not open %s.\n", __bdevname(dev, b)); 2239 return PTR_ERR(bdev); 2240 } 2241 rdev->bdev = bdev; 2242 return err; 2243 } 2244 2245 static void unlock_rdev(struct md_rdev *rdev) 2246 { 2247 struct block_device *bdev = rdev->bdev; 2248 rdev->bdev = NULL; 2249 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2250 } 2251 2252 void md_autodetect_dev(dev_t dev); 2253 2254 static void export_rdev(struct md_rdev *rdev) 2255 { 2256 char b[BDEVNAME_SIZE]; 2257 2258 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); 2259 md_rdev_clear(rdev); 2260 #ifndef MODULE 2261 if (test_bit(AutoDetected, &rdev->flags)) 2262 md_autodetect_dev(rdev->bdev->bd_dev); 2263 #endif 2264 unlock_rdev(rdev); 2265 kobject_put(&rdev->kobj); 2266 } 2267 2268 void md_kick_rdev_from_array(struct md_rdev *rdev) 2269 { 2270 unbind_rdev_from_array(rdev); 2271 export_rdev(rdev); 2272 } 2273 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2274 2275 static void export_array(struct mddev *mddev) 2276 { 2277 struct md_rdev *rdev; 2278 2279 while (!list_empty(&mddev->disks)) { 2280 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2281 same_set); 2282 md_kick_rdev_from_array(rdev); 2283 } 2284 mddev->raid_disks = 0; 2285 mddev->major_version = 0; 2286 } 2287 2288 static bool set_in_sync(struct mddev *mddev) 2289 { 2290 WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); 2291 if (!mddev->in_sync) { 2292 mddev->sync_checkers++; 2293 spin_unlock(&mddev->lock); 2294 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2295 spin_lock(&mddev->lock); 2296 if (!mddev->in_sync && 2297 percpu_ref_is_zero(&mddev->writes_pending)) { 2298 mddev->in_sync = 1; 2299 /* 2300 * Ensure ->in_sync is visible before we clear 2301 * ->sync_checkers. 
2302 */ 2303 smp_mb(); 2304 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2305 sysfs_notify_dirent_safe(mddev->sysfs_state); 2306 } 2307 if (--mddev->sync_checkers == 0) 2308 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2309 } 2310 if (mddev->safemode == 1) 2311 mddev->safemode = 0; 2312 return mddev->in_sync; 2313 } 2314 2315 static void sync_sbs(struct mddev *mddev, int nospares) 2316 { 2317 /* Update each superblock (in-memory image), but 2318 * if we are allowed to, skip spares which already 2319 * have the right event counter, or have one earlier 2320 * (which would mean they aren't being marked as dirty 2321 * with the rest of the array) 2322 */ 2323 struct md_rdev *rdev; 2324 rdev_for_each(rdev, mddev) { 2325 if (rdev->sb_events == mddev->events || 2326 (nospares && 2327 rdev->raid_disk < 0 && 2328 rdev->sb_events+1 == mddev->events)) { 2329 /* Don't update this superblock */ 2330 rdev->sb_loaded = 2; 2331 } else { 2332 sync_super(mddev, rdev); 2333 rdev->sb_loaded = 1; 2334 } 2335 } 2336 } 2337 2338 static bool does_sb_need_changing(struct mddev *mddev) 2339 { 2340 struct md_rdev *rdev; 2341 struct mdp_superblock_1 *sb; 2342 int role; 2343 2344 /* Find a good rdev */ 2345 rdev_for_each(rdev, mddev) 2346 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2347 break; 2348 2349 /* No good device found. */ 2350 if (!rdev) 2351 return false; 2352 2353 sb = page_address(rdev->sb_page); 2354 /* Check if a device has become faulty or a spare become active */ 2355 rdev_for_each(rdev, mddev) { 2356 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2357 /* Device activated? */ 2358 if (role == 0xffff && rdev->raid_disk >=0 && 2359 !test_bit(Faulty, &rdev->flags)) 2360 return true; 2361 /* Device turned faulty? 
*/ 2362 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) 2363 return true; 2364 } 2365 2366 /* Check if any mddev parameters have changed */ 2367 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2368 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2369 (mddev->layout != le32_to_cpu(sb->layout)) || 2370 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2371 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2372 return true; 2373 2374 return false; 2375 } 2376 2377 void md_update_sb(struct mddev *mddev, int force_change) 2378 { 2379 struct md_rdev *rdev; 2380 int sync_req; 2381 int nospares = 0; 2382 int any_badblocks_changed = 0; 2383 int ret = -1; 2384 2385 if (mddev->ro) { 2386 if (force_change) 2387 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2388 return; 2389 } 2390 2391 repeat: 2392 if (mddev_is_clustered(mddev)) { 2393 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2394 force_change = 1; 2395 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2396 nospares = 1; 2397 ret = md_cluster_ops->metadata_update_start(mddev); 2398 /* Has someone else has updated the sb */ 2399 if (!does_sb_need_changing(mddev)) { 2400 if (ret == 0) 2401 md_cluster_ops->metadata_update_cancel(mddev); 2402 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2403 BIT(MD_SB_CHANGE_DEVS) | 2404 BIT(MD_SB_CHANGE_CLEAN)); 2405 return; 2406 } 2407 } 2408 2409 /* First make sure individual recovery_offsets are correct */ 2410 rdev_for_each(rdev, mddev) { 2411 if (rdev->raid_disk >= 0 && 2412 mddev->delta_disks >= 0 && 2413 !test_bit(Journal, &rdev->flags) && 2414 !test_bit(In_sync, &rdev->flags) && 2415 mddev->curr_resync_completed > rdev->recovery_offset) 2416 rdev->recovery_offset = mddev->curr_resync_completed; 2417 2418 } 2419 if (!mddev->persistent) { 2420 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2421 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2422 if (!mddev->external) { 2423 
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2424 rdev_for_each(rdev, mddev) { 2425 if (rdev->badblocks.changed) { 2426 rdev->badblocks.changed = 0; 2427 ack_all_badblocks(&rdev->badblocks); 2428 md_error(mddev, rdev); 2429 } 2430 clear_bit(Blocked, &rdev->flags); 2431 clear_bit(BlockedBadBlocks, &rdev->flags); 2432 wake_up(&rdev->blocked_wait); 2433 } 2434 } 2435 wake_up(&mddev->sb_wait); 2436 return; 2437 } 2438 2439 spin_lock(&mddev->lock); 2440 2441 mddev->utime = ktime_get_real_seconds(); 2442 2443 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2444 force_change = 1; 2445 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2446 /* just a clean<-> dirty transition, possibly leave spares alone, 2447 * though if events isn't the right even/odd, we will have to do 2448 * spares after all 2449 */ 2450 nospares = 1; 2451 if (force_change) 2452 nospares = 0; 2453 if (mddev->degraded) 2454 /* If the array is degraded, then skipping spares is both 2455 * dangerous and fairly pointless. 2456 * Dangerous because a device that was removed from the array 2457 * might have a event_count that still looks up-to-date, 2458 * so it can be re-added without a resync. 2459 * Pointless because if there are any spares to skip, 2460 * then a recovery will happen and soon that array won't 2461 * be degraded any more and the spare can go back to sleep then. 2462 */ 2463 nospares = 0; 2464 2465 sync_req = mddev->in_sync; 2466 2467 /* If this is just a dirty<->clean transition, and the array is clean 2468 * and 'events' is odd, we can roll back to the previous clean state */ 2469 if (nospares 2470 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2471 && mddev->can_decrease_events 2472 && mddev->events != 1) { 2473 mddev->events--; 2474 mddev->can_decrease_events = 0; 2475 } else { 2476 /* otherwise we have to go forward and ... 
*/ 2477 mddev->events ++; 2478 mddev->can_decrease_events = nospares; 2479 } 2480 2481 /* 2482 * This 64-bit counter should never wrap. 2483 * Either we are in around ~1 trillion A.C., assuming 2484 * 1 reboot per second, or we have a bug... 2485 */ 2486 WARN_ON(mddev->events == 0); 2487 2488 rdev_for_each(rdev, mddev) { 2489 if (rdev->badblocks.changed) 2490 any_badblocks_changed++; 2491 if (test_bit(Faulty, &rdev->flags)) 2492 set_bit(FaultRecorded, &rdev->flags); 2493 } 2494 2495 sync_sbs(mddev, nospares); 2496 spin_unlock(&mddev->lock); 2497 2498 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2499 mdname(mddev), mddev->in_sync); 2500 2501 if (mddev->queue) 2502 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2503 rewrite: 2504 bitmap_update_sb(mddev->bitmap); 2505 rdev_for_each(rdev, mddev) { 2506 char b[BDEVNAME_SIZE]; 2507 2508 if (rdev->sb_loaded != 1) 2509 continue; /* no noise on spare devices */ 2510 2511 if (!test_bit(Faulty, &rdev->flags)) { 2512 md_super_write(mddev,rdev, 2513 rdev->sb_start, rdev->sb_size, 2514 rdev->sb_page); 2515 pr_debug("md: (write) %s's sb offset: %llu\n", 2516 bdevname(rdev->bdev, b), 2517 (unsigned long long)rdev->sb_start); 2518 rdev->sb_events = mddev->events; 2519 if (rdev->badblocks.size) { 2520 md_super_write(mddev, rdev, 2521 rdev->badblocks.sector, 2522 rdev->badblocks.size << 9, 2523 rdev->bb_page); 2524 rdev->badblocks.size = 0; 2525 } 2526 2527 } else 2528 pr_debug("md: %s (skipping faulty)\n", 2529 bdevname(rdev->bdev, b)); 2530 2531 if (mddev->level == LEVEL_MULTIPATH) 2532 /* only need to write one superblock... 
*/ 2533 break; 2534 } 2535 if (md_super_wait(mddev) < 0) 2536 goto rewrite; 2537 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2538 2539 if (mddev_is_clustered(mddev) && ret == 0) 2540 md_cluster_ops->metadata_update_finish(mddev); 2541 2542 if (mddev->in_sync != sync_req || 2543 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2544 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2545 /* have to write it out again */ 2546 goto repeat; 2547 wake_up(&mddev->sb_wait); 2548 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2549 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2550 2551 rdev_for_each(rdev, mddev) { 2552 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2553 clear_bit(Blocked, &rdev->flags); 2554 2555 if (any_badblocks_changed) 2556 ack_all_badblocks(&rdev->badblocks); 2557 clear_bit(BlockedBadBlocks, &rdev->flags); 2558 wake_up(&rdev->blocked_wait); 2559 } 2560 } 2561 EXPORT_SYMBOL(md_update_sb); 2562 2563 static int add_bound_rdev(struct md_rdev *rdev) 2564 { 2565 struct mddev *mddev = rdev->mddev; 2566 int err = 0; 2567 bool add_journal = test_bit(Journal, &rdev->flags); 2568 2569 if (!mddev->pers->hot_remove_disk || add_journal) { 2570 /* If there is hot_add_disk but no hot_remove_disk 2571 * then added disks for geometry changes, 2572 * and should be added immediately. 2573 */ 2574 super_types[mddev->major_version]. 
2575 validate_super(mddev, rdev); 2576 if (add_journal) 2577 mddev_suspend(mddev); 2578 err = mddev->pers->hot_add_disk(mddev, rdev); 2579 if (add_journal) 2580 mddev_resume(mddev); 2581 if (err) { 2582 md_kick_rdev_from_array(rdev); 2583 return err; 2584 } 2585 } 2586 sysfs_notify_dirent_safe(rdev->sysfs_state); 2587 2588 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2589 if (mddev->degraded) 2590 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2591 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2592 md_new_event(mddev); 2593 md_wakeup_thread(mddev->thread); 2594 return 0; 2595 } 2596 2597 /* words written to sysfs files may, or may not, be \n terminated. 2598 * We want to accept with case. For this we use cmd_match. 2599 */ 2600 static int cmd_match(const char *cmd, const char *str) 2601 { 2602 /* See if cmd, written into a sysfs file, matches 2603 * str. They must either be the same, or cmd can 2604 * have a trailing newline 2605 */ 2606 while (*cmd && *str && *cmd == *str) { 2607 cmd++; 2608 str++; 2609 } 2610 if (*cmd == '\n') 2611 cmd++; 2612 if (*str || *cmd) 2613 return 0; 2614 return 1; 2615 } 2616 2617 struct rdev_sysfs_entry { 2618 struct attribute attr; 2619 ssize_t (*show)(struct md_rdev *, char *); 2620 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2621 }; 2622 2623 static ssize_t 2624 state_show(struct md_rdev *rdev, char *page) 2625 { 2626 char *sep = ","; 2627 size_t len = 0; 2628 unsigned long flags = ACCESS_ONCE(rdev->flags); 2629 2630 if (test_bit(Faulty, &flags) || 2631 (!test_bit(ExternalBbl, &flags) && 2632 rdev->badblocks.unacked_exist)) 2633 len += sprintf(page+len, "faulty%s", sep); 2634 if (test_bit(In_sync, &flags)) 2635 len += sprintf(page+len, "in_sync%s", sep); 2636 if (test_bit(Journal, &flags)) 2637 len += sprintf(page+len, "journal%s", sep); 2638 if (test_bit(WriteMostly, &flags)) 2639 len += sprintf(page+len, "write_mostly%s", sep); 2640 if (test_bit(Blocked, &flags) || 2641 (rdev->badblocks.unacked_exist 2642 && 
!test_bit(Faulty, &flags))) 2643 len += sprintf(page+len, "blocked%s", sep); 2644 if (!test_bit(Faulty, &flags) && 2645 !test_bit(Journal, &flags) && 2646 !test_bit(In_sync, &flags)) 2647 len += sprintf(page+len, "spare%s", sep); 2648 if (test_bit(WriteErrorSeen, &flags)) 2649 len += sprintf(page+len, "write_error%s", sep); 2650 if (test_bit(WantReplacement, &flags)) 2651 len += sprintf(page+len, "want_replacement%s", sep); 2652 if (test_bit(Replacement, &flags)) 2653 len += sprintf(page+len, "replacement%s", sep); 2654 if (test_bit(ExternalBbl, &flags)) 2655 len += sprintf(page+len, "external_bbl%s", sep); 2656 if (test_bit(FailFast, &flags)) 2657 len += sprintf(page+len, "failfast%s", sep); 2658 2659 if (len) 2660 len -= strlen(sep); 2661 2662 return len+sprintf(page+len, "\n"); 2663 } 2664 2665 static ssize_t 2666 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2667 { 2668 /* can write 2669 * faulty - simulates an error 2670 * remove - disconnects the device 2671 * writemostly - sets write_mostly 2672 * -writemostly - clears write_mostly 2673 * blocked - sets the Blocked flags 2674 * -blocked - clears the Blocked and possibly simulates an error 2675 * insync - sets Insync providing device isn't active 2676 * -insync - clear Insync for a device with a slot assigned, 2677 * so that it gets rebuilt based on bitmap 2678 * write_error - sets WriteErrorSeen 2679 * -write_error - clears WriteErrorSeen 2680 * {,-}failfast - set/clear FailFast 2681 */ 2682 int err = -EINVAL; 2683 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2684 md_error(rdev->mddev, rdev); 2685 if (test_bit(Faulty, &rdev->flags)) 2686 err = 0; 2687 else 2688 err = -EBUSY; 2689 } else if (cmd_match(buf, "remove")) { 2690 if (rdev->mddev->pers) { 2691 clear_bit(Blocked, &rdev->flags); 2692 remove_and_add_spares(rdev->mddev, rdev); 2693 } 2694 if (rdev->raid_disk >= 0) 2695 err = -EBUSY; 2696 else { 2697 struct mddev *mddev = rdev->mddev; 2698 err = 0; 2699 if 
(mddev_is_clustered(mddev)) 2700 err = md_cluster_ops->remove_disk(mddev, rdev); 2701 2702 if (err == 0) { 2703 md_kick_rdev_from_array(rdev); 2704 if (mddev->pers) { 2705 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2706 md_wakeup_thread(mddev->thread); 2707 } 2708 md_new_event(mddev); 2709 } 2710 } 2711 } else if (cmd_match(buf, "writemostly")) { 2712 set_bit(WriteMostly, &rdev->flags); 2713 err = 0; 2714 } else if (cmd_match(buf, "-writemostly")) { 2715 clear_bit(WriteMostly, &rdev->flags); 2716 err = 0; 2717 } else if (cmd_match(buf, "blocked")) { 2718 set_bit(Blocked, &rdev->flags); 2719 err = 0; 2720 } else if (cmd_match(buf, "-blocked")) { 2721 if (!test_bit(Faulty, &rdev->flags) && 2722 !test_bit(ExternalBbl, &rdev->flags) && 2723 rdev->badblocks.unacked_exist) { 2724 /* metadata handler doesn't understand badblocks, 2725 * so we need to fail the device 2726 */ 2727 md_error(rdev->mddev, rdev); 2728 } 2729 clear_bit(Blocked, &rdev->flags); 2730 clear_bit(BlockedBadBlocks, &rdev->flags); 2731 wake_up(&rdev->blocked_wait); 2732 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2733 md_wakeup_thread(rdev->mddev->thread); 2734 2735 err = 0; 2736 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2737 set_bit(In_sync, &rdev->flags); 2738 err = 0; 2739 } else if (cmd_match(buf, "failfast")) { 2740 set_bit(FailFast, &rdev->flags); 2741 err = 0; 2742 } else if (cmd_match(buf, "-failfast")) { 2743 clear_bit(FailFast, &rdev->flags); 2744 err = 0; 2745 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2746 !test_bit(Journal, &rdev->flags)) { 2747 if (rdev->mddev->pers == NULL) { 2748 clear_bit(In_sync, &rdev->flags); 2749 rdev->saved_raid_disk = rdev->raid_disk; 2750 rdev->raid_disk = -1; 2751 err = 0; 2752 } 2753 } else if (cmd_match(buf, "write_error")) { 2754 set_bit(WriteErrorSeen, &rdev->flags); 2755 err = 0; 2756 } else if (cmd_match(buf, "-write_error")) { 2757 clear_bit(WriteErrorSeen, &rdev->flags); 2758 err = 0; 2759 } else if 
(cmd_match(buf, "want_replacement")) { 2760 /* Any non-spare device that is not a replacement can 2761 * become want_replacement at any time, but we then need to 2762 * check if recovery is needed. 2763 */ 2764 if (rdev->raid_disk >= 0 && 2765 !test_bit(Journal, &rdev->flags) && 2766 !test_bit(Replacement, &rdev->flags)) 2767 set_bit(WantReplacement, &rdev->flags); 2768 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2769 md_wakeup_thread(rdev->mddev->thread); 2770 err = 0; 2771 } else if (cmd_match(buf, "-want_replacement")) { 2772 /* Clearing 'want_replacement' is always allowed. 2773 * Once replacements starts it is too late though. 2774 */ 2775 err = 0; 2776 clear_bit(WantReplacement, &rdev->flags); 2777 } else if (cmd_match(buf, "replacement")) { 2778 /* Can only set a device as a replacement when array has not 2779 * yet been started. Once running, replacement is automatic 2780 * from spares, or by assigning 'slot'. 2781 */ 2782 if (rdev->mddev->pers) 2783 err = -EBUSY; 2784 else { 2785 set_bit(Replacement, &rdev->flags); 2786 err = 0; 2787 } 2788 } else if (cmd_match(buf, "-replacement")) { 2789 /* Similarly, can only clear Replacement before start */ 2790 if (rdev->mddev->pers) 2791 err = -EBUSY; 2792 else { 2793 clear_bit(Replacement, &rdev->flags); 2794 err = 0; 2795 } 2796 } else if (cmd_match(buf, "re-add")) { 2797 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2798 /* clear_bit is performed _after_ all the devices 2799 * have their local Faulty bit cleared. 
If any writes 2800 * happen in the meantime in the local node, they 2801 * will land in the local bitmap, which will be synced 2802 * by this node eventually 2803 */ 2804 if (!mddev_is_clustered(rdev->mddev) || 2805 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2806 clear_bit(Faulty, &rdev->flags); 2807 err = add_bound_rdev(rdev); 2808 } 2809 } else 2810 err = -EBUSY; 2811 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 2812 set_bit(ExternalBbl, &rdev->flags); 2813 rdev->badblocks.shift = 0; 2814 err = 0; 2815 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 2816 clear_bit(ExternalBbl, &rdev->flags); 2817 err = 0; 2818 } 2819 if (!err) 2820 sysfs_notify_dirent_safe(rdev->sysfs_state); 2821 return err ? err : len; 2822 } 2823 static struct rdev_sysfs_entry rdev_state = 2824 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2825 2826 static ssize_t 2827 errors_show(struct md_rdev *rdev, char *page) 2828 { 2829 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2830 } 2831 2832 static ssize_t 2833 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2834 { 2835 unsigned int n; 2836 int rv; 2837 2838 rv = kstrtouint(buf, 10, &n); 2839 if (rv < 0) 2840 return rv; 2841 atomic_set(&rdev->corrected_errors, n); 2842 return len; 2843 } 2844 static struct rdev_sysfs_entry rdev_errors = 2845 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2846 2847 static ssize_t 2848 slot_show(struct md_rdev *rdev, char *page) 2849 { 2850 if (test_bit(Journal, &rdev->flags)) 2851 return sprintf(page, "journal\n"); 2852 else if (rdev->raid_disk < 0) 2853 return sprintf(page, "none\n"); 2854 else 2855 return sprintf(page, "%d\n", rdev->raid_disk); 2856 } 2857 2858 static ssize_t 2859 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2860 { 2861 int slot; 2862 int err; 2863 2864 if (test_bit(Journal, &rdev->flags)) 2865 return -EBUSY; 2866 if (strncmp(buf, "none", 4)==0) 
2867 slot = -1; 2868 else { 2869 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2870 if (err < 0) 2871 return err; 2872 } 2873 if (rdev->mddev->pers && slot == -1) { 2874 /* Setting 'slot' on an active array requires also 2875 * updating the 'rd%d' link, and communicating 2876 * with the personality with ->hot_*_disk. 2877 * For now we only support removing 2878 * failed/spare devices. This normally happens automatically, 2879 * but not when the metadata is externally managed. 2880 */ 2881 if (rdev->raid_disk == -1) 2882 return -EEXIST; 2883 /* personality does all needed checks */ 2884 if (rdev->mddev->pers->hot_remove_disk == NULL) 2885 return -EINVAL; 2886 clear_bit(Blocked, &rdev->flags); 2887 remove_and_add_spares(rdev->mddev, rdev); 2888 if (rdev->raid_disk >= 0) 2889 return -EBUSY; 2890 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2891 md_wakeup_thread(rdev->mddev->thread); 2892 } else if (rdev->mddev->pers) { 2893 /* Activating a spare .. or possibly reactivating 2894 * if we ever get bitmaps working here. 2895 */ 2896 int err; 2897 2898 if (rdev->raid_disk != -1) 2899 return -EBUSY; 2900 2901 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2902 return -EBUSY; 2903 2904 if (rdev->mddev->pers->hot_add_disk == NULL) 2905 return -EINVAL; 2906 2907 if (slot >= rdev->mddev->raid_disks && 2908 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2909 return -ENOSPC; 2910 2911 rdev->raid_disk = slot; 2912 if (test_bit(In_sync, &rdev->flags)) 2913 rdev->saved_raid_disk = slot; 2914 else 2915 rdev->saved_raid_disk = -1; 2916 clear_bit(In_sync, &rdev->flags); 2917 clear_bit(Bitmap_sync, &rdev->flags); 2918 err = rdev->mddev->pers-> 2919 hot_add_disk(rdev->mddev, rdev); 2920 if (err) { 2921 rdev->raid_disk = -1; 2922 return err; 2923 } else 2924 sysfs_notify_dirent_safe(rdev->sysfs_state); 2925 if (sysfs_link_rdev(rdev->mddev, rdev)) 2926 /* failure here is OK */; 2927 /* don't wakeup anyone, leave that to userspace. 
*/ 2928 } else { 2929 if (slot >= rdev->mddev->raid_disks && 2930 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2931 return -ENOSPC; 2932 rdev->raid_disk = slot; 2933 /* assume it is working */ 2934 clear_bit(Faulty, &rdev->flags); 2935 clear_bit(WriteMostly, &rdev->flags); 2936 set_bit(In_sync, &rdev->flags); 2937 sysfs_notify_dirent_safe(rdev->sysfs_state); 2938 } 2939 return len; 2940 } 2941 2942 static struct rdev_sysfs_entry rdev_slot = 2943 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2944 2945 static ssize_t 2946 offset_show(struct md_rdev *rdev, char *page) 2947 { 2948 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2949 } 2950 2951 static ssize_t 2952 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2953 { 2954 unsigned long long offset; 2955 if (kstrtoull(buf, 10, &offset) < 0) 2956 return -EINVAL; 2957 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2958 return -EBUSY; 2959 if (rdev->sectors && rdev->mddev->external) 2960 /* Must set offset before size, so overlap checks 2961 * can be sane */ 2962 return -EBUSY; 2963 rdev->data_offset = offset; 2964 rdev->new_data_offset = offset; 2965 return len; 2966 } 2967 2968 static struct rdev_sysfs_entry rdev_offset = 2969 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2970 2971 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2972 { 2973 return sprintf(page, "%llu\n", 2974 (unsigned long long)rdev->new_data_offset); 2975 } 2976 2977 static ssize_t new_offset_store(struct md_rdev *rdev, 2978 const char *buf, size_t len) 2979 { 2980 unsigned long long new_offset; 2981 struct mddev *mddev = rdev->mddev; 2982 2983 if (kstrtoull(buf, 10, &new_offset) < 0) 2984 return -EINVAL; 2985 2986 if (mddev->sync_thread || 2987 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2988 return -EBUSY; 2989 if (new_offset == rdev->data_offset) 2990 /* reset is always permitted */ 2991 ; 2992 else if (new_offset > rdev->data_offset) { 2993 /* must not 
push array size beyond rdev_sectors */ 2994 if (new_offset - rdev->data_offset 2995 + mddev->dev_sectors > rdev->sectors) 2996 return -E2BIG; 2997 } 2998 /* Metadata worries about other space details. */ 2999 3000 /* decreasing the offset is inconsistent with a backwards 3001 * reshape. 3002 */ 3003 if (new_offset < rdev->data_offset && 3004 mddev->reshape_backwards) 3005 return -EINVAL; 3006 /* Increasing offset is inconsistent with forwards 3007 * reshape. reshape_direction should be set to 3008 * 'backwards' first. 3009 */ 3010 if (new_offset > rdev->data_offset && 3011 !mddev->reshape_backwards) 3012 return -EINVAL; 3013 3014 if (mddev->pers && mddev->persistent && 3015 !super_types[mddev->major_version] 3016 .allow_new_offset(rdev, new_offset)) 3017 return -E2BIG; 3018 rdev->new_data_offset = new_offset; 3019 if (new_offset > rdev->data_offset) 3020 mddev->reshape_backwards = 1; 3021 else if (new_offset < rdev->data_offset) 3022 mddev->reshape_backwards = 0; 3023 3024 return len; 3025 } 3026 static struct rdev_sysfs_entry rdev_new_offset = 3027 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3028 3029 static ssize_t 3030 rdev_size_show(struct md_rdev *rdev, char *page) 3031 { 3032 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3033 } 3034 3035 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 3036 { 3037 /* check if two start/length pairs overlap */ 3038 if (s1+l1 <= s2) 3039 return 0; 3040 if (s2+l2 <= s1) 3041 return 0; 3042 return 1; 3043 } 3044 3045 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3046 { 3047 unsigned long long blocks; 3048 sector_t new; 3049 3050 if (kstrtoull(buf, 10, &blocks) < 0) 3051 return -EINVAL; 3052 3053 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3054 return -EINVAL; /* sector conversion overflow */ 3055 3056 new = blocks * 2; 3057 if (new != blocks * 2) 3058 return -EINVAL; /* unsigned long long to sector_t overflow */ 3059 3060 
*sectors = new; 3061 return 0; 3062 } 3063 3064 static ssize_t 3065 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3066 { 3067 struct mddev *my_mddev = rdev->mddev; 3068 sector_t oldsectors = rdev->sectors; 3069 sector_t sectors; 3070 3071 if (test_bit(Journal, &rdev->flags)) 3072 return -EBUSY; 3073 if (strict_blocks_to_sectors(buf, §ors) < 0) 3074 return -EINVAL; 3075 if (rdev->data_offset != rdev->new_data_offset) 3076 return -EINVAL; /* too confusing */ 3077 if (my_mddev->pers && rdev->raid_disk >= 0) { 3078 if (my_mddev->persistent) { 3079 sectors = super_types[my_mddev->major_version]. 3080 rdev_size_change(rdev, sectors); 3081 if (!sectors) 3082 return -EBUSY; 3083 } else if (!sectors) 3084 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 3085 rdev->data_offset; 3086 if (!my_mddev->pers->resize) 3087 /* Cannot change size for RAID0 or Linear etc */ 3088 return -EINVAL; 3089 } 3090 if (sectors < my_mddev->dev_sectors) 3091 return -EINVAL; /* component must fit device */ 3092 3093 rdev->sectors = sectors; 3094 if (sectors > oldsectors && my_mddev->external) { 3095 /* Need to check that all other rdevs with the same 3096 * ->bdev do not overlap. 'rcu' is sufficient to walk 3097 * the rdev lists safely. 3098 * This check does not provide a hard guarantee, it 3099 * just helps avoid dangerous mistakes. 3100 */ 3101 struct mddev *mddev; 3102 int overlap = 0; 3103 struct list_head *tmp; 3104 3105 rcu_read_lock(); 3106 for_each_mddev(mddev, tmp) { 3107 struct md_rdev *rdev2; 3108 3109 rdev_for_each(rdev2, mddev) 3110 if (rdev->bdev == rdev2->bdev && 3111 rdev != rdev2 && 3112 overlaps(rdev->data_offset, rdev->sectors, 3113 rdev2->data_offset, 3114 rdev2->sectors)) { 3115 overlap = 1; 3116 break; 3117 } 3118 if (overlap) { 3119 mddev_put(mddev); 3120 break; 3121 } 3122 } 3123 rcu_read_unlock(); 3124 if (overlap) { 3125 /* Someone else could have slipped in a size 3126 * change here, but doing so is just silly. 
3127 * We put oldsectors back because we *know* it is 3128 * safe, and trust userspace not to race with 3129 * itself 3130 */ 3131 rdev->sectors = oldsectors; 3132 return -EBUSY; 3133 } 3134 } 3135 return len; 3136 } 3137 3138 static struct rdev_sysfs_entry rdev_size = 3139 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3140 3141 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3142 { 3143 unsigned long long recovery_start = rdev->recovery_offset; 3144 3145 if (test_bit(In_sync, &rdev->flags) || 3146 recovery_start == MaxSector) 3147 return sprintf(page, "none\n"); 3148 3149 return sprintf(page, "%llu\n", recovery_start); 3150 } 3151 3152 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3153 { 3154 unsigned long long recovery_start; 3155 3156 if (cmd_match(buf, "none")) 3157 recovery_start = MaxSector; 3158 else if (kstrtoull(buf, 10, &recovery_start)) 3159 return -EINVAL; 3160 3161 if (rdev->mddev->pers && 3162 rdev->raid_disk >= 0) 3163 return -EBUSY; 3164 3165 rdev->recovery_offset = recovery_start; 3166 if (recovery_start == MaxSector) 3167 set_bit(In_sync, &rdev->flags); 3168 else 3169 clear_bit(In_sync, &rdev->flags); 3170 return len; 3171 } 3172 3173 static struct rdev_sysfs_entry rdev_recovery_start = 3174 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3175 3176 /* sysfs access to bad-blocks list. 3177 * We present two files. 3178 * 'bad-blocks' lists sector numbers and lengths of ranges that 3179 * are recorded as bad. The list is truncated to fit within 3180 * the one-page limit of sysfs. 3181 * Writing "sector length" to this file adds an acknowledged 3182 * bad block list. 3183 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3184 * been acknowledged. Writing to this file adds bad blocks 3185 * without acknowledging them. This is largely for testing. 
3186 */ 3187 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3188 { 3189 return badblocks_show(&rdev->badblocks, page, 0); 3190 } 3191 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3192 { 3193 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3194 /* Maybe that ack was all we needed */ 3195 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3196 wake_up(&rdev->blocked_wait); 3197 return rv; 3198 } 3199 static struct rdev_sysfs_entry rdev_bad_blocks = 3200 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3201 3202 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3203 { 3204 return badblocks_show(&rdev->badblocks, page, 1); 3205 } 3206 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3207 { 3208 return badblocks_store(&rdev->badblocks, page, len, 1); 3209 } 3210 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3211 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3212 3213 static ssize_t 3214 ppl_sector_show(struct md_rdev *rdev, char *page) 3215 { 3216 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3217 } 3218 3219 static ssize_t 3220 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3221 { 3222 unsigned long long sector; 3223 3224 if (kstrtoull(buf, 10, §or) < 0) 3225 return -EINVAL; 3226 if (sector != (sector_t)sector) 3227 return -EINVAL; 3228 3229 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3230 rdev->raid_disk >= 0) 3231 return -EBUSY; 3232 3233 if (rdev->mddev->persistent) { 3234 if (rdev->mddev->major_version == 0) 3235 return -EINVAL; 3236 if ((sector > rdev->sb_start && 3237 sector - rdev->sb_start > S16_MAX) || 3238 (sector < rdev->sb_start && 3239 rdev->sb_start - sector > -S16_MIN)) 3240 return -EINVAL; 3241 rdev->ppl.offset = sector - rdev->sb_start; 3242 } else if (!rdev->mddev->external) { 3243 return -EBUSY; 3244 } 3245 rdev->ppl.sector = sector; 3246 return 
len; 3247 } 3248 3249 static struct rdev_sysfs_entry rdev_ppl_sector = 3250 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3251 3252 static ssize_t 3253 ppl_size_show(struct md_rdev *rdev, char *page) 3254 { 3255 return sprintf(page, "%u\n", rdev->ppl.size); 3256 } 3257 3258 static ssize_t 3259 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3260 { 3261 unsigned int size; 3262 3263 if (kstrtouint(buf, 10, &size) < 0) 3264 return -EINVAL; 3265 3266 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3267 rdev->raid_disk >= 0) 3268 return -EBUSY; 3269 3270 if (rdev->mddev->persistent) { 3271 if (rdev->mddev->major_version == 0) 3272 return -EINVAL; 3273 if (size > U16_MAX) 3274 return -EINVAL; 3275 } else if (!rdev->mddev->external) { 3276 return -EBUSY; 3277 } 3278 rdev->ppl.size = size; 3279 return len; 3280 } 3281 3282 static struct rdev_sysfs_entry rdev_ppl_size = 3283 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3284 3285 static struct attribute *rdev_default_attrs[] = { 3286 &rdev_state.attr, 3287 &rdev_errors.attr, 3288 &rdev_slot.attr, 3289 &rdev_offset.attr, 3290 &rdev_new_offset.attr, 3291 &rdev_size.attr, 3292 &rdev_recovery_start.attr, 3293 &rdev_bad_blocks.attr, 3294 &rdev_unack_bad_blocks.attr, 3295 &rdev_ppl_sector.attr, 3296 &rdev_ppl_size.attr, 3297 NULL, 3298 }; 3299 static ssize_t 3300 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3301 { 3302 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3303 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3304 3305 if (!entry->show) 3306 return -EIO; 3307 if (!rdev->mddev) 3308 return -EBUSY; 3309 return entry->show(rdev, page); 3310 } 3311 3312 static ssize_t 3313 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3314 const char *page, size_t length) 3315 { 3316 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, 
attr); 3317 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3318 ssize_t rv; 3319 struct mddev *mddev = rdev->mddev; 3320 3321 if (!entry->store) 3322 return -EIO; 3323 if (!capable(CAP_SYS_ADMIN)) 3324 return -EACCES; 3325 rv = mddev ? mddev_lock(mddev): -EBUSY; 3326 if (!rv) { 3327 if (rdev->mddev == NULL) 3328 rv = -EBUSY; 3329 else 3330 rv = entry->store(rdev, page, length); 3331 mddev_unlock(mddev); 3332 } 3333 return rv; 3334 } 3335 3336 static void rdev_free(struct kobject *ko) 3337 { 3338 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3339 kfree(rdev); 3340 } 3341 static const struct sysfs_ops rdev_sysfs_ops = { 3342 .show = rdev_attr_show, 3343 .store = rdev_attr_store, 3344 }; 3345 static struct kobj_type rdev_ktype = { 3346 .release = rdev_free, 3347 .sysfs_ops = &rdev_sysfs_ops, 3348 .default_attrs = rdev_default_attrs, 3349 }; 3350 3351 int md_rdev_init(struct md_rdev *rdev) 3352 { 3353 rdev->desc_nr = -1; 3354 rdev->saved_raid_disk = -1; 3355 rdev->raid_disk = -1; 3356 rdev->flags = 0; 3357 rdev->data_offset = 0; 3358 rdev->new_data_offset = 0; 3359 rdev->sb_events = 0; 3360 rdev->last_read_error = 0; 3361 rdev->sb_loaded = 0; 3362 rdev->bb_page = NULL; 3363 atomic_set(&rdev->nr_pending, 0); 3364 atomic_set(&rdev->read_errors, 0); 3365 atomic_set(&rdev->corrected_errors, 0); 3366 3367 INIT_LIST_HEAD(&rdev->same_set); 3368 init_waitqueue_head(&rdev->blocked_wait); 3369 3370 /* Add space to store bad block list. 3371 * This reserves the space even on arrays where it cannot 3372 * be used - I wonder if that matters 3373 */ 3374 return badblocks_init(&rdev->badblocks, 0); 3375 } 3376 EXPORT_SYMBOL_GPL(md_rdev_init); 3377 /* 3378 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3379 * 3380 * mark the device faulty if: 3381 * 3382 * - the device is nonexistent (zero size) 3383 * - the device has no valid superblock 3384 * 3385 * a faulty rdev _never_ has rdev->sb set. 
3386 */ 3387 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3388 { 3389 char b[BDEVNAME_SIZE]; 3390 int err; 3391 struct md_rdev *rdev; 3392 sector_t size; 3393 3394 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3395 if (!rdev) 3396 return ERR_PTR(-ENOMEM); 3397 3398 err = md_rdev_init(rdev); 3399 if (err) 3400 goto abort_free; 3401 err = alloc_disk_sb(rdev); 3402 if (err) 3403 goto abort_free; 3404 3405 err = lock_rdev(rdev, newdev, super_format == -2); 3406 if (err) 3407 goto abort_free; 3408 3409 kobject_init(&rdev->kobj, &rdev_ktype); 3410 3411 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3412 if (!size) { 3413 pr_warn("md: %s has zero or unknown size, marking faulty!\n", 3414 bdevname(rdev->bdev,b)); 3415 err = -EINVAL; 3416 goto abort_free; 3417 } 3418 3419 if (super_format >= 0) { 3420 err = super_types[super_format]. 3421 load_super(rdev, NULL, super_minor); 3422 if (err == -EINVAL) { 3423 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", 3424 bdevname(rdev->bdev,b), 3425 super_format, super_minor); 3426 goto abort_free; 3427 } 3428 if (err < 0) { 3429 pr_warn("md: could not read %s's sb, not importing!\n", 3430 bdevname(rdev->bdev,b)); 3431 goto abort_free; 3432 } 3433 } 3434 3435 return rdev; 3436 3437 abort_free: 3438 if (rdev->bdev) 3439 unlock_rdev(rdev); 3440 md_rdev_clear(rdev); 3441 kfree(rdev); 3442 return ERR_PTR(err); 3443 } 3444 3445 /* 3446 * Check a full RAID array for plausibility 3447 */ 3448 3449 static void analyze_sbs(struct mddev *mddev) 3450 { 3451 int i; 3452 struct md_rdev *rdev, *freshest, *tmp; 3453 char b[BDEVNAME_SIZE]; 3454 3455 freshest = NULL; 3456 rdev_for_each_safe(rdev, tmp, mddev) 3457 switch (super_types[mddev->major_version]. 
3458 load_super(rdev, freshest, mddev->minor_version)) { 3459 case 1: 3460 freshest = rdev; 3461 break; 3462 case 0: 3463 break; 3464 default: 3465 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", 3466 bdevname(rdev->bdev,b)); 3467 md_kick_rdev_from_array(rdev); 3468 } 3469 3470 super_types[mddev->major_version]. 3471 validate_super(mddev, freshest); 3472 3473 i = 0; 3474 rdev_for_each_safe(rdev, tmp, mddev) { 3475 if (mddev->max_disks && 3476 (rdev->desc_nr >= mddev->max_disks || 3477 i > mddev->max_disks)) { 3478 pr_warn("md: %s: %s: only %d devices permitted\n", 3479 mdname(mddev), bdevname(rdev->bdev, b), 3480 mddev->max_disks); 3481 md_kick_rdev_from_array(rdev); 3482 continue; 3483 } 3484 if (rdev != freshest) { 3485 if (super_types[mddev->major_version]. 3486 validate_super(mddev, rdev)) { 3487 pr_warn("md: kicking non-fresh %s from array!\n", 3488 bdevname(rdev->bdev,b)); 3489 md_kick_rdev_from_array(rdev); 3490 continue; 3491 } 3492 } 3493 if (mddev->level == LEVEL_MULTIPATH) { 3494 rdev->desc_nr = i++; 3495 rdev->raid_disk = rdev->desc_nr; 3496 set_bit(In_sync, &rdev->flags); 3497 } else if (rdev->raid_disk >= 3498 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3499 !test_bit(Journal, &rdev->flags)) { 3500 rdev->raid_disk = -1; 3501 clear_bit(In_sync, &rdev->flags); 3502 } 3503 } 3504 } 3505 3506 /* Read a fixed-point number. 3507 * Numbers in sysfs attributes should be in "standard" units where 3508 * possible, so time should be in seconds. 3509 * However we internally use a a much smaller unit such as 3510 * milliseconds or jiffies. 3511 * This function takes a decimal number with a possible fractional 3512 * component, and produces an integer which is the result of 3513 * multiplying that number by 10^'scale'. 3514 * all without any floating-point arithmetic. 
3515 */ 3516 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3517 { 3518 unsigned long result = 0; 3519 long decimals = -1; 3520 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3521 if (*cp == '.') 3522 decimals = 0; 3523 else if (decimals < scale) { 3524 unsigned int value; 3525 value = *cp - '0'; 3526 result = result * 10 + value; 3527 if (decimals >= 0) 3528 decimals++; 3529 } 3530 cp++; 3531 } 3532 if (*cp == '\n') 3533 cp++; 3534 if (*cp) 3535 return -EINVAL; 3536 if (decimals < 0) 3537 decimals = 0; 3538 while (decimals < scale) { 3539 result *= 10; 3540 decimals ++; 3541 } 3542 *res = result; 3543 return 0; 3544 } 3545 3546 static ssize_t 3547 safe_delay_show(struct mddev *mddev, char *page) 3548 { 3549 int msec = (mddev->safemode_delay*1000)/HZ; 3550 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3551 } 3552 static ssize_t 3553 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3554 { 3555 unsigned long msec; 3556 3557 if (mddev_is_clustered(mddev)) { 3558 pr_warn("md: Safemode is disabled for clustered mode\n"); 3559 return -EINVAL; 3560 } 3561 3562 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3563 return -EINVAL; 3564 if (msec == 0) 3565 mddev->safemode_delay = 0; 3566 else { 3567 unsigned long old_delay = mddev->safemode_delay; 3568 unsigned long new_delay = (msec*HZ)/1000; 3569 3570 if (new_delay == 0) 3571 new_delay = 1; 3572 mddev->safemode_delay = new_delay; 3573 if (new_delay < old_delay || old_delay == 0) 3574 mod_timer(&mddev->safemode_timer, jiffies+1); 3575 } 3576 return len; 3577 } 3578 static struct md_sysfs_entry md_safe_delay = 3579 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3580 3581 static ssize_t 3582 level_show(struct mddev *mddev, char *page) 3583 { 3584 struct md_personality *p; 3585 int ret; 3586 spin_lock(&mddev->lock); 3587 p = mddev->pers; 3588 if (p) 3589 ret = sprintf(page, "%s\n", p->name); 3590 else if (mddev->clevel[0]) 3591 ret = 
sprintf(page, "%s\n", mddev->clevel); 3592 else if (mddev->level != LEVEL_NONE) 3593 ret = sprintf(page, "%d\n", mddev->level); 3594 else 3595 ret = 0; 3596 spin_unlock(&mddev->lock); 3597 return ret; 3598 } 3599 3600 static ssize_t 3601 level_store(struct mddev *mddev, const char *buf, size_t len) 3602 { 3603 char clevel[16]; 3604 ssize_t rv; 3605 size_t slen = len; 3606 struct md_personality *pers, *oldpers; 3607 long level; 3608 void *priv, *oldpriv; 3609 struct md_rdev *rdev; 3610 3611 if (slen == 0 || slen >= sizeof(clevel)) 3612 return -EINVAL; 3613 3614 rv = mddev_lock(mddev); 3615 if (rv) 3616 return rv; 3617 3618 if (mddev->pers == NULL) { 3619 strncpy(mddev->clevel, buf, slen); 3620 if (mddev->clevel[slen-1] == '\n') 3621 slen--; 3622 mddev->clevel[slen] = 0; 3623 mddev->level = LEVEL_NONE; 3624 rv = len; 3625 goto out_unlock; 3626 } 3627 rv = -EROFS; 3628 if (mddev->ro) 3629 goto out_unlock; 3630 3631 /* request to change the personality. Need to ensure: 3632 * - array is not engaged in resync/recovery/reshape 3633 * - old personality can be suspended 3634 * - new personality will access other array. 
3635 */ 3636 3637 rv = -EBUSY; 3638 if (mddev->sync_thread || 3639 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3640 mddev->reshape_position != MaxSector || 3641 mddev->sysfs_active) 3642 goto out_unlock; 3643 3644 rv = -EINVAL; 3645 if (!mddev->pers->quiesce) { 3646 pr_warn("md: %s: %s does not support online personality change\n", 3647 mdname(mddev), mddev->pers->name); 3648 goto out_unlock; 3649 } 3650 3651 /* Now find the new personality */ 3652 strncpy(clevel, buf, slen); 3653 if (clevel[slen-1] == '\n') 3654 slen--; 3655 clevel[slen] = 0; 3656 if (kstrtol(clevel, 10, &level)) 3657 level = LEVEL_NONE; 3658 3659 if (request_module("md-%s", clevel) != 0) 3660 request_module("md-level-%s", clevel); 3661 spin_lock(&pers_lock); 3662 pers = find_pers(level, clevel); 3663 if (!pers || !try_module_get(pers->owner)) { 3664 spin_unlock(&pers_lock); 3665 pr_warn("md: personality %s not loaded\n", clevel); 3666 rv = -EINVAL; 3667 goto out_unlock; 3668 } 3669 spin_unlock(&pers_lock); 3670 3671 if (pers == mddev->pers) { 3672 /* Nothing to do! */ 3673 module_put(pers->owner); 3674 rv = len; 3675 goto out_unlock; 3676 } 3677 if (!pers->takeover) { 3678 module_put(pers->owner); 3679 pr_warn("md: %s: %s does not support personality takeover\n", 3680 mdname(mddev), clevel); 3681 rv = -EINVAL; 3682 goto out_unlock; 3683 } 3684 3685 rdev_for_each(rdev, mddev) 3686 rdev->new_raid_disk = rdev->raid_disk; 3687 3688 /* ->takeover must set new_* and/or delta_disks 3689 * if it succeeds, and may set them when it fails. 
3690 */ 3691 priv = pers->takeover(mddev); 3692 if (IS_ERR(priv)) { 3693 mddev->new_level = mddev->level; 3694 mddev->new_layout = mddev->layout; 3695 mddev->new_chunk_sectors = mddev->chunk_sectors; 3696 mddev->raid_disks -= mddev->delta_disks; 3697 mddev->delta_disks = 0; 3698 mddev->reshape_backwards = 0; 3699 module_put(pers->owner); 3700 pr_warn("md: %s: %s would not accept array\n", 3701 mdname(mddev), clevel); 3702 rv = PTR_ERR(priv); 3703 goto out_unlock; 3704 } 3705 3706 /* Looks like we have a winner */ 3707 mddev_suspend(mddev); 3708 mddev_detach(mddev); 3709 3710 spin_lock(&mddev->lock); 3711 oldpers = mddev->pers; 3712 oldpriv = mddev->private; 3713 mddev->pers = pers; 3714 mddev->private = priv; 3715 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3716 mddev->level = mddev->new_level; 3717 mddev->layout = mddev->new_layout; 3718 mddev->chunk_sectors = mddev->new_chunk_sectors; 3719 mddev->delta_disks = 0; 3720 mddev->reshape_backwards = 0; 3721 mddev->degraded = 0; 3722 spin_unlock(&mddev->lock); 3723 3724 if (oldpers->sync_request == NULL && 3725 mddev->external) { 3726 /* We are converting from a no-redundancy array 3727 * to a redundancy array and metadata is managed 3728 * externally so we need to be sure that writes 3729 * won't block due to a need to transition 3730 * clean->dirty 3731 * until external management is started. 
3732 */ 3733 mddev->in_sync = 0; 3734 mddev->safemode_delay = 0; 3735 mddev->safemode = 0; 3736 } 3737 3738 oldpers->free(mddev, oldpriv); 3739 3740 if (oldpers->sync_request == NULL && 3741 pers->sync_request != NULL) { 3742 /* need to add the md_redundancy_group */ 3743 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3744 pr_warn("md: cannot register extra attributes for %s\n", 3745 mdname(mddev)); 3746 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3747 } 3748 if (oldpers->sync_request != NULL && 3749 pers->sync_request == NULL) { 3750 /* need to remove the md_redundancy_group */ 3751 if (mddev->to_remove == NULL) 3752 mddev->to_remove = &md_redundancy_group; 3753 } 3754 3755 module_put(oldpers->owner); 3756 3757 rdev_for_each(rdev, mddev) { 3758 if (rdev->raid_disk < 0) 3759 continue; 3760 if (rdev->new_raid_disk >= mddev->raid_disks) 3761 rdev->new_raid_disk = -1; 3762 if (rdev->new_raid_disk == rdev->raid_disk) 3763 continue; 3764 sysfs_unlink_rdev(mddev, rdev); 3765 } 3766 rdev_for_each(rdev, mddev) { 3767 if (rdev->raid_disk < 0) 3768 continue; 3769 if (rdev->new_raid_disk == rdev->raid_disk) 3770 continue; 3771 rdev->raid_disk = rdev->new_raid_disk; 3772 if (rdev->raid_disk < 0) 3773 clear_bit(In_sync, &rdev->flags); 3774 else { 3775 if (sysfs_link_rdev(mddev, rdev)) 3776 pr_warn("md: cannot register rd%d for %s after level change\n", 3777 rdev->raid_disk, mdname(mddev)); 3778 } 3779 } 3780 3781 if (pers->sync_request == NULL) { 3782 /* this is now an array without redundancy, so 3783 * it must always be in_sync 3784 */ 3785 mddev->in_sync = 1; 3786 del_timer_sync(&mddev->safemode_timer); 3787 } 3788 blk_set_stacking_limits(&mddev->queue->limits); 3789 pers->run(mddev); 3790 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3791 mddev_resume(mddev); 3792 if (!mddev->thread) 3793 md_update_sb(mddev, 1); 3794 sysfs_notify(&mddev->kobj, NULL, "level"); 3795 md_new_event(mddev); 3796 rv = len; 3797 out_unlock: 3798 
mddev_unlock(mddev); 3799 return rv; 3800 } 3801 3802 static struct md_sysfs_entry md_level = 3803 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3804 3805 static ssize_t 3806 layout_show(struct mddev *mddev, char *page) 3807 { 3808 /* just a number, not meaningful for all levels */ 3809 if (mddev->reshape_position != MaxSector && 3810 mddev->layout != mddev->new_layout) 3811 return sprintf(page, "%d (%d)\n", 3812 mddev->new_layout, mddev->layout); 3813 return sprintf(page, "%d\n", mddev->layout); 3814 } 3815 3816 static ssize_t 3817 layout_store(struct mddev *mddev, const char *buf, size_t len) 3818 { 3819 unsigned int n; 3820 int err; 3821 3822 err = kstrtouint(buf, 10, &n); 3823 if (err < 0) 3824 return err; 3825 err = mddev_lock(mddev); 3826 if (err) 3827 return err; 3828 3829 if (mddev->pers) { 3830 if (mddev->pers->check_reshape == NULL) 3831 err = -EBUSY; 3832 else if (mddev->ro) 3833 err = -EROFS; 3834 else { 3835 mddev->new_layout = n; 3836 err = mddev->pers->check_reshape(mddev); 3837 if (err) 3838 mddev->new_layout = mddev->layout; 3839 } 3840 } else { 3841 mddev->new_layout = n; 3842 if (mddev->reshape_position == MaxSector) 3843 mddev->layout = n; 3844 } 3845 mddev_unlock(mddev); 3846 return err ?: len; 3847 } 3848 static struct md_sysfs_entry md_layout = 3849 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3850 3851 static ssize_t 3852 raid_disks_show(struct mddev *mddev, char *page) 3853 { 3854 if (mddev->raid_disks == 0) 3855 return 0; 3856 if (mddev->reshape_position != MaxSector && 3857 mddev->delta_disks != 0) 3858 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3859 mddev->raid_disks - mddev->delta_disks); 3860 return sprintf(page, "%d\n", mddev->raid_disks); 3861 } 3862 3863 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3864 3865 static ssize_t 3866 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3867 { 3868 unsigned int n; 3869 int err; 3870 3871 err = kstrtouint(buf, 10, &n); 
3872 if (err < 0) 3873 return err; 3874 3875 err = mddev_lock(mddev); 3876 if (err) 3877 return err; 3878 if (mddev->pers) 3879 err = update_raid_disks(mddev, n); 3880 else if (mddev->reshape_position != MaxSector) { 3881 struct md_rdev *rdev; 3882 int olddisks = mddev->raid_disks - mddev->delta_disks; 3883 3884 err = -EINVAL; 3885 rdev_for_each(rdev, mddev) { 3886 if (olddisks < n && 3887 rdev->data_offset < rdev->new_data_offset) 3888 goto out_unlock; 3889 if (olddisks > n && 3890 rdev->data_offset > rdev->new_data_offset) 3891 goto out_unlock; 3892 } 3893 err = 0; 3894 mddev->delta_disks = n - olddisks; 3895 mddev->raid_disks = n; 3896 mddev->reshape_backwards = (mddev->delta_disks < 0); 3897 } else 3898 mddev->raid_disks = n; 3899 out_unlock: 3900 mddev_unlock(mddev); 3901 return err ? err : len; 3902 } 3903 static struct md_sysfs_entry md_raid_disks = 3904 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3905 3906 static ssize_t 3907 chunk_size_show(struct mddev *mddev, char *page) 3908 { 3909 if (mddev->reshape_position != MaxSector && 3910 mddev->chunk_sectors != mddev->new_chunk_sectors) 3911 return sprintf(page, "%d (%d)\n", 3912 mddev->new_chunk_sectors << 9, 3913 mddev->chunk_sectors << 9); 3914 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3915 } 3916 3917 static ssize_t 3918 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3919 { 3920 unsigned long n; 3921 int err; 3922 3923 err = kstrtoul(buf, 10, &n); 3924 if (err < 0) 3925 return err; 3926 3927 err = mddev_lock(mddev); 3928 if (err) 3929 return err; 3930 if (mddev->pers) { 3931 if (mddev->pers->check_reshape == NULL) 3932 err = -EBUSY; 3933 else if (mddev->ro) 3934 err = -EROFS; 3935 else { 3936 mddev->new_chunk_sectors = n >> 9; 3937 err = mddev->pers->check_reshape(mddev); 3938 if (err) 3939 mddev->new_chunk_sectors = mddev->chunk_sectors; 3940 } 3941 } else { 3942 mddev->new_chunk_sectors = n >> 9; 3943 if (mddev->reshape_position == MaxSector) 
3944 mddev->chunk_sectors = n >> 9; 3945 } 3946 mddev_unlock(mddev); 3947 return err ?: len; 3948 } 3949 static struct md_sysfs_entry md_chunk_size = 3950 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3951 3952 static ssize_t 3953 resync_start_show(struct mddev *mddev, char *page) 3954 { 3955 if (mddev->recovery_cp == MaxSector) 3956 return sprintf(page, "none\n"); 3957 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3958 } 3959 3960 static ssize_t 3961 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3962 { 3963 unsigned long long n; 3964 int err; 3965 3966 if (cmd_match(buf, "none")) 3967 n = MaxSector; 3968 else { 3969 err = kstrtoull(buf, 10, &n); 3970 if (err < 0) 3971 return err; 3972 if (n != (sector_t)n) 3973 return -EINVAL; 3974 } 3975 3976 err = mddev_lock(mddev); 3977 if (err) 3978 return err; 3979 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3980 err = -EBUSY; 3981 3982 if (!err) { 3983 mddev->recovery_cp = n; 3984 if (mddev->pers) 3985 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 3986 } 3987 mddev_unlock(mddev); 3988 return err ?: len; 3989 } 3990 static struct md_sysfs_entry md_resync_start = 3991 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3992 resync_start_show, resync_start_store); 3993 3994 /* 3995 * The array state can be: 3996 * 3997 * clear 3998 * No devices, no size, no level 3999 * Equivalent to STOP_ARRAY ioctl 4000 * inactive 4001 * May have some settings, but array is not active 4002 * all IO results in error 4003 * When written, doesn't tear down array, but just stops it 4004 * suspended (not supported yet) 4005 * All IO requests will block. The array can be reconfigured. 4006 * Writing this, if accepted, will block until array is quiescent 4007 * readonly 4008 * no resync can happen. no superblocks get written. 4009 * write requests fail 4010 * read-auto 4011 * like readonly, but behaves like 'clean' on a write request. 
4012 * 4013 * clean - no pending writes, but otherwise active. 4014 * When written to inactive array, starts without resync 4015 * If a write request arrives then 4016 * if metadata is known, mark 'dirty' and switch to 'active'. 4017 * if not known, block and switch to write-pending 4018 * If written to an active array that has pending writes, then fails. 4019 * active 4020 * fully active: IO and resync can be happening. 4021 * When written to inactive array, starts with resync 4022 * 4023 * write-pending 4024 * clean, but writes are blocked waiting for 'active' to be written. 4025 * 4026 * active-idle 4027 * like active, but no writes have been seen for a while (100msec). 4028 * 4029 */ 4030 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4031 write_pending, active_idle, bad_word}; 4032 static char *array_states[] = { 4033 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4034 "write-pending", "active-idle", NULL }; 4035 4036 static int match_word(const char *word, char **list) 4037 { 4038 int n; 4039 for (n=0; list[n]; n++) 4040 if (cmd_match(word, list[n])) 4041 break; 4042 return n; 4043 } 4044 4045 static ssize_t 4046 array_state_show(struct mddev *mddev, char *page) 4047 { 4048 enum array_state st = inactive; 4049 4050 if (mddev->pers) 4051 switch(mddev->ro) { 4052 case 1: 4053 st = readonly; 4054 break; 4055 case 2: 4056 st = read_auto; 4057 break; 4058 case 0: 4059 spin_lock(&mddev->lock); 4060 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4061 st = write_pending; 4062 else if (mddev->in_sync) 4063 st = clean; 4064 else if (mddev->safemode) 4065 st = active_idle; 4066 else 4067 st = active; 4068 spin_unlock(&mddev->lock); 4069 } 4070 else { 4071 if (list_empty(&mddev->disks) && 4072 mddev->raid_disks == 0 && 4073 mddev->dev_sectors == 0) 4074 st = clear; 4075 else 4076 st = inactive; 4077 } 4078 return sprintf(page, "%s\n", array_states[st]); 4079 } 4080 4081 static int do_md_stop(struct 
mddev *mddev, int ro, struct block_device *bdev); 4082 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4083 static int do_md_run(struct mddev *mddev); 4084 static int restart_array(struct mddev *mddev); 4085 4086 static ssize_t 4087 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4088 { 4089 int err = 0; 4090 enum array_state st = match_word(buf, array_states); 4091 4092 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 4093 /* don't take reconfig_mutex when toggling between 4094 * clean and active 4095 */ 4096 spin_lock(&mddev->lock); 4097 if (st == active) { 4098 restart_array(mddev); 4099 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4100 md_wakeup_thread(mddev->thread); 4101 wake_up(&mddev->sb_wait); 4102 } else /* st == clean */ { 4103 restart_array(mddev); 4104 if (!set_in_sync(mddev)) 4105 err = -EBUSY; 4106 } 4107 if (!err) 4108 sysfs_notify_dirent_safe(mddev->sysfs_state); 4109 spin_unlock(&mddev->lock); 4110 return err ?: len; 4111 } 4112 err = mddev_lock(mddev); 4113 if (err) 4114 return err; 4115 err = -EINVAL; 4116 switch(st) { 4117 case bad_word: 4118 break; 4119 case clear: 4120 /* stopping an active array */ 4121 err = do_md_stop(mddev, 0, NULL); 4122 break; 4123 case inactive: 4124 /* stopping an active array */ 4125 if (mddev->pers) 4126 err = do_md_stop(mddev, 2, NULL); 4127 else 4128 err = 0; /* already inactive */ 4129 break; 4130 case suspended: 4131 break; /* not supported yet */ 4132 case readonly: 4133 if (mddev->pers) 4134 err = md_set_readonly(mddev, NULL); 4135 else { 4136 mddev->ro = 1; 4137 set_disk_ro(mddev->gendisk, 1); 4138 err = do_md_run(mddev); 4139 } 4140 break; 4141 case read_auto: 4142 if (mddev->pers) { 4143 if (mddev->ro == 0) 4144 err = md_set_readonly(mddev, NULL); 4145 else if (mddev->ro == 1) 4146 err = restart_array(mddev); 4147 if (err == 0) { 4148 mddev->ro = 2; 4149 set_disk_ro(mddev->gendisk, 0); 4150 } 4151 } else { 4152 mddev->ro = 2; 4153 
err = do_md_run(mddev); 4154 } 4155 break; 4156 case clean: 4157 if (mddev->pers) { 4158 err = restart_array(mddev); 4159 if (err) 4160 break; 4161 spin_lock(&mddev->lock); 4162 if (!set_in_sync(mddev)) 4163 err = -EBUSY; 4164 spin_unlock(&mddev->lock); 4165 } else 4166 err = -EINVAL; 4167 break; 4168 case active: 4169 if (mddev->pers) { 4170 err = restart_array(mddev); 4171 if (err) 4172 break; 4173 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4174 wake_up(&mddev->sb_wait); 4175 err = 0; 4176 } else { 4177 mddev->ro = 0; 4178 set_disk_ro(mddev->gendisk, 0); 4179 err = do_md_run(mddev); 4180 } 4181 break; 4182 case write_pending: 4183 case active_idle: 4184 /* these cannot be set */ 4185 break; 4186 } 4187 4188 if (!err) { 4189 if (mddev->hold_active == UNTIL_IOCTL) 4190 mddev->hold_active = 0; 4191 sysfs_notify_dirent_safe(mddev->sysfs_state); 4192 } 4193 mddev_unlock(mddev); 4194 return err ?: len; 4195 } 4196 static struct md_sysfs_entry md_array_state = 4197 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4198 4199 static ssize_t 4200 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4201 return sprintf(page, "%d\n", 4202 atomic_read(&mddev->max_corr_read_errors)); 4203 } 4204 4205 static ssize_t 4206 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4207 { 4208 unsigned int n; 4209 int rv; 4210 4211 rv = kstrtouint(buf, 10, &n); 4212 if (rv < 0) 4213 return rv; 4214 atomic_set(&mddev->max_corr_read_errors, n); 4215 return len; 4216 } 4217 4218 static struct md_sysfs_entry max_corr_read_errors = 4219 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4220 max_corrected_read_errors_store); 4221 4222 static ssize_t 4223 null_show(struct mddev *mddev, char *page) 4224 { 4225 return -EINVAL; 4226 } 4227 4228 static ssize_t 4229 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4230 { 4231 /* buf must be %d:%d\n? 
giving major and minor numbers */ 4232 /* The new device is added to the array. 4233 * If the array has a persistent superblock, we read the 4234 * superblock to initialise info and check validity. 4235 * Otherwise, only checking done is that in bind_rdev_to_array, 4236 * which mainly checks size. 4237 */ 4238 char *e; 4239 int major = simple_strtoul(buf, &e, 10); 4240 int minor; 4241 dev_t dev; 4242 struct md_rdev *rdev; 4243 int err; 4244 4245 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4246 return -EINVAL; 4247 minor = simple_strtoul(e+1, &e, 10); 4248 if (*e && *e != '\n') 4249 return -EINVAL; 4250 dev = MKDEV(major, minor); 4251 if (major != MAJOR(dev) || 4252 minor != MINOR(dev)) 4253 return -EOVERFLOW; 4254 4255 flush_workqueue(md_misc_wq); 4256 4257 err = mddev_lock(mddev); 4258 if (err) 4259 return err; 4260 if (mddev->persistent) { 4261 rdev = md_import_device(dev, mddev->major_version, 4262 mddev->minor_version); 4263 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4264 struct md_rdev *rdev0 4265 = list_entry(mddev->disks.next, 4266 struct md_rdev, same_set); 4267 err = super_types[mddev->major_version] 4268 .load_super(rdev, rdev0, mddev->minor_version); 4269 if (err < 0) 4270 goto out; 4271 } 4272 } else if (mddev->external) 4273 rdev = md_import_device(dev, -2, -1); 4274 else 4275 rdev = md_import_device(dev, -1, -1); 4276 4277 if (IS_ERR(rdev)) { 4278 mddev_unlock(mddev); 4279 return PTR_ERR(rdev); 4280 } 4281 err = bind_rdev_to_array(rdev, mddev); 4282 out: 4283 if (err) 4284 export_rdev(rdev); 4285 mddev_unlock(mddev); 4286 return err ? 
err : len; 4287 } 4288 4289 static struct md_sysfs_entry md_new_device = 4290 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4291 4292 static ssize_t 4293 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4294 { 4295 char *end; 4296 unsigned long chunk, end_chunk; 4297 int err; 4298 4299 err = mddev_lock(mddev); 4300 if (err) 4301 return err; 4302 if (!mddev->bitmap) 4303 goto out; 4304 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4305 while (*buf) { 4306 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4307 if (buf == end) break; 4308 if (*end == '-') { /* range */ 4309 buf = end + 1; 4310 end_chunk = simple_strtoul(buf, &end, 0); 4311 if (buf == end) break; 4312 } 4313 if (*end && !isspace(*end)) break; 4314 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4315 buf = skip_spaces(end); 4316 } 4317 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4318 out: 4319 mddev_unlock(mddev); 4320 return len; 4321 } 4322 4323 static struct md_sysfs_entry md_bitmap = 4324 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4325 4326 static ssize_t 4327 size_show(struct mddev *mddev, char *page) 4328 { 4329 return sprintf(page, "%llu\n", 4330 (unsigned long long)mddev->dev_sectors / 2); 4331 } 4332 4333 static int update_size(struct mddev *mddev, sector_t num_sectors); 4334 4335 static ssize_t 4336 size_store(struct mddev *mddev, const char *buf, size_t len) 4337 { 4338 /* If array is inactive, we can reduce the component size, but 4339 * not increase it (except from 0). 
4340 * If array is active, we can try an on-line resize 4341 */ 4342 sector_t sectors; 4343 int err = strict_blocks_to_sectors(buf, §ors); 4344 4345 if (err < 0) 4346 return err; 4347 err = mddev_lock(mddev); 4348 if (err) 4349 return err; 4350 if (mddev->pers) { 4351 err = update_size(mddev, sectors); 4352 if (err == 0) 4353 md_update_sb(mddev, 1); 4354 } else { 4355 if (mddev->dev_sectors == 0 || 4356 mddev->dev_sectors > sectors) 4357 mddev->dev_sectors = sectors; 4358 else 4359 err = -ENOSPC; 4360 } 4361 mddev_unlock(mddev); 4362 return err ? err : len; 4363 } 4364 4365 static struct md_sysfs_entry md_size = 4366 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4367 4368 /* Metadata version. 4369 * This is one of 4370 * 'none' for arrays with no metadata (good luck...) 4371 * 'external' for arrays with externally managed metadata, 4372 * or N.M for internally known formats 4373 */ 4374 static ssize_t 4375 metadata_show(struct mddev *mddev, char *page) 4376 { 4377 if (mddev->persistent) 4378 return sprintf(page, "%d.%d\n", 4379 mddev->major_version, mddev->minor_version); 4380 else if (mddev->external) 4381 return sprintf(page, "external:%s\n", mddev->metadata_type); 4382 else 4383 return sprintf(page, "none\n"); 4384 } 4385 4386 static ssize_t 4387 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4388 { 4389 int major, minor; 4390 char *e; 4391 int err; 4392 /* Changing the details of 'external' metadata is 4393 * always permitted. Otherwise there must be 4394 * no devices attached to the array. 
4395 */ 4396 4397 err = mddev_lock(mddev); 4398 if (err) 4399 return err; 4400 err = -EBUSY; 4401 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4402 ; 4403 else if (!list_empty(&mddev->disks)) 4404 goto out_unlock; 4405 4406 err = 0; 4407 if (cmd_match(buf, "none")) { 4408 mddev->persistent = 0; 4409 mddev->external = 0; 4410 mddev->major_version = 0; 4411 mddev->minor_version = 90; 4412 goto out_unlock; 4413 } 4414 if (strncmp(buf, "external:", 9) == 0) { 4415 size_t namelen = len-9; 4416 if (namelen >= sizeof(mddev->metadata_type)) 4417 namelen = sizeof(mddev->metadata_type)-1; 4418 strncpy(mddev->metadata_type, buf+9, namelen); 4419 mddev->metadata_type[namelen] = 0; 4420 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4421 mddev->metadata_type[--namelen] = 0; 4422 mddev->persistent = 0; 4423 mddev->external = 1; 4424 mddev->major_version = 0; 4425 mddev->minor_version = 90; 4426 goto out_unlock; 4427 } 4428 major = simple_strtoul(buf, &e, 10); 4429 err = -EINVAL; 4430 if (e==buf || *e != '.') 4431 goto out_unlock; 4432 buf = e+1; 4433 minor = simple_strtoul(buf, &e, 10); 4434 if (e==buf || (*e && *e != '\n') ) 4435 goto out_unlock; 4436 err = -ENOENT; 4437 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4438 goto out_unlock; 4439 mddev->major_version = major; 4440 mddev->minor_version = minor; 4441 mddev->persistent = 1; 4442 mddev->external = 0; 4443 err = 0; 4444 out_unlock: 4445 mddev_unlock(mddev); 4446 return err ?: len; 4447 } 4448 4449 static struct md_sysfs_entry md_metadata = 4450 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4451 4452 static ssize_t 4453 action_show(struct mddev *mddev, char *page) 4454 { 4455 char *type = "idle"; 4456 unsigned long recovery = mddev->recovery; 4457 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4458 type = "frozen"; 4459 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4460 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 
4461 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4462 type = "reshape"; 4463 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4464 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4465 type = "resync"; 4466 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4467 type = "check"; 4468 else 4469 type = "repair"; 4470 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4471 type = "recover"; 4472 else if (mddev->reshape_position != MaxSector) 4473 type = "reshape"; 4474 } 4475 return sprintf(page, "%s\n", type); 4476 } 4477 4478 static ssize_t 4479 action_store(struct mddev *mddev, const char *page, size_t len) 4480 { 4481 if (!mddev->pers || !mddev->pers->sync_request) 4482 return -EINVAL; 4483 4484 4485 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4486 if (cmd_match(page, "frozen")) 4487 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4488 else 4489 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4490 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4491 mddev_lock(mddev) == 0) { 4492 flush_workqueue(md_misc_wq); 4493 if (mddev->sync_thread) { 4494 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4495 md_reap_sync_thread(mddev); 4496 } 4497 mddev_unlock(mddev); 4498 } 4499 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4500 return -EBUSY; 4501 else if (cmd_match(page, "resync")) 4502 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4503 else if (cmd_match(page, "recover")) { 4504 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4505 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4506 } else if (cmd_match(page, "reshape")) { 4507 int err; 4508 if (mddev->pers->start_reshape == NULL) 4509 return -EINVAL; 4510 err = mddev_lock(mddev); 4511 if (!err) { 4512 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4513 err = -EBUSY; 4514 else { 4515 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4516 err = mddev->pers->start_reshape(mddev); 4517 } 4518 mddev_unlock(mddev); 4519 } 4520 if (err) 4521 return err; 4522 sysfs_notify(&mddev->kobj, 
NULL, "degraded"); 4523 } else { 4524 if (cmd_match(page, "check")) 4525 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4526 else if (!cmd_match(page, "repair")) 4527 return -EINVAL; 4528 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4529 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4530 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4531 } 4532 if (mddev->ro == 2) { 4533 /* A write to sync_action is enough to justify 4534 * canceling read-auto mode 4535 */ 4536 mddev->ro = 0; 4537 md_wakeup_thread(mddev->sync_thread); 4538 } 4539 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4540 md_wakeup_thread(mddev->thread); 4541 sysfs_notify_dirent_safe(mddev->sysfs_action); 4542 return len; 4543 } 4544 4545 static struct md_sysfs_entry md_scan_mode = 4546 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4547 4548 static ssize_t 4549 last_sync_action_show(struct mddev *mddev, char *page) 4550 { 4551 return sprintf(page, "%s\n", mddev->last_sync_action); 4552 } 4553 4554 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4555 4556 static ssize_t 4557 mismatch_cnt_show(struct mddev *mddev, char *page) 4558 { 4559 return sprintf(page, "%llu\n", 4560 (unsigned long long) 4561 atomic64_read(&mddev->resync_mismatches)); 4562 } 4563 4564 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4565 4566 static ssize_t 4567 sync_min_show(struct mddev *mddev, char *page) 4568 { 4569 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4570 mddev->sync_speed_min ? 
"local": "system"); 4571 } 4572 4573 static ssize_t 4574 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4575 { 4576 unsigned int min; 4577 int rv; 4578 4579 if (strncmp(buf, "system", 6)==0) { 4580 min = 0; 4581 } else { 4582 rv = kstrtouint(buf, 10, &min); 4583 if (rv < 0) 4584 return rv; 4585 if (min == 0) 4586 return -EINVAL; 4587 } 4588 mddev->sync_speed_min = min; 4589 return len; 4590 } 4591 4592 static struct md_sysfs_entry md_sync_min = 4593 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4594 4595 static ssize_t 4596 sync_max_show(struct mddev *mddev, char *page) 4597 { 4598 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4599 mddev->sync_speed_max ? "local": "system"); 4600 } 4601 4602 static ssize_t 4603 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4604 { 4605 unsigned int max; 4606 int rv; 4607 4608 if (strncmp(buf, "system", 6)==0) { 4609 max = 0; 4610 } else { 4611 rv = kstrtouint(buf, 10, &max); 4612 if (rv < 0) 4613 return rv; 4614 if (max == 0) 4615 return -EINVAL; 4616 } 4617 mddev->sync_speed_max = max; 4618 return len; 4619 } 4620 4621 static struct md_sysfs_entry md_sync_max = 4622 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4623 4624 static ssize_t 4625 degraded_show(struct mddev *mddev, char *page) 4626 { 4627 return sprintf(page, "%d\n", mddev->degraded); 4628 } 4629 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4630 4631 static ssize_t 4632 sync_force_parallel_show(struct mddev *mddev, char *page) 4633 { 4634 return sprintf(page, "%d\n", mddev->parallel_resync); 4635 } 4636 4637 static ssize_t 4638 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4639 { 4640 long n; 4641 4642 if (kstrtol(buf, 10, &n)) 4643 return -EINVAL; 4644 4645 if (n != 0 && n != 1) 4646 return -EINVAL; 4647 4648 mddev->parallel_resync = n; 4649 4650 if (mddev->sync_thread) 4651 wake_up(&resync_wait); 4652 4653 return len; 4654 } 
4655 4656 /* force parallel resync, even with shared block devices */ 4657 static struct md_sysfs_entry md_sync_force_parallel = 4658 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4659 sync_force_parallel_show, sync_force_parallel_store); 4660 4661 static ssize_t 4662 sync_speed_show(struct mddev *mddev, char *page) 4663 { 4664 unsigned long resync, dt, db; 4665 if (mddev->curr_resync == 0) 4666 return sprintf(page, "none\n"); 4667 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4668 dt = (jiffies - mddev->resync_mark) / HZ; 4669 if (!dt) dt++; 4670 db = resync - mddev->resync_mark_cnt; 4671 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4672 } 4673 4674 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4675 4676 static ssize_t 4677 sync_completed_show(struct mddev *mddev, char *page) 4678 { 4679 unsigned long long max_sectors, resync; 4680 4681 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4682 return sprintf(page, "none\n"); 4683 4684 if (mddev->curr_resync == 1 || 4685 mddev->curr_resync == 2) 4686 return sprintf(page, "delayed\n"); 4687 4688 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4689 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4690 max_sectors = mddev->resync_max_sectors; 4691 else 4692 max_sectors = mddev->dev_sectors; 4693 4694 resync = mddev->curr_resync_completed; 4695 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4696 } 4697 4698 static struct md_sysfs_entry md_sync_completed = 4699 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 4700 4701 static ssize_t 4702 min_sync_show(struct mddev *mddev, char *page) 4703 { 4704 return sprintf(page, "%llu\n", 4705 (unsigned long long)mddev->resync_min); 4706 } 4707 static ssize_t 4708 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4709 { 4710 unsigned long long min; 4711 int err; 4712 4713 if (kstrtoull(buf, 10, &min)) 4714 return -EINVAL; 4715 4716 spin_lock(&mddev->lock); 4717 err = -EINVAL; 
4718 if (min > mddev->resync_max) 4719 goto out_unlock; 4720 4721 err = -EBUSY; 4722 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4723 goto out_unlock; 4724 4725 /* Round down to multiple of 4K for safety */ 4726 mddev->resync_min = round_down(min, 8); 4727 err = 0; 4728 4729 out_unlock: 4730 spin_unlock(&mddev->lock); 4731 return err ?: len; 4732 } 4733 4734 static struct md_sysfs_entry md_min_sync = 4735 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4736 4737 static ssize_t 4738 max_sync_show(struct mddev *mddev, char *page) 4739 { 4740 if (mddev->resync_max == MaxSector) 4741 return sprintf(page, "max\n"); 4742 else 4743 return sprintf(page, "%llu\n", 4744 (unsigned long long)mddev->resync_max); 4745 } 4746 static ssize_t 4747 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4748 { 4749 int err; 4750 spin_lock(&mddev->lock); 4751 if (strncmp(buf, "max", 3) == 0) 4752 mddev->resync_max = MaxSector; 4753 else { 4754 unsigned long long max; 4755 int chunk; 4756 4757 err = -EINVAL; 4758 if (kstrtoull(buf, 10, &max)) 4759 goto out_unlock; 4760 if (max < mddev->resync_min) 4761 goto out_unlock; 4762 4763 err = -EBUSY; 4764 if (max < mddev->resync_max && 4765 mddev->ro == 0 && 4766 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4767 goto out_unlock; 4768 4769 /* Must be a multiple of chunk_size */ 4770 chunk = mddev->chunk_sectors; 4771 if (chunk) { 4772 sector_t temp = max; 4773 4774 err = -EINVAL; 4775 if (sector_div(temp, chunk)) 4776 goto out_unlock; 4777 } 4778 mddev->resync_max = max; 4779 } 4780 wake_up(&mddev->recovery_wait); 4781 err = 0; 4782 out_unlock: 4783 spin_unlock(&mddev->lock); 4784 return err ?: len; 4785 } 4786 4787 static struct md_sysfs_entry md_max_sync = 4788 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4789 4790 static ssize_t 4791 suspend_lo_show(struct mddev *mddev, char *page) 4792 { 4793 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4794 } 4795 4796 
static ssize_t 4797 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4798 { 4799 unsigned long long old, new; 4800 int err; 4801 4802 err = kstrtoull(buf, 10, &new); 4803 if (err < 0) 4804 return err; 4805 if (new != (sector_t)new) 4806 return -EINVAL; 4807 4808 err = mddev_lock(mddev); 4809 if (err) 4810 return err; 4811 err = -EINVAL; 4812 if (mddev->pers == NULL || 4813 mddev->pers->quiesce == NULL) 4814 goto unlock; 4815 old = mddev->suspend_lo; 4816 mddev->suspend_lo = new; 4817 if (new >= old) 4818 /* Shrinking suspended region */ 4819 mddev->pers->quiesce(mddev, 2); 4820 else { 4821 /* Expanding suspended region - need to wait */ 4822 mddev->pers->quiesce(mddev, 1); 4823 mddev->pers->quiesce(mddev, 0); 4824 } 4825 err = 0; 4826 unlock: 4827 mddev_unlock(mddev); 4828 return err ?: len; 4829 } 4830 static struct md_sysfs_entry md_suspend_lo = 4831 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4832 4833 static ssize_t 4834 suspend_hi_show(struct mddev *mddev, char *page) 4835 { 4836 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4837 } 4838 4839 static ssize_t 4840 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4841 { 4842 unsigned long long old, new; 4843 int err; 4844 4845 err = kstrtoull(buf, 10, &new); 4846 if (err < 0) 4847 return err; 4848 if (new != (sector_t)new) 4849 return -EINVAL; 4850 4851 err = mddev_lock(mddev); 4852 if (err) 4853 return err; 4854 err = -EINVAL; 4855 if (mddev->pers == NULL || 4856 mddev->pers->quiesce == NULL) 4857 goto unlock; 4858 old = mddev->suspend_hi; 4859 mddev->suspend_hi = new; 4860 if (new <= old) 4861 /* Shrinking suspended region */ 4862 mddev->pers->quiesce(mddev, 2); 4863 else { 4864 /* Expanding suspended region - need to wait */ 4865 mddev->pers->quiesce(mddev, 1); 4866 mddev->pers->quiesce(mddev, 0); 4867 } 4868 err = 0; 4869 unlock: 4870 mddev_unlock(mddev); 4871 return err ?: len; 4872 } 4873 static struct md_sysfs_entry 
md_suspend_hi = 4874 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4875 4876 static ssize_t 4877 reshape_position_show(struct mddev *mddev, char *page) 4878 { 4879 if (mddev->reshape_position != MaxSector) 4880 return sprintf(page, "%llu\n", 4881 (unsigned long long)mddev->reshape_position); 4882 strcpy(page, "none\n"); 4883 return 5; 4884 } 4885 4886 static ssize_t 4887 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4888 { 4889 struct md_rdev *rdev; 4890 unsigned long long new; 4891 int err; 4892 4893 err = kstrtoull(buf, 10, &new); 4894 if (err < 0) 4895 return err; 4896 if (new != (sector_t)new) 4897 return -EINVAL; 4898 err = mddev_lock(mddev); 4899 if (err) 4900 return err; 4901 err = -EBUSY; 4902 if (mddev->pers) 4903 goto unlock; 4904 mddev->reshape_position = new; 4905 mddev->delta_disks = 0; 4906 mddev->reshape_backwards = 0; 4907 mddev->new_level = mddev->level; 4908 mddev->new_layout = mddev->layout; 4909 mddev->new_chunk_sectors = mddev->chunk_sectors; 4910 rdev_for_each(rdev, mddev) 4911 rdev->new_data_offset = rdev->data_offset; 4912 err = 0; 4913 unlock: 4914 mddev_unlock(mddev); 4915 return err ?: len; 4916 } 4917 4918 static struct md_sysfs_entry md_reshape_position = 4919 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4920 reshape_position_store); 4921 4922 static ssize_t 4923 reshape_direction_show(struct mddev *mddev, char *page) 4924 { 4925 return sprintf(page, "%s\n", 4926 mddev->reshape_backwards ? 
"backwards" : "forwards"); 4927 } 4928 4929 static ssize_t 4930 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4931 { 4932 int backwards = 0; 4933 int err; 4934 4935 if (cmd_match(buf, "forwards")) 4936 backwards = 0; 4937 else if (cmd_match(buf, "backwards")) 4938 backwards = 1; 4939 else 4940 return -EINVAL; 4941 if (mddev->reshape_backwards == backwards) 4942 return len; 4943 4944 err = mddev_lock(mddev); 4945 if (err) 4946 return err; 4947 /* check if we are allowed to change */ 4948 if (mddev->delta_disks) 4949 err = -EBUSY; 4950 else if (mddev->persistent && 4951 mddev->major_version == 0) 4952 err = -EINVAL; 4953 else 4954 mddev->reshape_backwards = backwards; 4955 mddev_unlock(mddev); 4956 return err ?: len; 4957 } 4958 4959 static struct md_sysfs_entry md_reshape_direction = 4960 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4961 reshape_direction_store); 4962 4963 static ssize_t 4964 array_size_show(struct mddev *mddev, char *page) 4965 { 4966 if (mddev->external_size) 4967 return sprintf(page, "%llu\n", 4968 (unsigned long long)mddev->array_sectors/2); 4969 else 4970 return sprintf(page, "default\n"); 4971 } 4972 4973 static ssize_t 4974 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4975 { 4976 sector_t sectors; 4977 int err; 4978 4979 err = mddev_lock(mddev); 4980 if (err) 4981 return err; 4982 4983 /* cluster raid doesn't support change array_sectors */ 4984 if (mddev_is_clustered(mddev)) { 4985 mddev_unlock(mddev); 4986 return -EINVAL; 4987 } 4988 4989 if (strncmp(buf, "default", 7) == 0) { 4990 if (mddev->pers) 4991 sectors = mddev->pers->size(mddev, 0, 0); 4992 else 4993 sectors = mddev->array_sectors; 4994 4995 mddev->external_size = 0; 4996 } else { 4997 if (strict_blocks_to_sectors(buf, §ors) < 0) 4998 err = -EINVAL; 4999 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5000 err = -E2BIG; 5001 else 5002 mddev->external_size = 1; 5003 } 5004 5005 if (!err) { 5006 
mddev->array_sectors = sectors; 5007 if (mddev->pers) { 5008 set_capacity(mddev->gendisk, mddev->array_sectors); 5009 revalidate_disk(mddev->gendisk); 5010 } 5011 } 5012 mddev_unlock(mddev); 5013 return err ?: len; 5014 } 5015 5016 static struct md_sysfs_entry md_array_size = 5017 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5018 array_size_store); 5019 5020 static ssize_t 5021 consistency_policy_show(struct mddev *mddev, char *page) 5022 { 5023 int ret; 5024 5025 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5026 ret = sprintf(page, "journal\n"); 5027 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5028 ret = sprintf(page, "ppl\n"); 5029 } else if (mddev->bitmap) { 5030 ret = sprintf(page, "bitmap\n"); 5031 } else if (mddev->pers) { 5032 if (mddev->pers->sync_request) 5033 ret = sprintf(page, "resync\n"); 5034 else 5035 ret = sprintf(page, "none\n"); 5036 } else { 5037 ret = sprintf(page, "unknown\n"); 5038 } 5039 5040 return ret; 5041 } 5042 5043 static ssize_t 5044 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5045 { 5046 int err = 0; 5047 5048 if (mddev->pers) { 5049 if (mddev->pers->change_consistency_policy) 5050 err = mddev->pers->change_consistency_policy(mddev, buf); 5051 else 5052 err = -EBUSY; 5053 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5054 set_bit(MD_HAS_PPL, &mddev->flags); 5055 } else { 5056 err = -EINVAL; 5057 } 5058 5059 return err ? 
err : len; 5060 } 5061 5062 static struct md_sysfs_entry md_consistency_policy = 5063 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5064 consistency_policy_store); 5065 5066 static struct attribute *md_default_attrs[] = { 5067 &md_level.attr, 5068 &md_layout.attr, 5069 &md_raid_disks.attr, 5070 &md_chunk_size.attr, 5071 &md_size.attr, 5072 &md_resync_start.attr, 5073 &md_metadata.attr, 5074 &md_new_device.attr, 5075 &md_safe_delay.attr, 5076 &md_array_state.attr, 5077 &md_reshape_position.attr, 5078 &md_reshape_direction.attr, 5079 &md_array_size.attr, 5080 &max_corr_read_errors.attr, 5081 &md_consistency_policy.attr, 5082 NULL, 5083 }; 5084 5085 static struct attribute *md_redundancy_attrs[] = { 5086 &md_scan_mode.attr, 5087 &md_last_scan_mode.attr, 5088 &md_mismatches.attr, 5089 &md_sync_min.attr, 5090 &md_sync_max.attr, 5091 &md_sync_speed.attr, 5092 &md_sync_force_parallel.attr, 5093 &md_sync_completed.attr, 5094 &md_min_sync.attr, 5095 &md_max_sync.attr, 5096 &md_suspend_lo.attr, 5097 &md_suspend_hi.attr, 5098 &md_bitmap.attr, 5099 &md_degraded.attr, 5100 NULL, 5101 }; 5102 static struct attribute_group md_redundancy_group = { 5103 .name = NULL, 5104 .attrs = md_redundancy_attrs, 5105 }; 5106 5107 static ssize_t 5108 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5109 { 5110 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5111 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5112 ssize_t rv; 5113 5114 if (!entry->show) 5115 return -EIO; 5116 spin_lock(&all_mddevs_lock); 5117 if (list_empty(&mddev->all_mddevs)) { 5118 spin_unlock(&all_mddevs_lock); 5119 return -EBUSY; 5120 } 5121 mddev_get(mddev); 5122 spin_unlock(&all_mddevs_lock); 5123 5124 rv = entry->show(mddev, page); 5125 mddev_put(mddev); 5126 return rv; 5127 } 5128 5129 static ssize_t 5130 md_attr_store(struct kobject *kobj, struct attribute *attr, 5131 const char *page, size_t length) 5132 { 5133 struct 
md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5134 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5135 ssize_t rv; 5136 5137 if (!entry->store) 5138 return -EIO; 5139 if (!capable(CAP_SYS_ADMIN)) 5140 return -EACCES; 5141 spin_lock(&all_mddevs_lock); 5142 if (list_empty(&mddev->all_mddevs)) { 5143 spin_unlock(&all_mddevs_lock); 5144 return -EBUSY; 5145 } 5146 mddev_get(mddev); 5147 spin_unlock(&all_mddevs_lock); 5148 rv = entry->store(mddev, page, length); 5149 mddev_put(mddev); 5150 return rv; 5151 } 5152 5153 static void md_free(struct kobject *ko) 5154 { 5155 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5156 5157 if (mddev->sysfs_state) 5158 sysfs_put(mddev->sysfs_state); 5159 5160 if (mddev->queue) 5161 blk_cleanup_queue(mddev->queue); 5162 if (mddev->gendisk) { 5163 del_gendisk(mddev->gendisk); 5164 put_disk(mddev->gendisk); 5165 } 5166 percpu_ref_exit(&mddev->writes_pending); 5167 5168 kfree(mddev); 5169 } 5170 5171 static const struct sysfs_ops md_sysfs_ops = { 5172 .show = md_attr_show, 5173 .store = md_attr_store, 5174 }; 5175 static struct kobj_type md_ktype = { 5176 .release = md_free, 5177 .sysfs_ops = &md_sysfs_ops, 5178 .default_attrs = md_default_attrs, 5179 }; 5180 5181 int mdp_major = 0; 5182 5183 static void mddev_delayed_delete(struct work_struct *ws) 5184 { 5185 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5186 5187 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 5188 kobject_del(&mddev->kobj); 5189 kobject_put(&mddev->kobj); 5190 } 5191 5192 static void no_op(struct percpu_ref *r) {} 5193 5194 int mddev_init_writes_pending(struct mddev *mddev) 5195 { 5196 if (mddev->writes_pending.percpu_count_ptr) 5197 return 0; 5198 if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0) 5199 return -ENOMEM; 5200 /* We want to start with the refcount at zero */ 5201 percpu_ref_put(&mddev->writes_pending); 5202 return 0; 5203 } 5204 
EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5205 5206 static int md_alloc(dev_t dev, char *name) 5207 { 5208 /* 5209 * If dev is zero, name is the name of a device to allocate with 5210 * an arbitrary minor number. It will be "md_???" 5211 * If dev is non-zero it must be a device number with a MAJOR of 5212 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5213 * the device is being created by opening a node in /dev. 5214 * If "name" is not NULL, the device is being created by 5215 * writing to /sys/module/md_mod/parameters/new_array. 5216 */ 5217 static DEFINE_MUTEX(disks_mutex); 5218 struct mddev *mddev = mddev_find(dev); 5219 struct gendisk *disk; 5220 int partitioned; 5221 int shift; 5222 int unit; 5223 int error; 5224 5225 if (!mddev) 5226 return -ENODEV; 5227 5228 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5229 shift = partitioned ? MdpMinorShift : 0; 5230 unit = MINOR(mddev->unit) >> shift; 5231 5232 /* wait for any previous instance of this device to be 5233 * completely removed (mddev_delayed_delete). 5234 */ 5235 flush_workqueue(md_misc_wq); 5236 5237 mutex_lock(&disks_mutex); 5238 error = -EEXIST; 5239 if (mddev->gendisk) 5240 goto abort; 5241 5242 if (name && !dev) { 5243 /* Need to ensure that 'name' is not a duplicate. 5244 */ 5245 struct mddev *mddev2; 5246 spin_lock(&all_mddevs_lock); 5247 5248 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5249 if (mddev2->gendisk && 5250 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5251 spin_unlock(&all_mddevs_lock); 5252 goto abort; 5253 } 5254 spin_unlock(&all_mddevs_lock); 5255 } 5256 if (name && dev) 5257 /* 5258 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 
5259 */ 5260 mddev->hold_active = UNTIL_STOP; 5261 5262 error = -ENOMEM; 5263 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5264 if (!mddev->queue) 5265 goto abort; 5266 mddev->queue->queuedata = mddev; 5267 5268 blk_queue_make_request(mddev->queue, md_make_request); 5269 blk_set_stacking_limits(&mddev->queue->limits); 5270 5271 disk = alloc_disk(1 << shift); 5272 if (!disk) { 5273 blk_cleanup_queue(mddev->queue); 5274 mddev->queue = NULL; 5275 goto abort; 5276 } 5277 disk->major = MAJOR(mddev->unit); 5278 disk->first_minor = unit << shift; 5279 if (name) 5280 strcpy(disk->disk_name, name); 5281 else if (partitioned) 5282 sprintf(disk->disk_name, "md_d%d", unit); 5283 else 5284 sprintf(disk->disk_name, "md%d", unit); 5285 disk->fops = &md_fops; 5286 disk->private_data = mddev; 5287 disk->queue = mddev->queue; 5288 blk_queue_write_cache(mddev->queue, true, true); 5289 /* Allow extended partitions. This makes the 5290 * 'mdp' device redundant, but we can't really 5291 * remove it now. 5292 */ 5293 disk->flags |= GENHD_FL_EXT_DEVT; 5294 mddev->gendisk = disk; 5295 /* As soon as we call add_disk(), another thread could get 5296 * through to md_open, so make sure it doesn't get too far 5297 */ 5298 mutex_lock(&mddev->open_mutex); 5299 add_disk(disk); 5300 5301 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5302 &disk_to_dev(disk)->kobj, "%s", "md"); 5303 if (error) { 5304 /* This isn't possible, but as kobject_init_and_add is marked 5305 * __must_check, we must do something with the result 5306 */ 5307 pr_debug("md: cannot register %s/md - name in use\n", 5308 disk->disk_name); 5309 error = 0; 5310 } 5311 if (mddev->kobj.sd && 5312 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5313 pr_debug("pointless warning\n"); 5314 mutex_unlock(&mddev->open_mutex); 5315 abort: 5316 mutex_unlock(&disks_mutex); 5317 if (!error && mddev->kobj.sd) { 5318 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5319 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 
5320 } 5321 mddev_put(mddev); 5322 return error; 5323 } 5324 5325 static struct kobject *md_probe(dev_t dev, int *part, void *data) 5326 { 5327 if (create_on_open) 5328 md_alloc(dev, NULL); 5329 return NULL; 5330 } 5331 5332 static int add_named_array(const char *val, struct kernel_param *kp) 5333 { 5334 /* 5335 * val must be "md_*" or "mdNNN". 5336 * For "md_*" we allocate an array with a large free minor number, and 5337 * set the name to val. val must not already be an active name. 5338 * For "mdNNN" we allocate an array with the minor number NNN 5339 * which must not already be in use. 5340 */ 5341 int len = strlen(val); 5342 char buf[DISK_NAME_LEN]; 5343 unsigned long devnum; 5344 5345 while (len && val[len-1] == '\n') 5346 len--; 5347 if (len >= DISK_NAME_LEN) 5348 return -E2BIG; 5349 strlcpy(buf, val, len+1); 5350 if (strncmp(buf, "md_", 3) == 0) 5351 return md_alloc(0, buf); 5352 if (strncmp(buf, "md", 2) == 0 && 5353 isdigit(buf[2]) && 5354 kstrtoul(buf+2, 10, &devnum) == 0 && 5355 devnum <= MINORMASK) 5356 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); 5357 5358 return -EINVAL; 5359 } 5360 5361 static void md_safemode_timeout(unsigned long data) 5362 { 5363 struct mddev *mddev = (struct mddev *) data; 5364 5365 mddev->safemode = 1; 5366 if (mddev->external) 5367 sysfs_notify_dirent_safe(mddev->sysfs_state); 5368 5369 md_wakeup_thread(mddev->thread); 5370 } 5371 5372 static int start_dirty_degraded; 5373 5374 int md_run(struct mddev *mddev) 5375 { 5376 int err; 5377 struct md_rdev *rdev; 5378 struct md_personality *pers; 5379 5380 if (list_empty(&mddev->disks)) 5381 /* cannot run an array with no devices.. 
*/ 5382 return -EINVAL; 5383 5384 if (mddev->pers) 5385 return -EBUSY; 5386 /* Cannot run until previous stop completes properly */ 5387 if (mddev->sysfs_active) 5388 return -EBUSY; 5389 5390 /* 5391 * Analyze all RAID superblock(s) 5392 */ 5393 if (!mddev->raid_disks) { 5394 if (!mddev->persistent) 5395 return -EINVAL; 5396 analyze_sbs(mddev); 5397 } 5398 5399 if (mddev->level != LEVEL_NONE) 5400 request_module("md-level-%d", mddev->level); 5401 else if (mddev->clevel[0]) 5402 request_module("md-%s", mddev->clevel); 5403 5404 /* 5405 * Drop all container device buffers, from now on 5406 * the only valid external interface is through the md 5407 * device. 5408 */ 5409 rdev_for_each(rdev, mddev) { 5410 if (test_bit(Faulty, &rdev->flags)) 5411 continue; 5412 sync_blockdev(rdev->bdev); 5413 invalidate_bdev(rdev->bdev); 5414 if (mddev->ro != 1 && 5415 (bdev_read_only(rdev->bdev) || 5416 bdev_read_only(rdev->meta_bdev))) { 5417 mddev->ro = 1; 5418 if (mddev->gendisk) 5419 set_disk_ro(mddev->gendisk, 1); 5420 } 5421 5422 /* perform some consistency tests on the device. 5423 * We don't want the data to overlap the metadata, 5424 * Internal Bitmap issues have been handled elsewhere. 
5425 */ 5426 if (rdev->meta_bdev) { 5427 /* Nothing to check */; 5428 } else if (rdev->data_offset < rdev->sb_start) { 5429 if (mddev->dev_sectors && 5430 rdev->data_offset + mddev->dev_sectors 5431 > rdev->sb_start) { 5432 pr_warn("md: %s: data overlaps metadata\n", 5433 mdname(mddev)); 5434 return -EINVAL; 5435 } 5436 } else { 5437 if (rdev->sb_start + rdev->sb_size/512 5438 > rdev->data_offset) { 5439 pr_warn("md: %s: metadata overlaps data\n", 5440 mdname(mddev)); 5441 return -EINVAL; 5442 } 5443 } 5444 sysfs_notify_dirent_safe(rdev->sysfs_state); 5445 } 5446 5447 if (mddev->bio_set == NULL) { 5448 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5449 if (!mddev->bio_set) 5450 return -ENOMEM; 5451 } 5452 if (mddev->sync_set == NULL) { 5453 mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5454 if (!mddev->sync_set) 5455 return -ENOMEM; 5456 } 5457 5458 spin_lock(&pers_lock); 5459 pers = find_pers(mddev->level, mddev->clevel); 5460 if (!pers || !try_module_get(pers->owner)) { 5461 spin_unlock(&pers_lock); 5462 if (mddev->level != LEVEL_NONE) 5463 pr_warn("md: personality for level %d is not loaded!\n", 5464 mddev->level); 5465 else 5466 pr_warn("md: personality for level %s is not loaded!\n", 5467 mddev->clevel); 5468 return -EINVAL; 5469 } 5470 spin_unlock(&pers_lock); 5471 if (mddev->level != pers->level) { 5472 mddev->level = pers->level; 5473 mddev->new_level = pers->level; 5474 } 5475 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5476 5477 if (mddev->reshape_position != MaxSector && 5478 pers->start_reshape == NULL) { 5479 /* This personality cannot handle reshaping... */ 5480 module_put(pers->owner); 5481 return -EINVAL; 5482 } 5483 5484 if (pers->sync_request) { 5485 /* Warn if this is a potentially silly 5486 * configuration. 
5487 */ 5488 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5489 struct md_rdev *rdev2; 5490 int warned = 0; 5491 5492 rdev_for_each(rdev, mddev) 5493 rdev_for_each(rdev2, mddev) { 5494 if (rdev < rdev2 && 5495 rdev->bdev->bd_contains == 5496 rdev2->bdev->bd_contains) { 5497 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", 5498 mdname(mddev), 5499 bdevname(rdev->bdev,b), 5500 bdevname(rdev2->bdev,b2)); 5501 warned = 1; 5502 } 5503 } 5504 5505 if (warned) 5506 pr_warn("True protection against single-disk failure might be compromised.\n"); 5507 } 5508 5509 mddev->recovery = 0; 5510 /* may be over-ridden by personality */ 5511 mddev->resync_max_sectors = mddev->dev_sectors; 5512 5513 mddev->ok_start_degraded = start_dirty_degraded; 5514 5515 if (start_readonly && mddev->ro == 0) 5516 mddev->ro = 2; /* read-only, but switch on first write */ 5517 5518 /* 5519 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes 5520 * up mddev->thread. It is important to initialize critical 5521 * resources for mddev->thread BEFORE calling pers->run(). 
5522 */ 5523 err = pers->run(mddev); 5524 if (err) 5525 pr_warn("md: pers->run() failed ...\n"); 5526 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5527 WARN_ONCE(!mddev->external_size, 5528 "%s: default size too small, but 'external_size' not in effect?\n", 5529 __func__); 5530 pr_warn("md: invalid array_size %llu > default size %llu\n", 5531 (unsigned long long)mddev->array_sectors / 2, 5532 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5533 err = -EINVAL; 5534 } 5535 if (err == 0 && pers->sync_request && 5536 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5537 struct bitmap *bitmap; 5538 5539 bitmap = bitmap_create(mddev, -1); 5540 if (IS_ERR(bitmap)) { 5541 err = PTR_ERR(bitmap); 5542 pr_warn("%s: failed to create bitmap (%d)\n", 5543 mdname(mddev), err); 5544 } else 5545 mddev->bitmap = bitmap; 5546 5547 } 5548 if (err) { 5549 mddev_detach(mddev); 5550 if (mddev->private) 5551 pers->free(mddev, mddev->private); 5552 mddev->private = NULL; 5553 module_put(pers->owner); 5554 bitmap_destroy(mddev); 5555 return err; 5556 } 5557 if (mddev->queue) { 5558 bool nonrot = true; 5559 5560 rdev_for_each(rdev, mddev) { 5561 if (rdev->raid_disk >= 0 && 5562 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 5563 nonrot = false; 5564 break; 5565 } 5566 } 5567 if (mddev->degraded) 5568 nonrot = false; 5569 if (nonrot) 5570 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5571 else 5572 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5573 mddev->queue->backing_dev_info->congested_data = mddev; 5574 mddev->queue->backing_dev_info->congested_fn = md_congested; 5575 } 5576 if (pers->sync_request) { 5577 if (mddev->kobj.sd && 5578 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5579 pr_warn("md: cannot register extra attributes for %s\n", 5580 mdname(mddev)); 5581 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5582 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5583 mddev->ro = 0; 
5584 5585 atomic_set(&mddev->max_corr_read_errors, 5586 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5587 mddev->safemode = 0; 5588 if (mddev_is_clustered(mddev)) 5589 mddev->safemode_delay = 0; 5590 else 5591 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5592 mddev->in_sync = 1; 5593 smp_wmb(); 5594 spin_lock(&mddev->lock); 5595 mddev->pers = pers; 5596 spin_unlock(&mddev->lock); 5597 rdev_for_each(rdev, mddev) 5598 if (rdev->raid_disk >= 0) 5599 if (sysfs_link_rdev(mddev, rdev)) 5600 /* failure here is OK */; 5601 5602 if (mddev->degraded && !mddev->ro) 5603 /* This ensures that recovering status is reported immediately 5604 * via sysfs - until a lack of spares is confirmed. 5605 */ 5606 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5607 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5608 5609 if (mddev->sb_flags) 5610 md_update_sb(mddev, 0); 5611 5612 md_new_event(mddev); 5613 sysfs_notify_dirent_safe(mddev->sysfs_state); 5614 sysfs_notify_dirent_safe(mddev->sysfs_action); 5615 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5616 return 0; 5617 } 5618 EXPORT_SYMBOL_GPL(md_run); 5619 5620 static int do_md_run(struct mddev *mddev) 5621 { 5622 int err; 5623 5624 err = md_run(mddev); 5625 if (err) 5626 goto out; 5627 err = bitmap_load(mddev); 5628 if (err) { 5629 bitmap_destroy(mddev); 5630 goto out; 5631 } 5632 5633 if (mddev_is_clustered(mddev)) 5634 md_allow_write(mddev); 5635 5636 md_wakeup_thread(mddev->thread); 5637 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5638 5639 set_capacity(mddev->gendisk, mddev->array_sectors); 5640 revalidate_disk(mddev->gendisk); 5641 mddev->changed = 1; 5642 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5643 out: 5644 return err; 5645 } 5646 5647 static int restart_array(struct mddev *mddev) 5648 { 5649 struct gendisk *disk = mddev->gendisk; 5650 struct md_rdev *rdev; 5651 bool has_journal = false; 5652 bool has_readonly = false; 5653 5654 /* Complain if it has no devices 
*/ 5655 if (list_empty(&mddev->disks)) 5656 return -ENXIO; 5657 if (!mddev->pers) 5658 return -EINVAL; 5659 if (!mddev->ro) 5660 return -EBUSY; 5661 5662 rcu_read_lock(); 5663 rdev_for_each_rcu(rdev, mddev) { 5664 if (test_bit(Journal, &rdev->flags) && 5665 !test_bit(Faulty, &rdev->flags)) 5666 has_journal = true; 5667 if (bdev_read_only(rdev->bdev)) 5668 has_readonly = true; 5669 } 5670 rcu_read_unlock(); 5671 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 5672 /* Don't restart rw with journal missing/faulty */ 5673 return -EINVAL; 5674 if (has_readonly) 5675 return -EROFS; 5676 5677 mddev->safemode = 0; 5678 mddev->ro = 0; 5679 set_disk_ro(disk, 0); 5680 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 5681 /* Kick recovery or resync if necessary */ 5682 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5683 md_wakeup_thread(mddev->thread); 5684 md_wakeup_thread(mddev->sync_thread); 5685 sysfs_notify_dirent_safe(mddev->sysfs_state); 5686 return 0; 5687 } 5688 5689 static void md_clean(struct mddev *mddev) 5690 { 5691 mddev->array_sectors = 0; 5692 mddev->external_size = 0; 5693 mddev->dev_sectors = 0; 5694 mddev->raid_disks = 0; 5695 mddev->recovery_cp = 0; 5696 mddev->resync_min = 0; 5697 mddev->resync_max = MaxSector; 5698 mddev->reshape_position = MaxSector; 5699 mddev->external = 0; 5700 mddev->persistent = 0; 5701 mddev->level = LEVEL_NONE; 5702 mddev->clevel[0] = 0; 5703 mddev->flags = 0; 5704 mddev->sb_flags = 0; 5705 mddev->ro = 0; 5706 mddev->metadata_type[0] = 0; 5707 mddev->chunk_sectors = 0; 5708 mddev->ctime = mddev->utime = 0; 5709 mddev->layout = 0; 5710 mddev->max_disks = 0; 5711 mddev->events = 0; 5712 mddev->can_decrease_events = 0; 5713 mddev->delta_disks = 0; 5714 mddev->reshape_backwards = 0; 5715 mddev->new_level = LEVEL_NONE; 5716 mddev->new_layout = 0; 5717 mddev->new_chunk_sectors = 0; 5718 mddev->curr_resync = 0; 5719 atomic64_set(&mddev->resync_mismatches, 0); 5720 mddev->suspend_lo = mddev->suspend_hi = 0; 
5721 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5722 mddev->recovery = 0; 5723 mddev->in_sync = 0; 5724 mddev->changed = 0; 5725 mddev->degraded = 0; 5726 mddev->safemode = 0; 5727 mddev->private = NULL; 5728 mddev->cluster_info = NULL; 5729 mddev->bitmap_info.offset = 0; 5730 mddev->bitmap_info.default_offset = 0; 5731 mddev->bitmap_info.default_space = 0; 5732 mddev->bitmap_info.chunksize = 0; 5733 mddev->bitmap_info.daemon_sleep = 0; 5734 mddev->bitmap_info.max_write_behind = 0; 5735 mddev->bitmap_info.nodes = 0; 5736 } 5737 5738 static void __md_stop_writes(struct mddev *mddev) 5739 { 5740 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5741 flush_workqueue(md_misc_wq); 5742 if (mddev->sync_thread) { 5743 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5744 md_reap_sync_thread(mddev); 5745 } 5746 5747 del_timer_sync(&mddev->safemode_timer); 5748 5749 if (mddev->pers && mddev->pers->quiesce) { 5750 mddev->pers->quiesce(mddev, 1); 5751 mddev->pers->quiesce(mddev, 0); 5752 } 5753 bitmap_flush(mddev); 5754 5755 if (mddev->ro == 0 && 5756 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5757 mddev->sb_flags)) { 5758 /* mark array as shutdown cleanly */ 5759 if (!mddev_is_clustered(mddev)) 5760 mddev->in_sync = 1; 5761 md_update_sb(mddev, 1); 5762 } 5763 } 5764 5765 void md_stop_writes(struct mddev *mddev) 5766 { 5767 mddev_lock_nointr(mddev); 5768 __md_stop_writes(mddev); 5769 mddev_unlock(mddev); 5770 } 5771 EXPORT_SYMBOL_GPL(md_stop_writes); 5772 5773 static void mddev_detach(struct mddev *mddev) 5774 { 5775 bitmap_wait_behind_writes(mddev); 5776 if (mddev->pers && mddev->pers->quiesce) { 5777 mddev->pers->quiesce(mddev, 1); 5778 mddev->pers->quiesce(mddev, 0); 5779 } 5780 md_unregister_thread(&mddev->thread); 5781 if (mddev->queue) 5782 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5783 } 5784 5785 static void __md_stop(struct mddev *mddev) 5786 { 5787 struct md_personality *pers = mddev->pers; 5788 bitmap_destroy(mddev); 5789 
mddev_detach(mddev); 5790 /* Ensure ->event_work is done */ 5791 flush_workqueue(md_misc_wq); 5792 spin_lock(&mddev->lock); 5793 mddev->pers = NULL; 5794 spin_unlock(&mddev->lock); 5795 pers->free(mddev, mddev->private); 5796 mddev->private = NULL; 5797 if (pers->sync_request && mddev->to_remove == NULL) 5798 mddev->to_remove = &md_redundancy_group; 5799 module_put(pers->owner); 5800 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5801 } 5802 5803 void md_stop(struct mddev *mddev) 5804 { 5805 /* stop the array and free an attached data structures. 5806 * This is called from dm-raid 5807 */ 5808 __md_stop(mddev); 5809 if (mddev->bio_set) 5810 bioset_free(mddev->bio_set); 5811 } 5812 5813 EXPORT_SYMBOL_GPL(md_stop); 5814 5815 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5816 { 5817 int err = 0; 5818 int did_freeze = 0; 5819 5820 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5821 did_freeze = 1; 5822 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5823 md_wakeup_thread(mddev->thread); 5824 } 5825 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5826 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5827 if (mddev->sync_thread) 5828 /* Thread might be blocked waiting for metadata update 5829 * which will now never happen */ 5830 wake_up_process(mddev->sync_thread->tsk); 5831 5832 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 5833 return -EBUSY; 5834 mddev_unlock(mddev); 5835 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5836 &mddev->recovery)); 5837 wait_event(mddev->sb_wait, 5838 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 5839 mddev_lock_nointr(mddev); 5840 5841 mutex_lock(&mddev->open_mutex); 5842 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5843 mddev->sync_thread || 5844 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5845 pr_warn("md: %s still in use.\n",mdname(mddev)); 5846 if (did_freeze) { 5847 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5848 
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5849 md_wakeup_thread(mddev->thread); 5850 } 5851 err = -EBUSY; 5852 goto out; 5853 } 5854 if (mddev->pers) { 5855 __md_stop_writes(mddev); 5856 5857 err = -ENXIO; 5858 if (mddev->ro==1) 5859 goto out; 5860 mddev->ro = 1; 5861 set_disk_ro(mddev->gendisk, 1); 5862 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5863 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5864 md_wakeup_thread(mddev->thread); 5865 sysfs_notify_dirent_safe(mddev->sysfs_state); 5866 err = 0; 5867 } 5868 out: 5869 mutex_unlock(&mddev->open_mutex); 5870 return err; 5871 } 5872 5873 /* mode: 5874 * 0 - completely stop and dis-assemble array 5875 * 2 - stop but do not disassemble array 5876 */ 5877 static int do_md_stop(struct mddev *mddev, int mode, 5878 struct block_device *bdev) 5879 { 5880 struct gendisk *disk = mddev->gendisk; 5881 struct md_rdev *rdev; 5882 int did_freeze = 0; 5883 5884 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5885 did_freeze = 1; 5886 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5887 md_wakeup_thread(mddev->thread); 5888 } 5889 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5890 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5891 if (mddev->sync_thread) 5892 /* Thread might be blocked waiting for metadata update 5893 * which will now never happen */ 5894 wake_up_process(mddev->sync_thread->tsk); 5895 5896 mddev_unlock(mddev); 5897 wait_event(resync_wait, (mddev->sync_thread == NULL && 5898 !test_bit(MD_RECOVERY_RUNNING, 5899 &mddev->recovery))); 5900 mddev_lock_nointr(mddev); 5901 5902 mutex_lock(&mddev->open_mutex); 5903 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5904 mddev->sysfs_active || 5905 mddev->sync_thread || 5906 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5907 pr_warn("md: %s still in use.\n",mdname(mddev)); 5908 mutex_unlock(&mddev->open_mutex); 5909 if (did_freeze) { 5910 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5911 set_bit(MD_RECOVERY_NEEDED, 
&mddev->recovery); 5912 md_wakeup_thread(mddev->thread); 5913 } 5914 return -EBUSY; 5915 } 5916 if (mddev->pers) { 5917 if (mddev->ro) 5918 set_disk_ro(disk, 0); 5919 5920 __md_stop_writes(mddev); 5921 __md_stop(mddev); 5922 mddev->queue->backing_dev_info->congested_fn = NULL; 5923 5924 /* tell userspace to handle 'inactive' */ 5925 sysfs_notify_dirent_safe(mddev->sysfs_state); 5926 5927 rdev_for_each(rdev, mddev) 5928 if (rdev->raid_disk >= 0) 5929 sysfs_unlink_rdev(mddev, rdev); 5930 5931 set_capacity(disk, 0); 5932 mutex_unlock(&mddev->open_mutex); 5933 mddev->changed = 1; 5934 revalidate_disk(disk); 5935 5936 if (mddev->ro) 5937 mddev->ro = 0; 5938 } else 5939 mutex_unlock(&mddev->open_mutex); 5940 /* 5941 * Free resources if final stop 5942 */ 5943 if (mode == 0) { 5944 pr_info("md: %s stopped.\n", mdname(mddev)); 5945 5946 if (mddev->bitmap_info.file) { 5947 struct file *f = mddev->bitmap_info.file; 5948 spin_lock(&mddev->lock); 5949 mddev->bitmap_info.file = NULL; 5950 spin_unlock(&mddev->lock); 5951 fput(f); 5952 } 5953 mddev->bitmap_info.offset = 0; 5954 5955 export_array(mddev); 5956 5957 md_clean(mddev); 5958 if (mddev->hold_active == UNTIL_STOP) 5959 mddev->hold_active = 0; 5960 } 5961 md_new_event(mddev); 5962 sysfs_notify_dirent_safe(mddev->sysfs_state); 5963 return 0; 5964 } 5965 5966 #ifndef MODULE 5967 static void autorun_array(struct mddev *mddev) 5968 { 5969 struct md_rdev *rdev; 5970 int err; 5971 5972 if (list_empty(&mddev->disks)) 5973 return; 5974 5975 pr_info("md: running: "); 5976 5977 rdev_for_each(rdev, mddev) { 5978 char b[BDEVNAME_SIZE]; 5979 pr_cont("<%s>", bdevname(rdev->bdev,b)); 5980 } 5981 pr_cont("\n"); 5982 5983 err = do_md_run(mddev); 5984 if (err) { 5985 pr_warn("md: do_md_run() returned %d\n", err); 5986 do_md_stop(mddev, 0, NULL); 5987 } 5988 } 5989 5990 /* 5991 * lets try to run arrays based on all disks that have arrived 5992 * until now. 
(those are in pending_raid_disks) 5993 * 5994 * the method: pick the first pending disk, collect all disks with 5995 * the same UUID, remove all from the pending list and put them into 5996 * the 'same_array' list. Then order this list based on superblock 5997 * update time (freshest comes first), kick out 'old' disks and 5998 * compare superblocks. If everything's fine then run it. 5999 * 6000 * If "unit" is allocated, then bump its reference count 6001 */ 6002 static void autorun_devices(int part) 6003 { 6004 struct md_rdev *rdev0, *rdev, *tmp; 6005 struct mddev *mddev; 6006 char b[BDEVNAME_SIZE]; 6007 6008 pr_info("md: autorun ...\n"); 6009 while (!list_empty(&pending_raid_disks)) { 6010 int unit; 6011 dev_t dev; 6012 LIST_HEAD(candidates); 6013 rdev0 = list_entry(pending_raid_disks.next, 6014 struct md_rdev, same_set); 6015 6016 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); 6017 INIT_LIST_HEAD(&candidates); 6018 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6019 if (super_90_load(rdev, rdev0, 0) >= 0) { 6020 pr_debug("md: adding %s ...\n", 6021 bdevname(rdev->bdev,b)); 6022 list_move(&rdev->same_set, &candidates); 6023 } 6024 /* 6025 * now we have a set of devices, with all of them having 6026 * mostly sane superblocks. It's time to allocate the 6027 * mddev. 
6028 */ 6029 if (part) { 6030 dev = MKDEV(mdp_major, 6031 rdev0->preferred_minor << MdpMinorShift); 6032 unit = MINOR(dev) >> MdpMinorShift; 6033 } else { 6034 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6035 unit = MINOR(dev); 6036 } 6037 if (rdev0->preferred_minor != unit) { 6038 pr_warn("md: unit number in %s is bad: %d\n", 6039 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 6040 break; 6041 } 6042 6043 md_probe(dev, NULL, NULL); 6044 mddev = mddev_find(dev); 6045 if (!mddev || !mddev->gendisk) { 6046 if (mddev) 6047 mddev_put(mddev); 6048 break; 6049 } 6050 if (mddev_lock(mddev)) 6051 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6052 else if (mddev->raid_disks || mddev->major_version 6053 || !list_empty(&mddev->disks)) { 6054 pr_warn("md: %s already running, cannot run %s\n", 6055 mdname(mddev), bdevname(rdev0->bdev,b)); 6056 mddev_unlock(mddev); 6057 } else { 6058 pr_debug("md: created %s\n", mdname(mddev)); 6059 mddev->persistent = 1; 6060 rdev_for_each_list(rdev, tmp, &candidates) { 6061 list_del_init(&rdev->same_set); 6062 if (bind_rdev_to_array(rdev, mddev)) 6063 export_rdev(rdev); 6064 } 6065 autorun_array(mddev); 6066 mddev_unlock(mddev); 6067 } 6068 /* on success, candidates will be empty, on error 6069 * it won't... 6070 */ 6071 rdev_for_each_list(rdev, tmp, &candidates) { 6072 list_del_init(&rdev->same_set); 6073 export_rdev(rdev); 6074 } 6075 mddev_put(mddev); 6076 } 6077 pr_info("md: ... 
autorun DONE.\n"); 6078 } 6079 #endif /* !MODULE */ 6080 6081 static int get_version(void __user *arg) 6082 { 6083 mdu_version_t ver; 6084 6085 ver.major = MD_MAJOR_VERSION; 6086 ver.minor = MD_MINOR_VERSION; 6087 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6088 6089 if (copy_to_user(arg, &ver, sizeof(ver))) 6090 return -EFAULT; 6091 6092 return 0; 6093 } 6094 6095 static int get_array_info(struct mddev *mddev, void __user *arg) 6096 { 6097 mdu_array_info_t info; 6098 int nr,working,insync,failed,spare; 6099 struct md_rdev *rdev; 6100 6101 nr = working = insync = failed = spare = 0; 6102 rcu_read_lock(); 6103 rdev_for_each_rcu(rdev, mddev) { 6104 nr++; 6105 if (test_bit(Faulty, &rdev->flags)) 6106 failed++; 6107 else { 6108 working++; 6109 if (test_bit(In_sync, &rdev->flags)) 6110 insync++; 6111 else if (test_bit(Journal, &rdev->flags)) 6112 /* TODO: add journal count to md_u.h */ 6113 ; 6114 else 6115 spare++; 6116 } 6117 } 6118 rcu_read_unlock(); 6119 6120 info.major_version = mddev->major_version; 6121 info.minor_version = mddev->minor_version; 6122 info.patch_version = MD_PATCHLEVEL_VERSION; 6123 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6124 info.level = mddev->level; 6125 info.size = mddev->dev_sectors / 2; 6126 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6127 info.size = -1; 6128 info.nr_disks = nr; 6129 info.raid_disks = mddev->raid_disks; 6130 info.md_minor = mddev->md_minor; 6131 info.not_persistent= !mddev->persistent; 6132 6133 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6134 info.state = 0; 6135 if (mddev->in_sync) 6136 info.state = (1<<MD_SB_CLEAN); 6137 if (mddev->bitmap && mddev->bitmap_info.offset) 6138 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6139 if (mddev_is_clustered(mddev)) 6140 info.state |= (1<<MD_SB_CLUSTERED); 6141 info.active_disks = insync; 6142 info.working_disks = working; 6143 info.failed_disks = failed; 6144 info.spare_disks = spare; 6145 6146 info.layout = mddev->layout; 6147 info.chunk_size = 
mddev->chunk_sectors << 9; 6148 6149 if (copy_to_user(arg, &info, sizeof(info))) 6150 return -EFAULT; 6151 6152 return 0; 6153 } 6154 6155 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6156 { 6157 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6158 char *ptr; 6159 int err; 6160 6161 file = kzalloc(sizeof(*file), GFP_NOIO); 6162 if (!file) 6163 return -ENOMEM; 6164 6165 err = 0; 6166 spin_lock(&mddev->lock); 6167 /* bitmap enabled */ 6168 if (mddev->bitmap_info.file) { 6169 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6170 sizeof(file->pathname)); 6171 if (IS_ERR(ptr)) 6172 err = PTR_ERR(ptr); 6173 else 6174 memmove(file->pathname, ptr, 6175 sizeof(file->pathname)-(ptr-file->pathname)); 6176 } 6177 spin_unlock(&mddev->lock); 6178 6179 if (err == 0 && 6180 copy_to_user(arg, file, sizeof(*file))) 6181 err = -EFAULT; 6182 6183 kfree(file); 6184 return err; 6185 } 6186 6187 static int get_disk_info(struct mddev *mddev, void __user * arg) 6188 { 6189 mdu_disk_info_t info; 6190 struct md_rdev *rdev; 6191 6192 if (copy_from_user(&info, arg, sizeof(info))) 6193 return -EFAULT; 6194 6195 rcu_read_lock(); 6196 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6197 if (rdev) { 6198 info.major = MAJOR(rdev->bdev->bd_dev); 6199 info.minor = MINOR(rdev->bdev->bd_dev); 6200 info.raid_disk = rdev->raid_disk; 6201 info.state = 0; 6202 if (test_bit(Faulty, &rdev->flags)) 6203 info.state |= (1<<MD_DISK_FAULTY); 6204 else if (test_bit(In_sync, &rdev->flags)) { 6205 info.state |= (1<<MD_DISK_ACTIVE); 6206 info.state |= (1<<MD_DISK_SYNC); 6207 } 6208 if (test_bit(Journal, &rdev->flags)) 6209 info.state |= (1<<MD_DISK_JOURNAL); 6210 if (test_bit(WriteMostly, &rdev->flags)) 6211 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6212 if (test_bit(FailFast, &rdev->flags)) 6213 info.state |= (1<<MD_DISK_FAILFAST); 6214 } else { 6215 info.major = info.minor = 0; 6216 info.raid_disk = -1; 6217 info.state = (1<<MD_DISK_REMOVED); 6218 } 6219 
rcu_read_unlock(); 6220 6221 if (copy_to_user(arg, &info, sizeof(info))) 6222 return -EFAULT; 6223 6224 return 0; 6225 } 6226 6227 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 6228 { 6229 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 6230 struct md_rdev *rdev; 6231 dev_t dev = MKDEV(info->major,info->minor); 6232 6233 if (mddev_is_clustered(mddev) && 6234 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6235 pr_warn("%s: Cannot add to clustered mddev.\n", 6236 mdname(mddev)); 6237 return -EINVAL; 6238 } 6239 6240 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6241 return -EOVERFLOW; 6242 6243 if (!mddev->raid_disks) { 6244 int err; 6245 /* expecting a device which has a superblock */ 6246 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6247 if (IS_ERR(rdev)) { 6248 pr_warn("md: md_import_device returned %ld\n", 6249 PTR_ERR(rdev)); 6250 return PTR_ERR(rdev); 6251 } 6252 if (!list_empty(&mddev->disks)) { 6253 struct md_rdev *rdev0 6254 = list_entry(mddev->disks.next, 6255 struct md_rdev, same_set); 6256 err = super_types[mddev->major_version] 6257 .load_super(rdev, rdev0, mddev->minor_version); 6258 if (err < 0) { 6259 pr_warn("md: %s has different UUID to %s\n", 6260 bdevname(rdev->bdev,b), 6261 bdevname(rdev0->bdev,b2)); 6262 export_rdev(rdev); 6263 return -EINVAL; 6264 } 6265 } 6266 err = bind_rdev_to_array(rdev, mddev); 6267 if (err) 6268 export_rdev(rdev); 6269 return err; 6270 } 6271 6272 /* 6273 * add_new_disk can be used once the array is assembled 6274 * to add "hot spares". 
They must already have a superblock 6275 * written 6276 */ 6277 if (mddev->pers) { 6278 int err; 6279 if (!mddev->pers->hot_add_disk) { 6280 pr_warn("%s: personality does not support diskops!\n", 6281 mdname(mddev)); 6282 return -EINVAL; 6283 } 6284 if (mddev->persistent) 6285 rdev = md_import_device(dev, mddev->major_version, 6286 mddev->minor_version); 6287 else 6288 rdev = md_import_device(dev, -1, -1); 6289 if (IS_ERR(rdev)) { 6290 pr_warn("md: md_import_device returned %ld\n", 6291 PTR_ERR(rdev)); 6292 return PTR_ERR(rdev); 6293 } 6294 /* set saved_raid_disk if appropriate */ 6295 if (!mddev->persistent) { 6296 if (info->state & (1<<MD_DISK_SYNC) && 6297 info->raid_disk < mddev->raid_disks) { 6298 rdev->raid_disk = info->raid_disk; 6299 set_bit(In_sync, &rdev->flags); 6300 clear_bit(Bitmap_sync, &rdev->flags); 6301 } else 6302 rdev->raid_disk = -1; 6303 rdev->saved_raid_disk = rdev->raid_disk; 6304 } else 6305 super_types[mddev->major_version]. 6306 validate_super(mddev, rdev); 6307 if ((info->state & (1<<MD_DISK_SYNC)) && 6308 rdev->raid_disk != info->raid_disk) { 6309 /* This was a hot-add request, but events doesn't 6310 * match, so reject it. 
6311 */ 6312 export_rdev(rdev); 6313 return -EINVAL; 6314 } 6315 6316 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6317 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6318 set_bit(WriteMostly, &rdev->flags); 6319 else 6320 clear_bit(WriteMostly, &rdev->flags); 6321 if (info->state & (1<<MD_DISK_FAILFAST)) 6322 set_bit(FailFast, &rdev->flags); 6323 else 6324 clear_bit(FailFast, &rdev->flags); 6325 6326 if (info->state & (1<<MD_DISK_JOURNAL)) { 6327 struct md_rdev *rdev2; 6328 bool has_journal = false; 6329 6330 /* make sure no existing journal disk */ 6331 rdev_for_each(rdev2, mddev) { 6332 if (test_bit(Journal, &rdev2->flags)) { 6333 has_journal = true; 6334 break; 6335 } 6336 } 6337 if (has_journal) { 6338 export_rdev(rdev); 6339 return -EBUSY; 6340 } 6341 set_bit(Journal, &rdev->flags); 6342 } 6343 /* 6344 * check whether the device shows up in other nodes 6345 */ 6346 if (mddev_is_clustered(mddev)) { 6347 if (info->state & (1 << MD_DISK_CANDIDATE)) 6348 set_bit(Candidate, &rdev->flags); 6349 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6350 /* --add initiated by this node */ 6351 err = md_cluster_ops->add_new_disk(mddev, rdev); 6352 if (err) { 6353 export_rdev(rdev); 6354 return err; 6355 } 6356 } 6357 } 6358 6359 rdev->raid_disk = -1; 6360 err = bind_rdev_to_array(rdev, mddev); 6361 6362 if (err) 6363 export_rdev(rdev); 6364 6365 if (mddev_is_clustered(mddev)) { 6366 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6367 if (!err) { 6368 err = md_cluster_ops->new_disk_ack(mddev, 6369 err == 0); 6370 if (err) 6371 md_kick_rdev_from_array(rdev); 6372 } 6373 } else { 6374 if (err) 6375 md_cluster_ops->add_new_disk_cancel(mddev); 6376 else 6377 err = add_bound_rdev(rdev); 6378 } 6379 6380 } else if (!err) 6381 err = add_bound_rdev(rdev); 6382 6383 return err; 6384 } 6385 6386 /* otherwise, add_new_disk is only allowed 6387 * for major_version==0 superblocks 6388 */ 6389 if (mddev->major_version != 0) { 6390 pr_warn("%s: ADD_NEW_DISK not supported\n", 
mdname(mddev)); 6391 return -EINVAL; 6392 } 6393 6394 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6395 int err; 6396 rdev = md_import_device(dev, -1, 0); 6397 if (IS_ERR(rdev)) { 6398 pr_warn("md: error, md_import_device() returned %ld\n", 6399 PTR_ERR(rdev)); 6400 return PTR_ERR(rdev); 6401 } 6402 rdev->desc_nr = info->number; 6403 if (info->raid_disk < mddev->raid_disks) 6404 rdev->raid_disk = info->raid_disk; 6405 else 6406 rdev->raid_disk = -1; 6407 6408 if (rdev->raid_disk < mddev->raid_disks) 6409 if (info->state & (1<<MD_DISK_SYNC)) 6410 set_bit(In_sync, &rdev->flags); 6411 6412 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6413 set_bit(WriteMostly, &rdev->flags); 6414 if (info->state & (1<<MD_DISK_FAILFAST)) 6415 set_bit(FailFast, &rdev->flags); 6416 6417 if (!mddev->persistent) { 6418 pr_debug("md: nonpersistent superblock ...\n"); 6419 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6420 } else 6421 rdev->sb_start = calc_dev_sboffset(rdev); 6422 rdev->sectors = rdev->sb_start; 6423 6424 err = bind_rdev_to_array(rdev, mddev); 6425 if (err) { 6426 export_rdev(rdev); 6427 return err; 6428 } 6429 } 6430 6431 return 0; 6432 } 6433 6434 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6435 { 6436 char b[BDEVNAME_SIZE]; 6437 struct md_rdev *rdev; 6438 6439 rdev = find_rdev(mddev, dev); 6440 if (!rdev) 6441 return -ENXIO; 6442 6443 if (rdev->raid_disk < 0) 6444 goto kick_rdev; 6445 6446 clear_bit(Blocked, &rdev->flags); 6447 remove_and_add_spares(mddev, rdev); 6448 6449 if (rdev->raid_disk >= 0) 6450 goto busy; 6451 6452 kick_rdev: 6453 if (mddev_is_clustered(mddev)) 6454 md_cluster_ops->remove_disk(mddev, rdev); 6455 6456 md_kick_rdev_from_array(rdev); 6457 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6458 if (mddev->thread) 6459 md_wakeup_thread(mddev->thread); 6460 else 6461 md_update_sb(mddev, 1); 6462 md_new_event(mddev); 6463 6464 return 0; 6465 busy: 6466 pr_debug("md: cannot remove active disk %s from %s ...\n", 6467 
bdevname(rdev->bdev,b), mdname(mddev)); 6468 return -EBUSY; 6469 } 6470 6471 static int hot_add_disk(struct mddev *mddev, dev_t dev) 6472 { 6473 char b[BDEVNAME_SIZE]; 6474 int err; 6475 struct md_rdev *rdev; 6476 6477 if (!mddev->pers) 6478 return -ENODEV; 6479 6480 if (mddev->major_version != 0) { 6481 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 6482 mdname(mddev)); 6483 return -EINVAL; 6484 } 6485 if (!mddev->pers->hot_add_disk) { 6486 pr_warn("%s: personality does not support diskops!\n", 6487 mdname(mddev)); 6488 return -EINVAL; 6489 } 6490 6491 rdev = md_import_device(dev, -1, 0); 6492 if (IS_ERR(rdev)) { 6493 pr_warn("md: error, md_import_device() returned %ld\n", 6494 PTR_ERR(rdev)); 6495 return -EINVAL; 6496 } 6497 6498 if (mddev->persistent) 6499 rdev->sb_start = calc_dev_sboffset(rdev); 6500 else 6501 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6502 6503 rdev->sectors = rdev->sb_start; 6504 6505 if (test_bit(Faulty, &rdev->flags)) { 6506 pr_warn("md: can not hot-add faulty %s disk to %s!\n", 6507 bdevname(rdev->bdev,b), mdname(mddev)); 6508 err = -EINVAL; 6509 goto abort_export; 6510 } 6511 6512 clear_bit(In_sync, &rdev->flags); 6513 rdev->desc_nr = -1; 6514 rdev->saved_raid_disk = -1; 6515 err = bind_rdev_to_array(rdev, mddev); 6516 if (err) 6517 goto abort_export; 6518 6519 /* 6520 * The rest should better be atomic, we can have disk failures 6521 * noticed in interrupt contexts ... 6522 */ 6523 6524 rdev->raid_disk = -1; 6525 6526 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6527 if (!mddev->thread) 6528 md_update_sb(mddev, 1); 6529 /* 6530 * Kick recovery, maybe this spare has to be added to the 6531 * array immediately. 
6532 */ 6533 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6534 md_wakeup_thread(mddev->thread); 6535 md_new_event(mddev); 6536 return 0; 6537 6538 abort_export: 6539 export_rdev(rdev); 6540 return err; 6541 } 6542 6543 static int set_bitmap_file(struct mddev *mddev, int fd) 6544 { 6545 int err = 0; 6546 6547 if (mddev->pers) { 6548 if (!mddev->pers->quiesce || !mddev->thread) 6549 return -EBUSY; 6550 if (mddev->recovery || mddev->sync_thread) 6551 return -EBUSY; 6552 /* we should be able to change the bitmap.. */ 6553 } 6554 6555 if (fd >= 0) { 6556 struct inode *inode; 6557 struct file *f; 6558 6559 if (mddev->bitmap || mddev->bitmap_info.file) 6560 return -EEXIST; /* cannot add when bitmap is present */ 6561 f = fget(fd); 6562 6563 if (f == NULL) { 6564 pr_warn("%s: error: failed to get bitmap file\n", 6565 mdname(mddev)); 6566 return -EBADF; 6567 } 6568 6569 inode = f->f_mapping->host; 6570 if (!S_ISREG(inode->i_mode)) { 6571 pr_warn("%s: error: bitmap file must be a regular file\n", 6572 mdname(mddev)); 6573 err = -EBADF; 6574 } else if (!(f->f_mode & FMODE_WRITE)) { 6575 pr_warn("%s: error: bitmap file must open for write\n", 6576 mdname(mddev)); 6577 err = -EBADF; 6578 } else if (atomic_read(&inode->i_writecount) != 1) { 6579 pr_warn("%s: error: bitmap file is already in use\n", 6580 mdname(mddev)); 6581 err = -EBUSY; 6582 } 6583 if (err) { 6584 fput(f); 6585 return err; 6586 } 6587 mddev->bitmap_info.file = f; 6588 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6589 } else if (mddev->bitmap == NULL) 6590 return -ENOENT; /* cannot remove what isn't there */ 6591 err = 0; 6592 if (mddev->pers) { 6593 mddev->pers->quiesce(mddev, 1); 6594 if (fd >= 0) { 6595 struct bitmap *bitmap; 6596 6597 bitmap = bitmap_create(mddev, -1); 6598 if (!IS_ERR(bitmap)) { 6599 mddev->bitmap = bitmap; 6600 err = bitmap_load(mddev); 6601 } else 6602 err = PTR_ERR(bitmap); 6603 } 6604 if (fd < 0 || err) { 6605 bitmap_destroy(mddev); 6606 fd = -1; /* make sure to put the 
file */ 6607 } 6608 mddev->pers->quiesce(mddev, 0); 6609 } 6610 if (fd < 0) { 6611 struct file *f = mddev->bitmap_info.file; 6612 if (f) { 6613 spin_lock(&mddev->lock); 6614 mddev->bitmap_info.file = NULL; 6615 spin_unlock(&mddev->lock); 6616 fput(f); 6617 } 6618 } 6619 6620 return err; 6621 } 6622 6623 /* 6624 * set_array_info is used two different ways 6625 * The original usage is when creating a new array. 6626 * In this usage, raid_disks is > 0 and it together with 6627 * level, size, not_persistent,layout,chunksize determine the 6628 * shape of the array. 6629 * This will always create an array with a type-0.90.0 superblock. 6630 * The newer usage is when assembling an array. 6631 * In this case raid_disks will be 0, and the major_version field is 6632 * use to determine which style super-blocks are to be found on the devices. 6633 * The minor and patch _version numbers are also kept incase the 6634 * super_block handler wishes to interpret them. 6635 */ 6636 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6637 { 6638 6639 if (info->raid_disks == 0) { 6640 /* just setting version number for superblock loading */ 6641 if (info->major_version < 0 || 6642 info->major_version >= ARRAY_SIZE(super_types) || 6643 super_types[info->major_version].name == NULL) { 6644 /* maybe try to auto-load a module? */ 6645 pr_warn("md: superblock version %d not known\n", 6646 info->major_version); 6647 return -EINVAL; 6648 } 6649 mddev->major_version = info->major_version; 6650 mddev->minor_version = info->minor_version; 6651 mddev->patch_version = info->patch_version; 6652 mddev->persistent = !info->not_persistent; 6653 /* ensure mddev_put doesn't delete this now that there 6654 * is some minimal configuration. 
6655 */ 6656 mddev->ctime = ktime_get_real_seconds(); 6657 return 0; 6658 } 6659 mddev->major_version = MD_MAJOR_VERSION; 6660 mddev->minor_version = MD_MINOR_VERSION; 6661 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6662 mddev->ctime = ktime_get_real_seconds(); 6663 6664 mddev->level = info->level; 6665 mddev->clevel[0] = 0; 6666 mddev->dev_sectors = 2 * (sector_t)info->size; 6667 mddev->raid_disks = info->raid_disks; 6668 /* don't set md_minor, it is determined by which /dev/md* was 6669 * openned 6670 */ 6671 if (info->state & (1<<MD_SB_CLEAN)) 6672 mddev->recovery_cp = MaxSector; 6673 else 6674 mddev->recovery_cp = 0; 6675 mddev->persistent = ! info->not_persistent; 6676 mddev->external = 0; 6677 6678 mddev->layout = info->layout; 6679 mddev->chunk_sectors = info->chunk_size >> 9; 6680 6681 if (mddev->persistent) { 6682 mddev->max_disks = MD_SB_DISKS; 6683 mddev->flags = 0; 6684 mddev->sb_flags = 0; 6685 } 6686 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6687 6688 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6689 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6690 mddev->bitmap_info.offset = 0; 6691 6692 mddev->reshape_position = MaxSector; 6693 6694 /* 6695 * Generate a 128 bit UUID 6696 */ 6697 get_random_bytes(mddev->uuid, 16); 6698 6699 mddev->new_level = mddev->level; 6700 mddev->new_chunk_sectors = mddev->chunk_sectors; 6701 mddev->new_layout = mddev->layout; 6702 mddev->delta_disks = 0; 6703 mddev->reshape_backwards = 0; 6704 6705 return 0; 6706 } 6707 6708 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6709 { 6710 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6711 6712 if (mddev->external_size) 6713 return; 6714 6715 mddev->array_sectors = array_sectors; 6716 } 6717 EXPORT_SYMBOL(md_set_array_sectors); 6718 6719 static int update_size(struct mddev *mddev, sector_t num_sectors) 6720 { 6721 struct md_rdev *rdev; 6722 int rv; 6723 int fit = (num_sectors == 0); 6724 sector_t 
old_dev_sectors = mddev->dev_sectors; 6725 6726 if (mddev->pers->resize == NULL) 6727 return -EINVAL; 6728 /* The "num_sectors" is the number of sectors of each device that 6729 * is used. This can only make sense for arrays with redundancy. 6730 * linear and raid0 always use whatever space is available. We can only 6731 * consider changing this number if no resync or reconstruction is 6732 * happening, and if the new size is acceptable. It must fit before the 6733 * sb_start or, if that is <data_offset, it must fit before the size 6734 * of each device. If num_sectors is zero, we find the largest size 6735 * that fits. 6736 */ 6737 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6738 mddev->sync_thread) 6739 return -EBUSY; 6740 if (mddev->ro) 6741 return -EROFS; 6742 6743 rdev_for_each(rdev, mddev) { 6744 sector_t avail = rdev->sectors; 6745 6746 if (fit && (num_sectors == 0 || num_sectors > avail)) 6747 num_sectors = avail; 6748 if (avail < num_sectors) 6749 return -ENOSPC; 6750 } 6751 rv = mddev->pers->resize(mddev, num_sectors); 6752 if (!rv) { 6753 if (mddev_is_clustered(mddev)) 6754 md_cluster_ops->update_size(mddev, old_dev_sectors); 6755 else if (mddev->queue) { 6756 set_capacity(mddev->gendisk, mddev->array_sectors); 6757 revalidate_disk(mddev->gendisk); 6758 } 6759 } 6760 return rv; 6761 } 6762 6763 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6764 { 6765 int rv; 6766 struct md_rdev *rdev; 6767 /* change the number of raid disks */ 6768 if (mddev->pers->check_reshape == NULL) 6769 return -EINVAL; 6770 if (mddev->ro) 6771 return -EROFS; 6772 if (raid_disks <= 0 || 6773 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6774 return -EINVAL; 6775 if (mddev->sync_thread || 6776 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6777 mddev->reshape_position != MaxSector) 6778 return -EBUSY; 6779 6780 rdev_for_each(rdev, mddev) { 6781 if (mddev->raid_disks < raid_disks && 6782 rdev->data_offset < rdev->new_data_offset) 6783 return 
-EINVAL; 6784 if (mddev->raid_disks > raid_disks && 6785 rdev->data_offset > rdev->new_data_offset) 6786 return -EINVAL; 6787 } 6788 6789 mddev->delta_disks = raid_disks - mddev->raid_disks; 6790 if (mddev->delta_disks < 0) 6791 mddev->reshape_backwards = 1; 6792 else if (mddev->delta_disks > 0) 6793 mddev->reshape_backwards = 0; 6794 6795 rv = mddev->pers->check_reshape(mddev); 6796 if (rv < 0) { 6797 mddev->delta_disks = 0; 6798 mddev->reshape_backwards = 0; 6799 } 6800 return rv; 6801 } 6802 6803 /* 6804 * update_array_info is used to change the configuration of an 6805 * on-line array. 6806 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6807 * fields in the info are checked against the array. 6808 * Any differences that cannot be handled will cause an error. 6809 * Normally, only one change can be managed at a time. 6810 */ 6811 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6812 { 6813 int rv = 0; 6814 int cnt = 0; 6815 int state = 0; 6816 6817 /* calculate expected state,ignoring low bits */ 6818 if (mddev->bitmap && mddev->bitmap_info.offset) 6819 state |= (1 << MD_SB_BITMAP_PRESENT); 6820 6821 if (mddev->major_version != info->major_version || 6822 mddev->minor_version != info->minor_version || 6823 /* mddev->patch_version != info->patch_version || */ 6824 mddev->ctime != info->ctime || 6825 mddev->level != info->level || 6826 /* mddev->layout != info->layout || */ 6827 mddev->persistent != !info->not_persistent || 6828 mddev->chunk_sectors != info->chunk_size >> 9 || 6829 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6830 ((state^info->state) & 0xfffffe00) 6831 ) 6832 return -EINVAL; 6833 /* Check there is only one change */ 6834 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6835 cnt++; 6836 if (mddev->raid_disks != info->raid_disks) 6837 cnt++; 6838 if (mddev->layout != info->layout) 6839 cnt++; 6840 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 
6841 cnt++; 6842 if (cnt == 0) 6843 return 0; 6844 if (cnt > 1) 6845 return -EINVAL; 6846 6847 if (mddev->layout != info->layout) { 6848 /* Change layout 6849 * we don't need to do anything at the md level, the 6850 * personality will take care of it all. 6851 */ 6852 if (mddev->pers->check_reshape == NULL) 6853 return -EINVAL; 6854 else { 6855 mddev->new_layout = info->layout; 6856 rv = mddev->pers->check_reshape(mddev); 6857 if (rv) 6858 mddev->new_layout = mddev->layout; 6859 return rv; 6860 } 6861 } 6862 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6863 rv = update_size(mddev, (sector_t)info->size * 2); 6864 6865 if (mddev->raid_disks != info->raid_disks) 6866 rv = update_raid_disks(mddev, info->raid_disks); 6867 6868 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6869 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6870 rv = -EINVAL; 6871 goto err; 6872 } 6873 if (mddev->recovery || mddev->sync_thread) { 6874 rv = -EBUSY; 6875 goto err; 6876 } 6877 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6878 struct bitmap *bitmap; 6879 /* add the bitmap */ 6880 if (mddev->bitmap) { 6881 rv = -EEXIST; 6882 goto err; 6883 } 6884 if (mddev->bitmap_info.default_offset == 0) { 6885 rv = -EINVAL; 6886 goto err; 6887 } 6888 mddev->bitmap_info.offset = 6889 mddev->bitmap_info.default_offset; 6890 mddev->bitmap_info.space = 6891 mddev->bitmap_info.default_space; 6892 mddev->pers->quiesce(mddev, 1); 6893 bitmap = bitmap_create(mddev, -1); 6894 if (!IS_ERR(bitmap)) { 6895 mddev->bitmap = bitmap; 6896 rv = bitmap_load(mddev); 6897 } else 6898 rv = PTR_ERR(bitmap); 6899 if (rv) 6900 bitmap_destroy(mddev); 6901 mddev->pers->quiesce(mddev, 0); 6902 } else { 6903 /* remove the bitmap */ 6904 if (!mddev->bitmap) { 6905 rv = -ENOENT; 6906 goto err; 6907 } 6908 if (mddev->bitmap->storage.file) { 6909 rv = -EINVAL; 6910 goto err; 6911 } 6912 if (mddev->bitmap_info.nodes) { 6913 /* hold PW on all the bitmap lock */ 6914 if 
(md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 6915 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 6916 rv = -EPERM; 6917 md_cluster_ops->unlock_all_bitmaps(mddev); 6918 goto err; 6919 } 6920 6921 mddev->bitmap_info.nodes = 0; 6922 md_cluster_ops->leave(mddev); 6923 } 6924 mddev->pers->quiesce(mddev, 1); 6925 bitmap_destroy(mddev); 6926 mddev->pers->quiesce(mddev, 0); 6927 mddev->bitmap_info.offset = 0; 6928 } 6929 } 6930 md_update_sb(mddev, 1); 6931 return rv; 6932 err: 6933 return rv; 6934 } 6935 6936 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6937 { 6938 struct md_rdev *rdev; 6939 int err = 0; 6940 6941 if (mddev->pers == NULL) 6942 return -ENODEV; 6943 6944 rcu_read_lock(); 6945 rdev = find_rdev_rcu(mddev, dev); 6946 if (!rdev) 6947 err = -ENODEV; 6948 else { 6949 md_error(mddev, rdev); 6950 if (!test_bit(Faulty, &rdev->flags)) 6951 err = -EBUSY; 6952 } 6953 rcu_read_unlock(); 6954 return err; 6955 } 6956 6957 /* 6958 * We have a problem here : there is no easy way to give a CHS 6959 * virtual geometry. We currently pretend that we have a 2 heads 6960 * 4 sectors (with a BIG number of cylinders...). This drives 6961 * dosfs just mad... 
/*
 * md_ioctl - ioctl entry point for md block devices.
 * @bdev: block device the ioctl was issued on
 * @mode: open mode; not referenced in this function
 * @cmd:  ioctl command number
 * @arg:  command argument, either a user pointer or a plain integer
 *
 * Processing happens in stages:
 *  1. Commands rejected by md_ioctl_valid() fail with -ENOTTY.
 *  2. Everything except RAID_VERSION, GET_ARRAY_INFO and GET_DISK_INFO
 *     requires CAP_SYS_ADMIN (-EACCES otherwise).
 *  3. Driver-wide commands (RAID_VERSION, RAID_AUTORUN) are served without
 *     looking at the array.
 *  4. GET_ARRAY_INFO, GET_DISK_INFO, SET_DISK_FAULTY and GET_BITMAP_FILE
 *     are served without taking the mddev lock.
 *  5. All other commands run with the mddev locked via mddev_lock().
 *
 * Returns 0 on success or a negative errno.
 */
static int md_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	int ro;
	/* set once this call has set MD_CLOSING, so that "out" clears it again */
	bool did_set_md_closing = false;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	/* Only the three query commands below are allowed without CAP_SYS_ADMIN. */
	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto out;

#ifndef MODULE
	case RAID_AUTORUN:
		err = 0;
		autostart_arrays(arg);
		goto out;
#endif
	default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto out;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto out;

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto out;

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto out;

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

	}

	if (cmd == ADD_NEW_DISK)
		/* need to ensure md_delayed_delete() has completed */
		flush_workqueue(md_misc_wq);

	if (cmd == HOT_REMOVE_DISK)
		/*
		 * need to ensure recovery thread has run; the result of the
		 * wait is not checked, so the command proceeds after at most
		 * 5 seconds (or earlier if the wait is interrupted)
		 */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->recovery),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		/* refuse to stop an active array that has more than one opener */
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
		set_bit(MD_CLOSING, &mddev->flags);
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
	err = mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		/* a NULL argument is treated as an all-zero info structure */
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
			goto unlock;
		}
		/* active array: this is an update request */
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
				pr_warn("md: couldn't update array info. %d\n", err);
				goto unlock;
			}
			goto unlock;
		}
		if (!list_empty(&mddev->disks)) {
			pr_warn("md: array %s already has disks!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		if (mddev->raid_disks) {
			pr_warn("md: array %s already initialised!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		err = set_array_info(mddev, &info);
		if (err) {
			pr_warn("md: couldn't set array info. %d\n", err);
			goto unlock;
		}
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
		goto unlock;

	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
		goto unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = add_new_disk(mddev, &info);
			goto unlock;
		}
		/* inactive array: handled by the second ADD_NEW_DISK case below */
		break;

	case BLKROSET:
		if (get_user(ro, (int __user *)(arg))) {
			err = -EFAULT;
			goto unlock;
		}
		/* every early exit below reports -EINVAL */
		err = -EINVAL;

		/* if the bdev is going readonly the value of mddev->ro
		 * does not matter, no writes are coming
		 */
		if (ro)
			goto unlock;

		/* are we already prepared for writes? */
		if (mddev->ro != 1)
			goto unlock;

		/* transitioning to readauto need only happen for
		 * arrays that call md_write_start
		 */
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		}
		goto unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 */
	if (mddev->ro && mddev->pers) {
		if (mddev->ro == 2) {
			/* ro == 2 is switched to read-write here; any other non-zero value is refused */
			mddev->ro = 0;
			sysfs_notify_dirent_safe(mddev->sysfs_state);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
				/* the lock is dropped for the duration of the wait */
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
				mddev_lock_nointr(mddev);
			}
		} else {
			err = -EROFS;
			goto unlock;
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = add_new_disk(mddev, &info);
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto unlock;

	default:
		err = -EINVAL;
		goto unlock;
	}

unlock:
	/* an UNTIL_IOCTL hold is cleared unless the command ended with -EINVAL */
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
	mddev_unlock(mddev);
out:
	/* undo the MD_CLOSING mark taken for STOP_ARRAY/STOP_ARRAY_RO above */
	if(did_set_md_closing)
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
now. 7320 */ 7321 struct mddev *mddev = mddev_find(bdev->bd_dev); 7322 int err; 7323 7324 if (!mddev) 7325 return -ENODEV; 7326 7327 if (mddev->gendisk != bdev->bd_disk) { 7328 /* we are racing with mddev_put which is discarding this 7329 * bd_disk. 7330 */ 7331 mddev_put(mddev); 7332 /* Wait until bdev->bd_disk is definitely gone */ 7333 flush_workqueue(md_misc_wq); 7334 /* Then retry the open from the top */ 7335 return -ERESTARTSYS; 7336 } 7337 BUG_ON(mddev != bdev->bd_disk->private_data); 7338 7339 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7340 goto out; 7341 7342 if (test_bit(MD_CLOSING, &mddev->flags)) { 7343 mutex_unlock(&mddev->open_mutex); 7344 err = -ENODEV; 7345 goto out; 7346 } 7347 7348 err = 0; 7349 atomic_inc(&mddev->openers); 7350 mutex_unlock(&mddev->open_mutex); 7351 7352 check_disk_change(bdev); 7353 out: 7354 if (err) 7355 mddev_put(mddev); 7356 return err; 7357 } 7358 7359 static void md_release(struct gendisk *disk, fmode_t mode) 7360 { 7361 struct mddev *mddev = disk->private_data; 7362 7363 BUG_ON(!mddev); 7364 atomic_dec(&mddev->openers); 7365 mddev_put(mddev); 7366 } 7367 7368 static int md_media_changed(struct gendisk *disk) 7369 { 7370 struct mddev *mddev = disk->private_data; 7371 7372 return mddev->changed; 7373 } 7374 7375 static int md_revalidate(struct gendisk *disk) 7376 { 7377 struct mddev *mddev = disk->private_data; 7378 7379 mddev->changed = 0; 7380 return 0; 7381 } 7382 static const struct block_device_operations md_fops = 7383 { 7384 .owner = THIS_MODULE, 7385 .open = md_open, 7386 .release = md_release, 7387 .ioctl = md_ioctl, 7388 #ifdef CONFIG_COMPAT 7389 .compat_ioctl = md_compat_ioctl, 7390 #endif 7391 .getgeo = md_getgeo, 7392 .media_changed = md_media_changed, 7393 .revalidate_disk= md_revalidate, 7394 }; 7395 7396 static int md_thread(void *arg) 7397 { 7398 struct md_thread *thread = arg; 7399 7400 /* 7401 * md_thread is a 'system-thread', it's priority should be very 7402 * high. 
We avoid resource deadlocks individually in each 7403 * raid personality. (RAID5 does preallocation) We also use RR and 7404 * the very same RT priority as kswapd, thus we will never get 7405 * into a priority inversion deadlock. 7406 * 7407 * we definitely have to have equal or higher priority than 7408 * bdflush, otherwise bdflush will deadlock if there are too 7409 * many dirty RAID5 blocks. 7410 */ 7411 7412 allow_signal(SIGKILL); 7413 while (!kthread_should_stop()) { 7414 7415 /* We need to wait INTERRUPTIBLE so that 7416 * we don't add to the load-average. 7417 * That means we need to be sure no signals are 7418 * pending 7419 */ 7420 if (signal_pending(current)) 7421 flush_signals(current); 7422 7423 wait_event_interruptible_timeout 7424 (thread->wqueue, 7425 test_bit(THREAD_WAKEUP, &thread->flags) 7426 || kthread_should_stop() || kthread_should_park(), 7427 thread->timeout); 7428 7429 clear_bit(THREAD_WAKEUP, &thread->flags); 7430 if (kthread_should_park()) 7431 kthread_parkme(); 7432 if (!kthread_should_stop()) 7433 thread->run(thread); 7434 } 7435 7436 return 0; 7437 } 7438 7439 void md_wakeup_thread(struct md_thread *thread) 7440 { 7441 if (thread) { 7442 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7443 if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) 7444 wake_up(&thread->wqueue); 7445 } 7446 } 7447 EXPORT_SYMBOL(md_wakeup_thread); 7448 7449 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7450 struct mddev *mddev, const char *name) 7451 { 7452 struct md_thread *thread; 7453 7454 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7455 if (!thread) 7456 return NULL; 7457 7458 init_waitqueue_head(&thread->wqueue); 7459 7460 thread->run = run; 7461 thread->mddev = mddev; 7462 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7463 thread->tsk = kthread_run(md_thread, thread, 7464 "%s_%s", 7465 mdname(thread->mddev), 7466 name); 7467 if (IS_ERR(thread->tsk)) { 7468 kfree(thread); 7469 return NULL; 7470 } 7471 return 
thread; 7472 } 7473 EXPORT_SYMBOL(md_register_thread); 7474 7475 void md_unregister_thread(struct md_thread **threadp) 7476 { 7477 struct md_thread *thread = *threadp; 7478 if (!thread) 7479 return; 7480 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7481 /* Locking ensures that mddev_unlock does not wake_up a 7482 * non-existent thread 7483 */ 7484 spin_lock(&pers_lock); 7485 *threadp = NULL; 7486 spin_unlock(&pers_lock); 7487 7488 kthread_stop(thread->tsk); 7489 kfree(thread); 7490 } 7491 EXPORT_SYMBOL(md_unregister_thread); 7492 7493 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7494 { 7495 if (!rdev || test_bit(Faulty, &rdev->flags)) 7496 return; 7497 7498 if (!mddev->pers || !mddev->pers->error_handler) 7499 return; 7500 mddev->pers->error_handler(mddev,rdev); 7501 if (mddev->degraded) 7502 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7503 sysfs_notify_dirent_safe(rdev->sysfs_state); 7504 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7505 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7506 md_wakeup_thread(mddev->thread); 7507 if (mddev->event_work.func) 7508 queue_work(md_misc_wq, &mddev->event_work); 7509 md_new_event(mddev); 7510 } 7511 EXPORT_SYMBOL(md_error); 7512 7513 /* seq_file implementation /proc/mdstat */ 7514 7515 static void status_unused(struct seq_file *seq) 7516 { 7517 int i = 0; 7518 struct md_rdev *rdev; 7519 7520 seq_printf(seq, "unused devices: "); 7521 7522 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7523 char b[BDEVNAME_SIZE]; 7524 i++; 7525 seq_printf(seq, "%s ", 7526 bdevname(rdev->bdev,b)); 7527 } 7528 if (!i) 7529 seq_printf(seq, "<none>"); 7530 7531 seq_printf(seq, "\n"); 7532 } 7533 7534 static int status_resync(struct seq_file *seq, struct mddev *mddev) 7535 { 7536 sector_t max_sectors, resync, res; 7537 unsigned long dt, db; 7538 sector_t rt; 7539 int scale; 7540 unsigned int per_milli; 7541 7542 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7543 
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7544 max_sectors = mddev->resync_max_sectors; 7545 else 7546 max_sectors = mddev->dev_sectors; 7547 7548 resync = mddev->curr_resync; 7549 if (resync <= 3) { 7550 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7551 /* Still cleaning up */ 7552 resync = max_sectors; 7553 } else 7554 resync -= atomic_read(&mddev->recovery_active); 7555 7556 if (resync == 0) { 7557 if (mddev->recovery_cp < MaxSector) { 7558 seq_printf(seq, "\tresync=PENDING"); 7559 return 1; 7560 } 7561 return 0; 7562 } 7563 if (resync < 3) { 7564 seq_printf(seq, "\tresync=DELAYED"); 7565 return 1; 7566 } 7567 7568 WARN_ON(max_sectors == 0); 7569 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7570 * in a sector_t, and (max_sectors>>scale) will fit in a 7571 * u32, as those are the requirements for sector_div. 7572 * Thus 'scale' must be at least 10 7573 */ 7574 scale = 10; 7575 if (sizeof(sector_t) > sizeof(unsigned long)) { 7576 while ( max_sectors/2 > (1ULL<<(scale+32))) 7577 scale++; 7578 } 7579 res = (resync>>scale)*1000; 7580 sector_div(res, (u32)((max_sectors>>scale)+1)); 7581 7582 per_milli = res; 7583 { 7584 int i, x = per_milli/50, y = 20-x; 7585 seq_printf(seq, "["); 7586 for (i = 0; i < x; i++) 7587 seq_printf(seq, "="); 7588 seq_printf(seq, ">"); 7589 for (i = 0; i < y; i++) 7590 seq_printf(seq, "."); 7591 seq_printf(seq, "] "); 7592 } 7593 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7594 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7595 "reshape" : 7596 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7597 "check" : 7598 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7599 "resync" : "recovery"))), 7600 per_milli/10, per_milli % 10, 7601 (unsigned long long) resync/2, 7602 (unsigned long long) max_sectors/2); 7603 7604 /* 7605 * dt: time from mark until now 7606 * db: blocks written from mark until now 7607 * rt: remaining time 7608 * 7609 * rt is a sector_t, so could be 32bit or 64bit. 
7610 * So we divide before multiply in case it is 32bit and close 7611 * to the limit. 7612 * We scale the divisor (db) by 32 to avoid losing precision 7613 * near the end of resync when the number of remaining sectors 7614 * is close to 'db'. 7615 * We then divide rt by 32 after multiplying by db to compensate. 7616 * The '+1' avoids division by zero if db is very small. 7617 */ 7618 dt = ((jiffies - mddev->resync_mark) / HZ); 7619 if (!dt) dt++; 7620 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7621 - mddev->resync_mark_cnt; 7622 7623 rt = max_sectors - resync; /* number of remaining sectors */ 7624 sector_div(rt, db/32+1); 7625 rt *= dt; 7626 rt >>= 5; 7627 7628 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7629 ((unsigned long)rt % 60)/6); 7630 7631 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7632 return 1; 7633 } 7634 7635 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7636 { 7637 struct list_head *tmp; 7638 loff_t l = *pos; 7639 struct mddev *mddev; 7640 7641 if (l >= 0x10000) 7642 return NULL; 7643 if (!l--) 7644 /* header */ 7645 return (void*)1; 7646 7647 spin_lock(&all_mddevs_lock); 7648 list_for_each(tmp,&all_mddevs) 7649 if (!l--) { 7650 mddev = list_entry(tmp, struct mddev, all_mddevs); 7651 mddev_get(mddev); 7652 spin_unlock(&all_mddevs_lock); 7653 return mddev; 7654 } 7655 spin_unlock(&all_mddevs_lock); 7656 if (!l--) 7657 return (void*)2;/* tail */ 7658 return NULL; 7659 } 7660 7661 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7662 { 7663 struct list_head *tmp; 7664 struct mddev *next_mddev, *mddev = v; 7665 7666 ++*pos; 7667 if (v == (void*)2) 7668 return NULL; 7669 7670 spin_lock(&all_mddevs_lock); 7671 if (v == (void*)1) 7672 tmp = all_mddevs.next; 7673 else 7674 tmp = mddev->all_mddevs.next; 7675 if (tmp != &all_mddevs) 7676 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7677 else { 7678 next_mddev = (void*)2; 7679 *pos = 0x10000; 7680 } 7681 
spin_unlock(&all_mddevs_lock);

	if (v != (void*)1)
		mddev_put(mddev);
	return next_mddev;

}

/*
 * seq_file ->stop callback: drop the reference held on the mddev the
 * iteration stopped at.  The cursor values (void*)1 and (void*)2 are
 * markers handled specially by md_seq_show(), not mddev pointers, so
 * nothing is released for them.
 */
static void md_seq_stop(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;

	if (mddev && v != (void*)1 && v != (void*)2)
		mddev_put(mddev);
}

/*
 * seq_file ->show callback: print one record.
 *   v == (void*)1: the "Personalities" header listing every entry on
 *                  pers_list; also snapshots md_event_count into
 *                  seq->poll_event.
 *   v == (void*)2: whatever status_unused() prints.
 *   otherwise:     v is a struct mddev; its state is printed under
 *                  mddev->lock, but only if it has a personality,
 *                  raid_disks set, or at least one member device.
 * Always returns 0.
 */
static int md_seq_show(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;
	sector_t sectors;
	struct md_rdev *rdev;

	if (v == (void*)1) {
		struct md_personality *pers;
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		seq->poll_event = atomic_read(&md_event_count);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

	spin_lock(&mddev->lock);
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
						mddev->pers ? "" : "in");
		if (mddev->pers) {
			if (mddev->ro==1)
				seq_printf(seq, " (read-only)");
			if (mddev->ro==2)
				seq_printf(seq, " (auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->name);
		}

		/* List member devices and add up the size of the non-faulty ones */
		sectors = 0;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr);
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
			if (test_bit(Journal, &rdev->flags))
				seq_printf(seq, "(J)");
			if (test_bit(Faulty, &rdev->flags)) {
				seq_printf(seq, "(F)");
				/* faulty devices do not count towards 'sectors' */
				continue;
			}
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}
		rcu_read_unlock();

		/*
		 * Size in "blocks" (sectors / 2): the array size when a
		 * personality is attached, otherwise the sum computed above.
		 */
		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)
					   mddev->array_sectors / 2);
			else
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)sectors / 2);
		}
		/* Metadata version; nothing is printed for persistent 0.90 */
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
			seq_printf(seq, " super non-persistent");

		if (mddev->pers) {
			mddev->pers->status(seq, mddev);
			seq_printf(seq, "\n ");
			if (mddev->pers->sync_request) {
				/* extra line break only if status_resync() reports non-zero */
				if (status_resync(seq, mddev))
					seq_printf(seq, "\n ");
			}
		} else
			seq_printf(seq, "\n ");

		bitmap_status(seq, mddev->bitmap);

		seq_printf(seq, "\n");
	}
	spin_unlock(&mddev->lock);

	return 0;
}

/* Iterator callbacks handed to seq_open() by md_seq_open() */
static const struct seq_operations md_seq_ops = {
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};

/*
 * ->open handler: attach md_seq_ops to the file and record the current
 * md_event_count in seq->poll_event so that mdstat_poll() can tell when
 * it has changed.  Returns the seq_open() error, or 0.
 */
static int md_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int error;

	error = seq_open(file, &md_seq_ops);
	if (error)
		return error;

	seq = file->private_data;
	seq->poll_event = atomic_read(&md_event_count);
	return error;
}

/* When non-zero, mdstat_poll() reports every event without waiting */
static int md_unloading;
/*
 * ->poll handler.  The file is always reported readable; POLLERR|POLLPRI
 * is added when md_event_count differs from the value stored in
 * seq->poll_event.  While md_unloading is set all four bits are returned
 * immediately and the caller is not added to md_event_waiters.
 */
static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
	struct seq_file *seq = filp->private_data;
	int mask;

	if (md_unloading)
		return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
	mask = POLLIN | POLLRDNORM;

	if (seq->poll_event != atomic_read(&md_event_count))
		mask |= POLLERR | POLLPRI;
	return mask;
}

static const struct file_operations md_seq_fops = {
	.owner		= THIS_MODULE,
	.open           = md_seq_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release	= seq_release_private,
	.poll		= mdstat_poll,
};

/* Append personality @p to pers_list under pers_lock.  Always returns 0. */
int register_md_personality(struct md_personality *p)
{
	pr_debug("md: %s personality registered for level %d\n",
		 p->name, p->level);
	spin_lock(&pers_lock);
	list_add_tail(&p->list, &pers_list);
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(register_md_personality);

/* Unlink personality @p from pers_list under pers_lock.  Always returns 0. */
int unregister_md_personality(struct md_personality *p)
{
	pr_debug("md: %s personality unregistered\n", p->name);
	spin_lock(&pers_lock);
	list_del_init(&p->list);
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(unregister_md_personality);

/*
 * Install @ops / @module as the cluster implementation.
 * Returns -EALREADY if md_cluster_ops is already set, otherwise 0.
 */
int register_md_cluster_operations(struct md_cluster_operations *ops,
				   struct module *module)
{
	int ret = 0;
	spin_lock(&pers_lock);
	if (md_cluster_ops != NULL)
		ret = -EALREADY;
	else {
		md_cluster_ops = ops;
		md_cluster_mod = module;
	}
	spin_unlock(&pers_lock);
	return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);

/*
 * Clear md_cluster_ops under pers_lock.  md_cluster_mod is left
 * untouched.  Always returns 0.
 */
int unregister_md_cluster_operations(void)
{
	spin_lock(&pers_lock);
	md_cluster_ops = NULL;
	spin_unlock(&pers_lock);
	return 0;
}
EXPORT_SYMBOL(unregister_md_cluster_operations);

/*
 * Join the cluster for @mddev.  Requests the "md-cluster" module if no
 * operations are registered yet, then takes a reference on
 * md_cluster_mod while holding pers_lock.
 * Returns -ENOENT if the operations are still missing or the module
 * reference cannot be taken, otherwise the result of ->join().
 */
int md_setup_cluster(struct mddev *mddev, int nodes)
{
	if (!md_cluster_ops)
		request_module("md-cluster");
	spin_lock(&pers_lock);
	/* ensure module won't be unloaded */
	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
		pr_warn("can't find md-cluster module or get it's reference.\n");
		spin_unlock(&pers_lock);
		return -ENOENT;
	}
	spin_unlock(&pers_lock);

	return md_cluster_ops->join(mddev, nodes);
}

/*
 * Leave the cluster and drop the module reference taken by
 * md_setup_cluster().  Does nothing if no operations are registered.
 */
void md_cluster_stop(struct mddev *mddev)
{
	if (!md_cluster_ops)
		return;
	md_cluster_ops->leave(mddev);
	module_put(md_cluster_mod);
}

/*
 * Decide whether the member devices of @mddev look idle apart from
 * resync IO.  Returns 1 (idle) unless some device's event count has
 * grown by more than 64 since it was last recorded.  With @init
 * non-zero every device's ->last_events is (re)recorded and the array
 * is reported as not idle.
 */
static int is_mddev_idle(struct mddev *mddev, int init)
{
	struct md_rdev *rdev;
	int idle;
	int curr_events;

	idle = 1;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
			      (int)part_stat_read(&disk->part0, sectors[1]) -
			      atomic_read(&disk->sync_io);
		/* sync IO will cause sync_io to increase before the disk_stats
		 * as sync_io is counted when a request starts, and
		 * disk_stats is counted when it completes.
		 * So resync activity will cause curr_events to be smaller than
		 * when there was no such activity.
		 * non-sync IO will cause disk_stat to increase without
		 * increasing sync_io so curr_events will (eventually)
		 * be larger than it was before.  Once it becomes
		 * substantially larger, the test below will cause
		 * the array to appear non-idle, and resync will slow
		 * down.
		 * If there is a lot of outstanding resync activity when
		 * we set last_event to curr_events, then all that activity
		 * completing might cause the array to appear non-idle
		 * and resync will be slowed down even though there might
		 * not have been non-resync activity.  This will only
		 * happen once though.  'last_events' will soon reflect
		 * the state where there is little or no outstanding
		 * resync requests, and further resync activity will
		 * always make curr_events less than last_events.
		 *
		 */
		if (init || curr_events - rdev->last_events > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	rcu_read_unlock();
	return idle;
}

/*
 * Account for @blocks of completed sync IO and wake anyone sleeping on
 * ->recovery_wait.  If @ok is zero the running sync is flagged as
 * interrupted and in error, and mddev->thread is woken.
 */
void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
7977 */ 7978 bool md_write_start(struct mddev *mddev, struct bio *bi) 7979 { 7980 int did_change = 0; 7981 if (bio_data_dir(bi) != WRITE) 7982 return true; 7983 7984 BUG_ON(mddev->ro == 1); 7985 if (mddev->ro == 2) { 7986 /* need to switch to read/write */ 7987 mddev->ro = 0; 7988 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7989 md_wakeup_thread(mddev->thread); 7990 md_wakeup_thread(mddev->sync_thread); 7991 did_change = 1; 7992 } 7993 rcu_read_lock(); 7994 percpu_ref_get(&mddev->writes_pending); 7995 smp_mb(); /* Match smp_mb in set_in_sync() */ 7996 if (mddev->safemode == 1) 7997 mddev->safemode = 0; 7998 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 7999 if (mddev->in_sync || !mddev->sync_checkers) { 8000 spin_lock(&mddev->lock); 8001 if (mddev->in_sync) { 8002 mddev->in_sync = 0; 8003 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8004 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8005 md_wakeup_thread(mddev->thread); 8006 did_change = 1; 8007 } 8008 spin_unlock(&mddev->lock); 8009 } 8010 rcu_read_unlock(); 8011 if (did_change) 8012 sysfs_notify_dirent_safe(mddev->sysfs_state); 8013 wait_event(mddev->sb_wait, 8014 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); 8015 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8016 percpu_ref_put(&mddev->writes_pending); 8017 return false; 8018 } 8019 return true; 8020 } 8021 EXPORT_SYMBOL(md_write_start); 8022 8023 /* md_write_inc can only be called when md_write_start() has 8024 * already been called at least once of the current request. 8025 * It increments the counter and is useful when a single request 8026 * is split into several parts. Each part causes an increment and 8027 * so needs a matching md_write_end(). 8028 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8029 * a spinlocked region. 
8030 */ 8031 void md_write_inc(struct mddev *mddev, struct bio *bi) 8032 { 8033 if (bio_data_dir(bi) != WRITE) 8034 return; 8035 WARN_ON_ONCE(mddev->in_sync || mddev->ro); 8036 percpu_ref_get(&mddev->writes_pending); 8037 } 8038 EXPORT_SYMBOL(md_write_inc); 8039 8040 void md_write_end(struct mddev *mddev) 8041 { 8042 percpu_ref_put(&mddev->writes_pending); 8043 8044 if (mddev->safemode == 2) 8045 md_wakeup_thread(mddev->thread); 8046 else if (mddev->safemode_delay) 8047 /* The roundup() ensures this only performs locking once 8048 * every ->safemode_delay jiffies 8049 */ 8050 mod_timer(&mddev->safemode_timer, 8051 roundup(jiffies, mddev->safemode_delay) + 8052 mddev->safemode_delay); 8053 } 8054 8055 EXPORT_SYMBOL(md_write_end); 8056 8057 /* md_allow_write(mddev) 8058 * Calling this ensures that the array is marked 'active' so that writes 8059 * may proceed without blocking. It is important to call this before 8060 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8061 * Must be called with mddev_lock held. 
*/
void md_allow_write(struct mddev *mddev)
{
	/* Nothing to do without a personality, on a read-only array, or
	 * for a personality that has no ->sync_request method.
	 */
	if (!mddev->pers)
		return;
	if (mddev->ro)
		return;
	if (!mddev->pers->sync_request)
		return;

	spin_lock(&mddev->lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
		spin_unlock(&mddev->lock);
		md_update_sb(mddev, 0);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	} else
		spin_unlock(&mddev->lock);
}
EXPORT_SYMBOL_GPL(md_allow_write);

/* Number of (time, sectors) samples kept for the speed calculation */
#define SYNC_MARKS	10
/* Minimum age of the newest sample before a new one is taken */
#define	SYNC_MARK_STEP	(3*HZ)
/* Upper bound on the time between curr_resync_completed updates */
#define UPDATE_FREQUENCY (5*60*HZ)
/*
 * Thread function that performs one resync, check, repair, reshape or
 * recovery pass over @thread->mddev.
 *
 * Outline:
 *  - return early if the pass is already done or the array is read-only;
 *  - for clustered arrays, call ->resync_start() first;
 *  - pick the description/action strings from the MD_RECOVERY_* bits;
 *  - wait until no conflicting array (see match_mddev_units()) is
 *    syncing, unless ->parallel_resync is set;
 *  - choose the start sector 'j' and the end 'max_sectors';
 *  - call ->pers->sync_request() repeatedly, throttled by
 *    speed_min()/speed_max() and is_mddev_idle();
 *  - record the checkpoint, set MD_RECOVERY_DONE and wake mddev->thread.
 */
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	char *desc, *action = NULL;
	struct blk_plug plug;
	int ret;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if (mddev->ro) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		/* No sync/reshape/recover requested and the previous pass
		 * did not reach the end: nothing to do in this thread.
		 */
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	/* 'desc' is used in log messages; 'action', when set, is what is
	 * stored in ->last_sync_action instead of 'desc'.
	 */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
			desc = "data-check";
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			desc = "requested-resync";
			action = "repair";
		} else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	mddev->last_sync_action = action ?: desc;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		/* minor of the array we last logged a "delaying" message for */
		int mddev2_minor = -1;
		mddev->curr_resync = 2;

	try_again:
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		for_each_mddev(mddev2, tmp) {
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
					}
					/* leaving the for_each_mddev() loop early,
					 * so drop the reference on mddev2 by hand
					 */
					mddev_put(mddev2);
					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	/* Work out where to start (j) and where to stop (max_sectors) */
	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		atomic64_set(&mddev->resync_mismatches, 0);
		/* we don't use the checkpoint if there's a bitmap */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
			j = mddev->recovery_cp;

	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->resync_max_sectors;
	else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->dev_sectors;
		/* start from the smallest recovery_offset of any device
		 * that still needs recovering
		 */
		j = MaxSector;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Journal, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
		rcu_read_unlock();

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
	}

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	/* Start all speed samples at "now" with zero sectors done */
	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j>2) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = 3; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		/* Except during reshape, refresh curr_resync_completed when
		 * it has fallen more than 1/16 of the range behind, when
		 * UPDATE_FREQUENCY has elapsed, when at least half of the
		 * way to resync_max has been covered, or when it lies
		 * beyond resync_max.
		 */
		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			/* a zero return is treated as an interruption */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		/* Only run the speed check once per 'window' sectors of IO */
		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			/* publish the oldest sample, then overwrite it */
			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	/* Final call with sector_nr == max_sectors (presumably lets the
	 * personality finish up; confirm against the personalities).
	 */
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	/* Record how far we got, unless this was only a check */
	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			/* advance recovery_offset of every device that was
			 * being recovered
			 */
			rcu_read_lock();
			rdev_for_each_rcu(rdev, mddev)
				if (rdev->raid_disk >= 0 &&
				    mddev->delta_disks >= 0 &&
				    !test_bit(Journal, &rdev->flags) &&
				    !test_bit(Faulty, &rdev->flags) &&
				    !test_bit(In_sync, &rdev->flags) &&
				    rdev->recovery_offset < mddev->curr_resync)
					rdev->recovery_offset = mddev->curr_resync;
			rcu_read_unlock();
		}
	}
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

/*
 * Remove failed or not-in-sync devices from their slots and try to
 * hot-add devices that have no slot.
 *
 * @this: if non-NULL, only this device is considered; if it gets
 *        removed, no hot-add is attempted.
 *
 * Returns the number of spares: devices that hold a slot without being
 * In_sync, Journal or Faulty, plus non-journal devices hot-added here.
 */
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	/* Pass 1: mark the faulty devices that can be removed */
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	/* Pass 2: ask the personality to release each removable device */
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
		    atomic_read(&rdev->nr_pending)==0)) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->raid_disk = -1;
				removed++;
			}
		}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	if (this && removed)
		goto no_add;

	/* Pass 3: count existing spares and try to add slot-less devices */
	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			/* on a read-only array only add a device that has a
			 * saved slot and does not need a bitmap-based resync
			 */
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}

/*
 * Work function (queued through mddev->del_work by md_check_recovery())
 * that creates the "resync" thread running md_do_sync() and wakes it.
 * If the thread cannot be created the MD_RECOVERY_* request bits are
 * cleared again and waiters on resync_wait are woken.
 */
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
* When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		bitmap_daemon_work(mddev);

	/* A pending signal switches the array to immediate safe mode
	 * (only when the personality can resync and metadata is not
	 * external); the signal itself is then discarded.
	 */
	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	/* Cheap test without the lock: is there anything at all to do? */
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	/* If the lock is busy just give up; nothing is done this time */
	if (mddev_trylock(mddev)) {
		int spares = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		if (!mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			/* the thread has finished: collect it */
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		/* Decide what kind of pass to run: reshape, recovery onto
		 * spares, resync from a checkpoint, or an already requested
		 * sync.  Otherwise there is nothing to start.
		 */
		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			/* the sync thread is created from a workqueue,
			 * see md_start_sync()
			 */
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);

/*
 * Collect the result of a finished sync thread: unregister the thread,
 * activate spares if the pass was neither interrupted nor user
 * requested, let the personality finish a reshape, write the
 * superblock, clear the MD_RECOVERY_* state bits and set
 * MD_RECOVERY_NEEDED again so that md_check_recovery() has another look.
 */
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape)
		mddev->pers->finish_reshape(mddev);

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

/*
 * Wait, for at most 5 seconds, until @rdev has neither Blocked nor
 * BlockedBadBlocks set, then drop one pending reference on it via
 * rdev_dec_pending().
 */
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes. */
	struct md_rdev *rdev;

	/* For each device, adjust ->sectors by the distance the data
	 * offset moved, then make the new data offset the current one.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/*
 * Record @sectors bad sectors starting at @s on @rdev.  @s is relative
 * to the data area: new_data_offset is added when @is_new is set,
 * data_offset otherwise.  On success a superblock update is flagged
 * and mddev->thread is woken.
 */
/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	struct mddev *mddev = rdev->mddev;
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL,
				     "unacknowledged_bad_blocks");
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

/*
 * Clear @sectors bad sectors starting at @s on @rdev, with the same
 * offset handling as rdev_set_badblocks().  Returns whatever
 * badblocks_clear() returns; note that, unlike rdev_set_badblocks(),
 * the value is passed through unchanged.
 */
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if
(mddev->persistent) 8919 mddev->safemode = 2; 8920 mddev_unlock(mddev); 8921 } 8922 need_delay = 1; 8923 } 8924 /* 8925 * certain more exotic SCSI devices are known to be 8926 * volatile wrt too early system reboots. While the 8927 * right place to handle this issue is the given 8928 * driver, we do want to have a safe RAID driver ... 8929 */ 8930 if (need_delay) 8931 mdelay(1000*1); 8932 8933 return NOTIFY_DONE; 8934 } 8935 8936 static struct notifier_block md_notifier = { 8937 .notifier_call = md_notify_reboot, 8938 .next = NULL, 8939 .priority = INT_MAX, /* before any real devices */ 8940 }; 8941 8942 static void md_geninit(void) 8943 { 8944 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8945 8946 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8947 } 8948 8949 static int __init md_init(void) 8950 { 8951 int ret = -ENOMEM; 8952 8953 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8954 if (!md_wq) 8955 goto err_wq; 8956 8957 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8958 if (!md_misc_wq) 8959 goto err_misc_wq; 8960 8961 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8962 goto err_md; 8963 8964 if ((ret = register_blkdev(0, "mdp")) < 0) 8965 goto err_mdp; 8966 mdp_major = ret; 8967 8968 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, 8969 md_probe, NULL, NULL); 8970 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8971 md_probe, NULL, NULL); 8972 8973 register_reboot_notifier(&md_notifier); 8974 raid_table_header = register_sysctl_table(raid_root_table); 8975 8976 md_geninit(); 8977 return 0; 8978 8979 err_mdp: 8980 unregister_blkdev(MD_MAJOR, "md"); 8981 err_md: 8982 destroy_workqueue(md_misc_wq); 8983 err_misc_wq: 8984 destroy_workqueue(md_wq); 8985 err_wq: 8986 return ret; 8987 } 8988 8989 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 8990 { 8991 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 8992 struct md_rdev *rdev2; 8993 int role, ret; 8994 char 
b[BDEVNAME_SIZE]; 8995 8996 /* 8997 * If size is changed in another node then we need to 8998 * do resize as well. 8999 */ 9000 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9001 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9002 if (ret) 9003 pr_info("md-cluster: resize failed\n"); 9004 else 9005 bitmap_update_sb(mddev->bitmap); 9006 } 9007 9008 /* Check for change of roles in the active devices */ 9009 rdev_for_each(rdev2, mddev) { 9010 if (test_bit(Faulty, &rdev2->flags)) 9011 continue; 9012 9013 /* Check if the roles changed */ 9014 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9015 9016 if (test_bit(Candidate, &rdev2->flags)) { 9017 if (role == 0xfffe) { 9018 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); 9019 md_kick_rdev_from_array(rdev2); 9020 continue; 9021 } 9022 else 9023 clear_bit(Candidate, &rdev2->flags); 9024 } 9025 9026 if (role != rdev2->raid_disk) { 9027 /* got activated */ 9028 if (rdev2->raid_disk == -1 && role != 0xffff) { 9029 rdev2->saved_raid_disk = role; 9030 ret = remove_and_add_spares(mddev, rdev2); 9031 pr_info("Activated spare: %s\n", 9032 bdevname(rdev2->bdev,b)); 9033 /* wakeup mddev->thread here, so array could 9034 * perform resync with the new activated disk */ 9035 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9036 md_wakeup_thread(mddev->thread); 9037 9038 } 9039 /* device faulty 9040 * We just want to do the minimum to mark the disk 9041 * as faulty. The recovery is performed by the 9042 * one who initiated the error. 
9043 */ 9044 if ((role == 0xfffe) || (role == 0xfffd)) { 9045 md_error(mddev, rdev2); 9046 clear_bit(Blocked, &rdev2->flags); 9047 } 9048 } 9049 } 9050 9051 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 9052 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9053 9054 /* Finally set the event to be up to date */ 9055 mddev->events = le64_to_cpu(sb->events); 9056 } 9057 9058 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9059 { 9060 int err; 9061 struct page *swapout = rdev->sb_page; 9062 struct mdp_superblock_1 *sb; 9063 9064 /* Store the sb page of the rdev in the swapout temporary 9065 * variable in case we err in the future 9066 */ 9067 rdev->sb_page = NULL; 9068 err = alloc_disk_sb(rdev); 9069 if (err == 0) { 9070 ClearPageUptodate(rdev->sb_page); 9071 rdev->sb_loaded = 0; 9072 err = super_types[mddev->major_version]. 9073 load_super(rdev, NULL, mddev->minor_version); 9074 } 9075 if (err < 0) { 9076 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 9077 __func__, __LINE__, rdev->desc_nr, err); 9078 if (rdev->sb_page) 9079 put_page(rdev->sb_page); 9080 rdev->sb_page = swapout; 9081 rdev->sb_loaded = 1; 9082 return err; 9083 } 9084 9085 sb = page_address(rdev->sb_page); 9086 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 9087 * is not set 9088 */ 9089 9090 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 9091 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 9092 9093 /* The other node finished recovery, call spare_active to set 9094 * device In_sync and mddev->degraded 9095 */ 9096 if (rdev->recovery_offset == MaxSector && 9097 !test_bit(In_sync, &rdev->flags) && 9098 mddev->pers->spare_active(mddev)) 9099 sysfs_notify(&mddev->kobj, NULL, "degraded"); 9100 9101 put_page(swapout); 9102 return 0; 9103 } 9104 9105 void md_reload_sb(struct mddev *mddev, int nr) 9106 { 9107 struct md_rdev *rdev; 9108 int err; 9109 9110 /* Find the rdev */ 9111 
rdev_for_each_rcu(rdev, mddev) { 9112 if (rdev->desc_nr == nr) 9113 break; 9114 } 9115 9116 if (!rdev || rdev->desc_nr != nr) { 9117 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 9118 return; 9119 } 9120 9121 err = read_rdev(mddev, rdev); 9122 if (err < 0) 9123 return; 9124 9125 check_sb_changes(mddev, rdev); 9126 9127 /* Read all rdev's to update recovery_offset */ 9128 rdev_for_each_rcu(rdev, mddev) 9129 read_rdev(mddev, rdev); 9130 } 9131 EXPORT_SYMBOL(md_reload_sb); 9132 9133 #ifndef MODULE 9134 9135 /* 9136 * Searches all registered partitions for autorun RAID arrays 9137 * at boot time. 9138 */ 9139 9140 static DEFINE_MUTEX(detected_devices_mutex); 9141 static LIST_HEAD(all_detected_devices); 9142 struct detected_devices_node { 9143 struct list_head list; 9144 dev_t dev; 9145 }; 9146 9147 void md_autodetect_dev(dev_t dev) 9148 { 9149 struct detected_devices_node *node_detected_dev; 9150 9151 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 9152 if (node_detected_dev) { 9153 node_detected_dev->dev = dev; 9154 mutex_lock(&detected_devices_mutex); 9155 list_add_tail(&node_detected_dev->list, &all_detected_devices); 9156 mutex_unlock(&detected_devices_mutex); 9157 } 9158 } 9159 9160 static void autostart_arrays(int part) 9161 { 9162 struct md_rdev *rdev; 9163 struct detected_devices_node *node_detected_dev; 9164 dev_t dev; 9165 int i_scanned, i_passed; 9166 9167 i_scanned = 0; 9168 i_passed = 0; 9169 9170 pr_info("md: Autodetecting RAID arrays.\n"); 9171 9172 mutex_lock(&detected_devices_mutex); 9173 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 9174 i_scanned++; 9175 node_detected_dev = list_entry(all_detected_devices.next, 9176 struct detected_devices_node, list); 9177 list_del(&node_detected_dev->list); 9178 dev = node_detected_dev->dev; 9179 kfree(node_detected_dev); 9180 mutex_unlock(&detected_devices_mutex); 9181 rdev = md_import_device(dev,0, 90); 9182 
mutex_lock(&detected_devices_mutex); 9183 if (IS_ERR(rdev)) 9184 continue; 9185 9186 if (test_bit(Faulty, &rdev->flags)) 9187 continue; 9188 9189 set_bit(AutoDetected, &rdev->flags); 9190 list_add(&rdev->same_set, &pending_raid_disks); 9191 i_passed++; 9192 } 9193 mutex_unlock(&detected_devices_mutex); 9194 9195 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 9196 9197 autorun_devices(part); 9198 } 9199 9200 #endif /* !MODULE */ 9201 9202 static __exit void md_exit(void) 9203 { 9204 struct mddev *mddev; 9205 struct list_head *tmp; 9206 int delay = 1; 9207 9208 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 9209 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 9210 9211 unregister_blkdev(MD_MAJOR,"md"); 9212 unregister_blkdev(mdp_major, "mdp"); 9213 unregister_reboot_notifier(&md_notifier); 9214 unregister_sysctl_table(raid_table_header); 9215 9216 /* We cannot unload the modules while some process is 9217 * waiting for us in select() or poll() - wake them up 9218 */ 9219 md_unloading = 1; 9220 while (waitqueue_active(&md_event_waiters)) { 9221 /* not safe to leave yet */ 9222 wake_up(&md_event_waiters); 9223 msleep(delay); 9224 delay += delay; 9225 } 9226 remove_proc_entry("mdstat", NULL); 9227 9228 for_each_mddev(mddev, tmp) { 9229 export_array(mddev); 9230 mddev->ctime = 0; 9231 mddev->hold_active = 0; 9232 /* 9233 * for_each_mddev() will call mddev_put() at the end of each 9234 * iteration. As the mddev is now fully clear, this will 9235 * schedule the mddev for destruction by a workqueue, and the 9236 * destroy_workqueue() below will wait for that to complete. 
9237 */ 9238 } 9239 destroy_workqueue(md_misc_wq); 9240 destroy_workqueue(md_wq); 9241 } 9242 9243 subsys_initcall(md_init); 9244 module_exit(md_exit) 9245 9246 static int get_ro(char *buffer, struct kernel_param *kp) 9247 { 9248 return sprintf(buffer, "%d", start_readonly); 9249 } 9250 static int set_ro(const char *val, struct kernel_param *kp) 9251 { 9252 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 9253 } 9254 9255 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9256 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9257 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9258 module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 9259 9260 MODULE_LICENSE("GPL"); 9261 MODULE_DESCRIPTION("MD RAID framework"); 9262 MODULE_ALIAS("md"); 9263 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9264