// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
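 * The default can be overridden per array through sysfs (max_read_errors).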
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals with BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist yet */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serialization if it meets the conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resources for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
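 *
 * The array is suspended around the update unless the caller has already
 * suspended it (is_suspend == true).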
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			      bool is_suspend)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!is_suspend)
		mddev_suspend(mddev);

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		goto abort;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}

abort:
	if (!is_suspend)
		mddev_resume(mddev);
}

/*
 * Free resources from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			       bool is_suspend)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		if (!is_suspend)
			mddev_suspend(mddev);
		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
		if (!is_suspend)
			mddev_resume(mddev);
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
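 * (md_new_event() below bumps the counter and wakes any pollers.)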
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to the underlying layer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
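 * Calls nest: ->suspended is a counter, so only the first mddev_suspend()
 * quiesces I/O and only the matching final mddev_resume() restarts it.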
 */
void mddev_suspend(struct mddev *mddev)
{
	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
			lockdep_is_held(&mddev->reconfig_mutex));

	WARN_ON_ONCE(thread && current == thread->tsk);
	if (mddev->suspended++)
		return;
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	percpu_ref_kill(&mddev->active_io);

	if (mddev->pers && mddev->pers->prepare_suspend)
		mddev->pers->prepare_suspend(mddev);

	wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pair is percpu_ref_get() from md_flush_request() */
		percpu_ref_put(&mddev->active_io);

		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			struct bio *bi;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_bioset(rdev->bdev, 0,
					      REQ_OP_WRITE | REQ_PREFLUSH,
					      GFP_NOIO, &mddev->bio_set);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * flush_bio must be reset before calling into md_handle_request to
	 * avoid a deadlock: other bios that have passed the suspend check in
	 * md_handle_request could wait for this flush, while the
	 * md_handle_request call below could wait for those bios because of
	 * the suspend check.
	 */
	spin_lock_irq(&mddev->lock);
	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	spin_unlock_irq(&mddev->lock);
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 * being finished in another context.  Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* flush requests wait until ongoing flush completes,
	 * hence coalescing all the pending requests.
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		/*
		 * Grab a reference to make sure mddev_suspend() will wait for
		 * this flush to be done.
		 *
		 * md_flush_request() is called under md_handle_request() and
		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
		 * won't pass, percpu_ref_tryget_live() can't be used because
		 * percpu_ref_kill() can be called by mddev_suspend()
		 * concurrently.
		 */
		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
		percpu_ref_get(&mddev->active_io);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		set_bit(MD_DELETED, &mddev->flags);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->sync_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = 0;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;		/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);
	mddev_init(new);

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_free_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_free_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;
out_free_new:
	spin_unlock(&all_mddevs_lock);
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		const struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				if (mddev->sysfs_completed)
					sysfs_put(mddev->sysfs_completed);
				if (mddev->sysfs_degraded)
					sysfs_put(mddev->sysfs_degraded);
				mddev->sysfs_action = NULL;
				mddev->sysfs_completed = NULL;
				mddev->sysfs_degraded = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);

	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
		list_del_init(&rdev->same_set);
		kobject_del(&rdev->kobj);
		export_rdev(rdev, mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: %s gets error=%d\n", __func__,
		       blk_status_to_errno(bio->bi_status));
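		/*
		 * md_error() decides whether the device is failed; if it
		 * survives (e.g. it is the last working device) and this was
		 * a failfast write, request a plain superblock rewrite
		 * instead of failing the array.
		 */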
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
			       1,
			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
			       GFP_NOIO, &mddev->sync_set);

	atomic_inc(&rdev->nr_pending);

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		bio->bi_opf |= MD_FAILFAST;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct bio bio;
	struct bio_vec bvec;

	if (metadata_op && rdev->meta_bdev)
		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
	else
		bio_init(&bio, rdev->bdev, &bvec, 1, opf);

	if (metadata_op)
		bio.bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio.bi_iter.bi_sector = sector + rdev->data_offset;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %pg, could not read superblock.\n",
	       rdev->bdev);
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *freshest,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
		(rdev->desc_nr >= 0 &&
		 rdev->desc_nr < MD_SB_DISKS &&
		 sb->disks[rdev->desc_nr].state &
			 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ...
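		 * the superblock records a component size larger than the
		 * device itself, so this superblock cannot be trusted.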
		 */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 * note: we are not using "freshest" for 0.9 superblock
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
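		 * (Bitmap_sync below marks it as still needing a
		 * bitmap-based resync.)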
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask =
		queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
		(rdev->desc_nr >= 0 &&
		rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb
		= page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Similar to mdadm, we allow event counter difference of 1
		 * from the freshest device.
		 */
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 + 1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
			/*
			 * If we are assembling, and our event counter is smaller than the
			 * highest event counter, we cannot trust our superblock about the role.
			 * It could happen that our rdev was marked as Faulty, and all other
			 * superblocks were updated with +1 event counter.
			 * Then, before the next superblock update, which typically happens when
			 * remove_and_add_spares() removes the device from the array, there was
			 * a crash or reboot.
			 * If we allow current rdev without consulting the freshest superblock,
			 * we could cause data corruption.
			 * Note that in this case our event counter is smaller by 1 than the
			 * highest, otherwise, this rdev would not be allowed into array;
			 * both kernel and mdadm allow event counter difference of 1.
1916 */ 1917 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 1918 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 1919 1920 if (rdev->desc_nr >= freshest_max_dev) { 1921 /* this is unexpected, better not proceed */ 1922 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 1923 mdname(mddev), rdev->bdev, rdev->desc_nr, 1924 freshest->bdev, freshest_max_dev); 1925 return -EUCLEAN; 1926 } 1927 1928 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 1929 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 1930 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 1931 } else { 1932 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1933 } 1934 switch(role) { 1935 case MD_DISK_ROLE_SPARE: /* spare */ 1936 break; 1937 case MD_DISK_ROLE_FAULTY: /* faulty */ 1938 set_bit(Faulty, &rdev->flags); 1939 break; 1940 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1941 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1942 /* journal device without journal feature */ 1943 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1944 return -EINVAL; 1945 } 1946 set_bit(Journal, &rdev->flags); 1947 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1948 rdev->raid_disk = 0; 1949 break; 1950 default: 1951 rdev->saved_raid_disk = role; 1952 if ((le32_to_cpu(sb->feature_map) & 1953 MD_FEATURE_RECOVERY_OFFSET)) { 1954 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1955 if (!(le32_to_cpu(sb->feature_map) & 1956 MD_FEATURE_RECOVERY_BITMAP)) 1957 rdev->saved_raid_disk = -1; 1958 } else { 1959 /* 1960 * If the array is FROZEN, then the device can't 1961 * be in_sync with rest of array. 1962 */ 1963 if (!test_bit(MD_RECOVERY_FROZEN, 1964 &mddev->recovery)) 1965 set_bit(In_sync, &rdev->flags); 1966 } 1967 rdev->raid_disk = role; 1968 break; 1969 } 1970 if (sb->devflags & WriteMostly1) 1971 set_bit(WriteMostly, &rdev->flags); 1972 if (sb->devflags & FailFast1) 1973 set_bit(FailFast, &rdev->flags); 1974 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1975 set_bit(Replacement, &rdev->flags); 1976 } else /* MULTIPATH are always insync */ 1977 set_bit(In_sync, &rdev->flags); 1978 1979 return 0; 1980 } 1981 1982 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1983 { 1984 struct mdp_superblock_1 *sb; 1985 struct md_rdev *rdev2; 1986 int max_dev, i; 1987 /* make rdev->sb match mddev and rdev data. 
*/ 1988 1989 sb = page_address(rdev->sb_page); 1990 1991 sb->feature_map = 0; 1992 sb->pad0 = 0; 1993 sb->recovery_offset = cpu_to_le64(0); 1994 memset(sb->pad3, 0, sizeof(sb->pad3)); 1995 1996 sb->utime = cpu_to_le64((__u64)mddev->utime); 1997 sb->events = cpu_to_le64(mddev->events); 1998 if (mddev->in_sync) 1999 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2000 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2001 sb->resync_offset = cpu_to_le64(MaxSector); 2002 else 2003 sb->resync_offset = cpu_to_le64(0); 2004 2005 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2006 2007 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2008 sb->size = cpu_to_le64(mddev->dev_sectors); 2009 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2010 sb->level = cpu_to_le32(mddev->level); 2011 sb->layout = cpu_to_le32(mddev->layout); 2012 if (test_bit(FailFast, &rdev->flags)) 2013 sb->devflags |= FailFast1; 2014 else 2015 sb->devflags &= ~FailFast1; 2016 2017 if (test_bit(WriteMostly, &rdev->flags)) 2018 sb->devflags |= WriteMostly1; 2019 else 2020 sb->devflags &= ~WriteMostly1; 2021 sb->data_offset = cpu_to_le64(rdev->data_offset); 2022 sb->data_size = cpu_to_le64(rdev->sectors); 2023 2024 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2025 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2026 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2027 } 2028 2029 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2030 !test_bit(In_sync, &rdev->flags)) { 2031 sb->feature_map |= 2032 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2033 sb->recovery_offset = 2034 cpu_to_le64(rdev->recovery_offset); 2035 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2036 sb->feature_map |= 2037 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2038 } 2039 /* Note: recovery_offset and journal_tail share space */ 2040 if (test_bit(Journal, &rdev->flags)) 2041 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2042 if (test_bit(Replacement, &rdev->flags)) 2043 sb->feature_map |= 2044 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2045 2046 if (mddev->reshape_position != MaxSector) { 2047 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2048 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2049 sb->new_layout = cpu_to_le32(mddev->new_layout); 2050 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2051 sb->new_level = cpu_to_le32(mddev->new_level); 2052 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2053 if (mddev->delta_disks == 0 && 2054 mddev->reshape_backwards) 2055 sb->feature_map 2056 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2057 if (rdev->new_data_offset != rdev->data_offset) { 2058 sb->feature_map 2059 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2060 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2061 - rdev->data_offset)); 2062 } 2063 } 2064 2065 if (mddev_is_clustered(mddev)) 2066 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2067 2068 if (rdev->badblocks.count == 0) 2069 /* Nothing to do for bad blocks*/ ; 2070 else if (sb->bblog_offset == 0) 2071 /* Cannot record bad blocks on this device */ 2072 md_error(mddev, rdev); 2073 else { 2074 struct badblocks *bb = &rdev->badblocks; 2075 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2076 u64 *p = bb->page; 2077 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2078 if (bb->changed) { 2079 unsigned seq; 2080 2081 retry: 2082 seq = read_seqbegin(&bb->lock); 2083 2084 memset(bbp, 0xff, PAGE_SIZE); 2085 2086 for (i = 0 ; i < bb->count ; i++) { 2087 u64 
internal_bb = p[i]; 2088 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2089 | BB_LEN(internal_bb)); 2090 bbp[i] = cpu_to_le64(store_bb); 2091 } 2092 bb->changed = 0; 2093 if (read_seqretry(&bb->lock, seq)) 2094 goto retry; 2095 2096 bb->sector = (rdev->sb_start + 2097 (int)le32_to_cpu(sb->bblog_offset)); 2098 bb->size = le16_to_cpu(sb->bblog_size); 2099 } 2100 } 2101 2102 max_dev = 0; 2103 rdev_for_each(rdev2, mddev) 2104 if (rdev2->desc_nr+1 > max_dev) 2105 max_dev = rdev2->desc_nr+1; 2106 2107 if (max_dev > le32_to_cpu(sb->max_dev)) { 2108 int bmask; 2109 sb->max_dev = cpu_to_le32(max_dev); 2110 rdev->sb_size = max_dev * 2 + 256; 2111 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2112 if (rdev->sb_size & bmask) 2113 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2114 } else 2115 max_dev = le32_to_cpu(sb->max_dev); 2116 2117 for (i=0; i<max_dev;i++) 2118 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2119 2120 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2121 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2122 2123 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2124 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2125 sb->feature_map |= 2126 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2127 else 2128 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2129 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2130 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2131 } 2132 2133 rdev_for_each(rdev2, mddev) { 2134 i = rdev2->desc_nr; 2135 if (test_bit(Faulty, &rdev2->flags)) 2136 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2137 else if (test_bit(In_sync, &rdev2->flags)) 2138 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2139 else if (test_bit(Journal, &rdev2->flags)) 2140 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2141 else if (rdev2->raid_disk >= 0) 2142 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2143 else 2144 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2145 } 2146 2147 sb->sb_csum = calc_sb_1_csum(sb); 2148 } 2149 2150 static sector_t super_1_choose_bm_space(sector_t dev_size) 2151 { 2152 sector_t bm_space; 2153 2154 /* if the device is bigger than 8Gig, save 64k for bitmap 2155 * usage, if bigger than 200Gig, save 128k 2156 */ 2157 if (dev_size < 64*2) 2158 bm_space = 0; 2159 else if (dev_size - 64*2 >= 200*1024*1024*2) 2160 bm_space = 128*2; 2161 else if (dev_size - 4*2 > 8*1024*1024*2) 2162 bm_space = 64*2; 2163 else 2164 bm_space = 4*2; 2165 return bm_space; 2166 } 2167 2168 static unsigned long long 2169 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2170 { 2171 struct mdp_superblock_1 *sb; 2172 sector_t max_sectors; 2173 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2174 return 0; /* component must fit device */ 2175 if (rdev->data_offset != rdev->new_data_offset) 2176 return 0; /* too confusing */ 2177 if (rdev->sb_start < rdev->data_offset) { 2178 /* minor versions 1 and 2; superblock before data */ 2179 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2180 if (!num_sectors || num_sectors > max_sectors) 2181 num_sectors = max_sectors; 2182 } else if (rdev->mddev->bitmap_info.offset) { 2183 /* minor version 0 with bitmap we can't move */ 2184 return 0; 2185 } else { 2186 /* minor version 0; superblock after data */ 2187 sector_t sb_start, bm_space; 2188 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2189 2190 /* 8K is for superblock */ 2191 sb_start = dev_size - 8*2; 2192 sb_start &= ~(sector_t)(4*2 - 1); 2193 2194 bm_space = super_1_choose_bm_space(dev_size); 2195 2196 /* Space that 
can be used to store date needs to decrease 2197 * superblock bitmap space and bad block space(4K) 2198 */ 2199 max_sectors = sb_start - bm_space - 4*2; 2200 2201 if (!num_sectors || num_sectors > max_sectors) 2202 num_sectors = max_sectors; 2203 rdev->sb_start = sb_start; 2204 } 2205 sb = page_address(rdev->sb_page); 2206 sb->data_size = cpu_to_le64(num_sectors); 2207 sb->super_offset = cpu_to_le64(rdev->sb_start); 2208 sb->sb_csum = calc_sb_1_csum(sb); 2209 do { 2210 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2211 rdev->sb_page); 2212 } while (md_super_wait(rdev->mddev) < 0); 2213 return num_sectors; 2214 2215 } 2216 2217 static int 2218 super_1_allow_new_offset(struct md_rdev *rdev, 2219 unsigned long long new_offset) 2220 { 2221 /* All necessary checks on new >= old have been done */ 2222 struct bitmap *bitmap; 2223 if (new_offset >= rdev->data_offset) 2224 return 1; 2225 2226 /* with 1.0 metadata, there is no metadata to tread on 2227 * so we can always move back */ 2228 if (rdev->mddev->minor_version == 0) 2229 return 1; 2230 2231 /* otherwise we must be sure not to step on 2232 * any metadata, so stay: 2233 * 36K beyond start of superblock 2234 * beyond end of badblocks 2235 * beyond write-intent bitmap 2236 */ 2237 if (rdev->sb_start + (32+4)*2 > new_offset) 2238 return 0; 2239 bitmap = rdev->mddev->bitmap; 2240 if (bitmap && !rdev->mddev->bitmap_info.file && 2241 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2242 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2243 return 0; 2244 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2245 return 0; 2246 2247 return 1; 2248 } 2249 2250 static struct super_type super_types[] = { 2251 [0] = { 2252 .name = "0.90.0", 2253 .owner = THIS_MODULE, 2254 .load_super = super_90_load, 2255 .validate_super = super_90_validate, 2256 .sync_super = super_90_sync, 2257 .rdev_size_change = super_90_rdev_size_change, 2258 .allow_new_offset = super_90_allow_new_offset, 2259 }, 2260 [1] = { 2261 .name = "md-1", 2262 .owner = THIS_MODULE, 2263 .load_super = super_1_load, 2264 .validate_super = super_1_validate, 2265 .sync_super = super_1_sync, 2266 .rdev_size_change = super_1_rdev_size_change, 2267 .allow_new_offset = super_1_allow_new_offset, 2268 }, 2269 }; 2270 2271 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2272 { 2273 if (mddev->sync_super) { 2274 mddev->sync_super(mddev, rdev); 2275 return; 2276 } 2277 2278 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2279 2280 super_types[mddev->major_version].sync_super(mddev, rdev); 2281 } 2282 2283 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2284 { 2285 struct md_rdev *rdev, *rdev2; 2286 2287 rcu_read_lock(); 2288 rdev_for_each_rcu(rdev, mddev1) { 2289 if (test_bit(Faulty, &rdev->flags) || 2290 test_bit(Journal, &rdev->flags) || 2291 rdev->raid_disk == -1) 2292 continue; 2293 rdev_for_each_rcu(rdev2, mddev2) { 2294 if (test_bit(Faulty, &rdev2->flags) || 2295 test_bit(Journal, &rdev2->flags) || 2296 rdev2->raid_disk == -1) 2297 continue; 2298 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2299 rcu_read_unlock(); 2300 return 1; 2301 } 2302 } 2303 } 2304 rcu_read_unlock(); 2305 return 0; 2306 } 2307 2308 static LIST_HEAD(pending_raid_disks); 2309 2310 /* 2311 * Try to register data integrity profile for an mddev 2312 * 2313 * This is called when an array is started and after a disk has been kicked 2314 * from the array. 
It only succeeds if all working and active component devices 2315 * are integrity capable with matching profiles. 2316 */ 2317 int md_integrity_register(struct mddev *mddev) 2318 { 2319 struct md_rdev *rdev, *reference = NULL; 2320 2321 if (list_empty(&mddev->disks)) 2322 return 0; /* nothing to do */ 2323 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2324 return 0; /* shouldn't register, or already is */ 2325 rdev_for_each(rdev, mddev) { 2326 /* skip spares and non-functional disks */ 2327 if (test_bit(Faulty, &rdev->flags)) 2328 continue; 2329 if (rdev->raid_disk < 0) 2330 continue; 2331 if (!reference) { 2332 /* Use the first rdev as the reference */ 2333 reference = rdev; 2334 continue; 2335 } 2336 /* does this rdev's profile match the reference profile? */ 2337 if (blk_integrity_compare(reference->bdev->bd_disk, 2338 rdev->bdev->bd_disk) < 0) 2339 return -EINVAL; 2340 } 2341 if (!reference || !bdev_get_integrity(reference->bdev)) 2342 return 0; 2343 /* 2344 * All component devices are integrity capable and have matching 2345 * profiles, register the common profile for the md device. 2346 */ 2347 blk_integrity_register(mddev->gendisk, 2348 bdev_get_integrity(reference->bdev)); 2349 2350 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2351 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2352 (mddev->level != 1 && mddev->level != 10 && 2353 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2354 /* 2355 * No need to handle the failure of bioset_integrity_create, 2356 * because the function is called by md_run() -> pers->run(), 2357 * md_run calls bioset_exit -> bioset_integrity_free in case 2358 * of failure case. 2359 */ 2360 pr_err("md: failed to create integrity pool for %s\n", 2361 mdname(mddev)); 2362 return -EINVAL; 2363 } 2364 return 0; 2365 } 2366 EXPORT_SYMBOL(md_integrity_register); 2367 2368 /* 2369 * Attempt to add an rdev, but only if it is consistent with the current 2370 * integrity profile 2371 */ 2372 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2373 { 2374 struct blk_integrity *bi_mddev; 2375 2376 if (!mddev->gendisk) 2377 return 0; 2378 2379 bi_mddev = blk_get_integrity(mddev->gendisk); 2380 2381 if (!bi_mddev) /* nothing to do */ 2382 return 0; 2383 2384 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2385 pr_err("%s: incompatible integrity profile for %pg\n", 2386 mdname(mddev), rdev->bdev); 2387 return -ENXIO; 2388 } 2389 2390 return 0; 2391 } 2392 EXPORT_SYMBOL(md_integrity_add_rdev); 2393 2394 static bool rdev_read_only(struct md_rdev *rdev) 2395 { 2396 return bdev_read_only(rdev->bdev) || 2397 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2398 } 2399 2400 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2401 { 2402 char b[BDEVNAME_SIZE]; 2403 int err; 2404 2405 /* prevent duplicates */ 2406 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2407 return -EEXIST; 2408 2409 if (rdev_read_only(rdev) && mddev->pers) 2410 return -EROFS; 2411 2412 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2413 if (!test_bit(Journal, &rdev->flags) && 2414 rdev->sectors && 2415 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2416 if (mddev->pers) { 2417 /* Cannot change size, so fail 2418 * If mddev->level <= 0, then we don't care 2419 * about aligning sizes (e.g. linear) 2420 */ 2421 if (mddev->level > 0) 2422 return -ENOSPC; 2423 } else 2424 mddev->dev_sectors = rdev->sectors; 2425 } 2426 2427 /* Verify rdev->desc_nr is unique. 
2428 * If it is -1, assign a free number, else 2429 * check number is not in use 2430 */ 2431 rcu_read_lock(); 2432 if (rdev->desc_nr < 0) { 2433 int choice = 0; 2434 if (mddev->pers) 2435 choice = mddev->raid_disks; 2436 while (md_find_rdev_nr_rcu(mddev, choice)) 2437 choice++; 2438 rdev->desc_nr = choice; 2439 } else { 2440 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2441 rcu_read_unlock(); 2442 return -EBUSY; 2443 } 2444 } 2445 rcu_read_unlock(); 2446 if (!test_bit(Journal, &rdev->flags) && 2447 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2448 pr_warn("md: %s: array is limited to %d devices\n", 2449 mdname(mddev), mddev->max_disks); 2450 return -EBUSY; 2451 } 2452 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2453 strreplace(b, '/', '!'); 2454 2455 rdev->mddev = mddev; 2456 pr_debug("md: bind<%s>\n", b); 2457 2458 if (mddev->raid_disks) 2459 mddev_create_serial_pool(mddev, rdev, false); 2460 2461 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2462 goto fail; 2463 2464 /* failure here is OK */ 2465 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2466 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2467 rdev->sysfs_unack_badblocks = 2468 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2469 rdev->sysfs_badblocks = 2470 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2471 2472 list_add_rcu(&rdev->same_set, &mddev->disks); 2473 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2474 2475 /* May as well allow recovery to be retried once */ 2476 mddev->recovery_disabled++; 2477 2478 return 0; 2479 2480 fail: 2481 pr_warn("md: failed to register dev-%s for %s\n", 2482 b, mdname(mddev)); 2483 return err; 2484 } 2485 2486 void md_autodetect_dev(dev_t dev); 2487 2488 /* just for claiming the bdev */ 2489 static struct md_rdev claim_rdev; 2490 2491 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2492 { 2493 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2494 md_rdev_clear(rdev); 2495 #ifndef MODULE 2496 if (test_bit(AutoDetected, &rdev->flags)) 2497 md_autodetect_dev(rdev->bdev->bd_dev); 2498 #endif 2499 blkdev_put(rdev->bdev, 2500 test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); 2501 rdev->bdev = NULL; 2502 kobject_put(&rdev->kobj); 2503 } 2504 2505 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2506 { 2507 struct mddev *mddev = rdev->mddev; 2508 2509 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2510 list_del_rcu(&rdev->same_set); 2511 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2512 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2513 rdev->mddev = NULL; 2514 sysfs_remove_link(&rdev->kobj, "block"); 2515 sysfs_put(rdev->sysfs_state); 2516 sysfs_put(rdev->sysfs_unack_badblocks); 2517 sysfs_put(rdev->sysfs_badblocks); 2518 rdev->sysfs_state = NULL; 2519 rdev->sysfs_unack_badblocks = NULL; 2520 rdev->sysfs_badblocks = NULL; 2521 rdev->badblocks.count = 0; 2522 2523 synchronize_rcu(); 2524 2525 /* 2526 * kobject_del() will wait for all in progress writers to be done, where 2527 * reconfig_mutex is held, hence it can't be called under 2528 * reconfig_mutex and it's delayed to mddev_unlock(). 
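 * Instead the rdev is parked on the mddev->deleting list (below) and
 * cleaned up after the lock has been released.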
2529 */ 2530 list_add(&rdev->same_set, &mddev->deleting); 2531 } 2532 2533 static void export_array(struct mddev *mddev) 2534 { 2535 struct md_rdev *rdev; 2536 2537 while (!list_empty(&mddev->disks)) { 2538 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2539 same_set); 2540 md_kick_rdev_from_array(rdev); 2541 } 2542 mddev->raid_disks = 0; 2543 mddev->major_version = 0; 2544 } 2545 2546 static bool set_in_sync(struct mddev *mddev) 2547 { 2548 lockdep_assert_held(&mddev->lock); 2549 if (!mddev->in_sync) { 2550 mddev->sync_checkers++; 2551 spin_unlock(&mddev->lock); 2552 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2553 spin_lock(&mddev->lock); 2554 if (!mddev->in_sync && 2555 percpu_ref_is_zero(&mddev->writes_pending)) { 2556 mddev->in_sync = 1; 2557 /* 2558 * Ensure ->in_sync is visible before we clear 2559 * ->sync_checkers. 2560 */ 2561 smp_mb(); 2562 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2563 sysfs_notify_dirent_safe(mddev->sysfs_state); 2564 } 2565 if (--mddev->sync_checkers == 0) 2566 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2567 } 2568 if (mddev->safemode == 1) 2569 mddev->safemode = 0; 2570 return mddev->in_sync; 2571 } 2572 2573 static void sync_sbs(struct mddev *mddev, int nospares) 2574 { 2575 /* Update each superblock (in-memory image), but 2576 * if we are allowed to, skip spares which already 2577 * have the right event counter, or have one earlier 2578 * (which would mean they aren't being marked as dirty 2579 * with the rest of the array) 2580 */ 2581 struct md_rdev *rdev; 2582 rdev_for_each(rdev, mddev) { 2583 if (rdev->sb_events == mddev->events || 2584 (nospares && 2585 rdev->raid_disk < 0 && 2586 rdev->sb_events+1 == mddev->events)) { 2587 /* Don't update this superblock */ 2588 rdev->sb_loaded = 2; 2589 } else { 2590 sync_super(mddev, rdev); 2591 rdev->sb_loaded = 1; 2592 } 2593 } 2594 } 2595 2596 static bool does_sb_need_changing(struct mddev *mddev) 2597 { 2598 struct md_rdev *rdev = NULL, *iter; 2599 struct mdp_superblock_1 *sb; 2600 int role; 2601 2602 /* Find a good rdev */ 2603 rdev_for_each(iter, mddev) 2604 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2605 rdev = iter; 2606 break; 2607 } 2608 2609 /* No good device found. */ 2610 if (!rdev) 2611 return false; 2612 2613 sb = page_address(rdev->sb_page); 2614 /* Check if a device has become faulty or a spare become active */ 2615 rdev_for_each(rdev, mddev) { 2616 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2617 /* Device activated? */ 2618 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2619 !test_bit(Faulty, &rdev->flags)) 2620 return true; 2621 /* Device turned faulty? 
*/ 2622 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2623 return true; 2624 } 2625 2626 /* Check if any mddev parameters have changed */ 2627 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2628 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2629 (mddev->layout != le32_to_cpu(sb->layout)) || 2630 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2631 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2632 return true; 2633 2634 return false; 2635 } 2636 2637 void md_update_sb(struct mddev *mddev, int force_change) 2638 { 2639 struct md_rdev *rdev; 2640 int sync_req; 2641 int nospares = 0; 2642 int any_badblocks_changed = 0; 2643 int ret = -1; 2644 2645 if (!md_is_rdwr(mddev)) { 2646 if (force_change) 2647 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2648 return; 2649 } 2650 2651 repeat: 2652 if (mddev_is_clustered(mddev)) { 2653 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2654 force_change = 1; 2655 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2656 nospares = 1; 2657 ret = md_cluster_ops->metadata_update_start(mddev); 2658 /* Has someone else has updated the sb */ 2659 if (!does_sb_need_changing(mddev)) { 2660 if (ret == 0) 2661 md_cluster_ops->metadata_update_cancel(mddev); 2662 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2663 BIT(MD_SB_CHANGE_DEVS) | 2664 BIT(MD_SB_CHANGE_CLEAN)); 2665 return; 2666 } 2667 } 2668 2669 /* 2670 * First make sure individual recovery_offsets are correct 2671 * curr_resync_completed can only be used during recovery. 2672 * During reshape/resync it might use array-addresses rather 2673 * that device addresses. 2674 */ 2675 rdev_for_each(rdev, mddev) { 2676 if (rdev->raid_disk >= 0 && 2677 mddev->delta_disks >= 0 && 2678 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2679 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2680 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2681 !test_bit(Journal, &rdev->flags) && 2682 !test_bit(In_sync, &rdev->flags) && 2683 mddev->curr_resync_completed > rdev->recovery_offset) 2684 rdev->recovery_offset = mddev->curr_resync_completed; 2685 2686 } 2687 if (!mddev->persistent) { 2688 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2689 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2690 if (!mddev->external) { 2691 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2692 rdev_for_each(rdev, mddev) { 2693 if (rdev->badblocks.changed) { 2694 rdev->badblocks.changed = 0; 2695 ack_all_badblocks(&rdev->badblocks); 2696 md_error(mddev, rdev); 2697 } 2698 clear_bit(Blocked, &rdev->flags); 2699 clear_bit(BlockedBadBlocks, &rdev->flags); 2700 wake_up(&rdev->blocked_wait); 2701 } 2702 } 2703 wake_up(&mddev->sb_wait); 2704 return; 2705 } 2706 2707 spin_lock(&mddev->lock); 2708 2709 mddev->utime = ktime_get_real_seconds(); 2710 2711 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2712 force_change = 1; 2713 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2714 /* just a clean<-> dirty transition, possibly leave spares alone, 2715 * though if events isn't the right even/odd, we will have to do 2716 * spares after all 2717 */ 2718 nospares = 1; 2719 if (force_change) 2720 nospares = 0; 2721 if (mddev->degraded) 2722 /* If the array is degraded, then skipping spares is both 2723 * dangerous and fairly pointless. 2724 * Dangerous because a device that was removed from the array 2725 * might have a event_count that still looks up-to-date, 2726 * so it can be re-added without a resync. 
2727 * Pointless because if there are any spares to skip, 2728 * then a recovery will happen and soon that array won't 2729 * be degraded any more and the spare can go back to sleep then. 2730 */ 2731 nospares = 0; 2732 2733 sync_req = mddev->in_sync; 2734 2735 /* If this is just a dirty<->clean transition, and the array is clean 2736 * and 'events' is odd, we can roll back to the previous clean state */ 2737 if (nospares 2738 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2739 && mddev->can_decrease_events 2740 && mddev->events != 1) { 2741 mddev->events--; 2742 mddev->can_decrease_events = 0; 2743 } else { 2744 /* otherwise we have to go forward and ... */ 2745 mddev->events ++; 2746 mddev->can_decrease_events = nospares; 2747 } 2748 2749 /* 2750 * This 64-bit counter should never wrap. 2751 * Either we are in around ~1 trillion A.C., assuming 2752 * 1 reboot per second, or we have a bug... 2753 */ 2754 WARN_ON(mddev->events == 0); 2755 2756 rdev_for_each(rdev, mddev) { 2757 if (rdev->badblocks.changed) 2758 any_badblocks_changed++; 2759 if (test_bit(Faulty, &rdev->flags)) 2760 set_bit(FaultRecorded, &rdev->flags); 2761 } 2762 2763 sync_sbs(mddev, nospares); 2764 spin_unlock(&mddev->lock); 2765 2766 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2767 mdname(mddev), mddev->in_sync); 2768 2769 if (mddev->queue) 2770 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2771 rewrite: 2772 md_bitmap_update_sb(mddev->bitmap); 2773 rdev_for_each(rdev, mddev) { 2774 if (rdev->sb_loaded != 1) 2775 continue; /* no noise on spare devices */ 2776 2777 if (!test_bit(Faulty, &rdev->flags)) { 2778 md_super_write(mddev,rdev, 2779 rdev->sb_start, rdev->sb_size, 2780 rdev->sb_page); 2781 pr_debug("md: (write) %pg's sb offset: %llu\n", 2782 rdev->bdev, 2783 (unsigned long long)rdev->sb_start); 2784 rdev->sb_events = mddev->events; 2785 if (rdev->badblocks.size) { 2786 md_super_write(mddev, rdev, 2787 rdev->badblocks.sector, 2788 rdev->badblocks.size << 9, 2789 rdev->bb_page); 2790 rdev->badblocks.size = 0; 2791 } 2792 2793 } else 2794 pr_debug("md: %pg (skipping faulty)\n", 2795 rdev->bdev); 2796 2797 if (mddev->level == LEVEL_MULTIPATH) 2798 /* only need to write one superblock... 
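 * (MULTIPATH members are just different paths to the same storage, so
 * writing one copy presumably reaches them all)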
*/ 2799 break; 2800 } 2801 if (md_super_wait(mddev) < 0) 2802 goto rewrite; 2803 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2804 2805 if (mddev_is_clustered(mddev) && ret == 0) 2806 md_cluster_ops->metadata_update_finish(mddev); 2807 2808 if (mddev->in_sync != sync_req || 2809 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2810 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2811 /* have to write it out again */ 2812 goto repeat; 2813 wake_up(&mddev->sb_wait); 2814 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2815 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2816 2817 rdev_for_each(rdev, mddev) { 2818 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2819 clear_bit(Blocked, &rdev->flags); 2820 2821 if (any_badblocks_changed) 2822 ack_all_badblocks(&rdev->badblocks); 2823 clear_bit(BlockedBadBlocks, &rdev->flags); 2824 wake_up(&rdev->blocked_wait); 2825 } 2826 } 2827 EXPORT_SYMBOL(md_update_sb); 2828 2829 static int add_bound_rdev(struct md_rdev *rdev) 2830 { 2831 struct mddev *mddev = rdev->mddev; 2832 int err = 0; 2833 bool add_journal = test_bit(Journal, &rdev->flags); 2834 2835 if (!mddev->pers->hot_remove_disk || add_journal) { 2836 /* If there is hot_add_disk but no hot_remove_disk 2837 * then added disks for geometry changes, 2838 * and should be added immediately. 2839 */ 2840 super_types[mddev->major_version]. 2841 validate_super(mddev, NULL/*freshest*/, rdev); 2842 if (add_journal) 2843 mddev_suspend(mddev); 2844 err = mddev->pers->hot_add_disk(mddev, rdev); 2845 if (add_journal) 2846 mddev_resume(mddev); 2847 if (err) { 2848 md_kick_rdev_from_array(rdev); 2849 return err; 2850 } 2851 } 2852 sysfs_notify_dirent_safe(rdev->sysfs_state); 2853 2854 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2855 if (mddev->degraded) 2856 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2857 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2858 md_new_event(); 2859 md_wakeup_thread(mddev->thread); 2860 return 0; 2861 } 2862 2863 /* words written to sysfs files may, or may not, be \n terminated. 2864 * We want to accept with case. For this we use cmd_match. 2865 */ 2866 static int cmd_match(const char *cmd, const char *str) 2867 { 2868 /* See if cmd, written into a sysfs file, matches 2869 * str. 
They must either be the same, or cmd can 2870 * have a trailing newline 2871 */ 2872 while (*cmd && *str && *cmd == *str) { 2873 cmd++; 2874 str++; 2875 } 2876 if (*cmd == '\n') 2877 cmd++; 2878 if (*str || *cmd) 2879 return 0; 2880 return 1; 2881 } 2882 2883 struct rdev_sysfs_entry { 2884 struct attribute attr; 2885 ssize_t (*show)(struct md_rdev *, char *); 2886 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2887 }; 2888 2889 static ssize_t 2890 state_show(struct md_rdev *rdev, char *page) 2891 { 2892 char *sep = ","; 2893 size_t len = 0; 2894 unsigned long flags = READ_ONCE(rdev->flags); 2895 2896 if (test_bit(Faulty, &flags) || 2897 (!test_bit(ExternalBbl, &flags) && 2898 rdev->badblocks.unacked_exist)) 2899 len += sprintf(page+len, "faulty%s", sep); 2900 if (test_bit(In_sync, &flags)) 2901 len += sprintf(page+len, "in_sync%s", sep); 2902 if (test_bit(Journal, &flags)) 2903 len += sprintf(page+len, "journal%s", sep); 2904 if (test_bit(WriteMostly, &flags)) 2905 len += sprintf(page+len, "write_mostly%s", sep); 2906 if (test_bit(Blocked, &flags) || 2907 (rdev->badblocks.unacked_exist 2908 && !test_bit(Faulty, &flags))) 2909 len += sprintf(page+len, "blocked%s", sep); 2910 if (!test_bit(Faulty, &flags) && 2911 !test_bit(Journal, &flags) && 2912 !test_bit(In_sync, &flags)) 2913 len += sprintf(page+len, "spare%s", sep); 2914 if (test_bit(WriteErrorSeen, &flags)) 2915 len += sprintf(page+len, "write_error%s", sep); 2916 if (test_bit(WantReplacement, &flags)) 2917 len += sprintf(page+len, "want_replacement%s", sep); 2918 if (test_bit(Replacement, &flags)) 2919 len += sprintf(page+len, "replacement%s", sep); 2920 if (test_bit(ExternalBbl, &flags)) 2921 len += sprintf(page+len, "external_bbl%s", sep); 2922 if (test_bit(FailFast, &flags)) 2923 len += sprintf(page+len, "failfast%s", sep); 2924 2925 if (len) 2926 len -= strlen(sep); 2927 2928 return len+sprintf(page+len, "\n"); 2929 } 2930 2931 static ssize_t 2932 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2933 { 2934 /* can write 2935 * faulty - simulates an error 2936 * remove - disconnects the device 2937 * writemostly - sets write_mostly 2938 * -writemostly - clears write_mostly 2939 * blocked - sets the Blocked flags 2940 * -blocked - clears the Blocked and possibly simulates an error 2941 * insync - sets Insync providing device isn't active 2942 * -insync - clear Insync for a device with a slot assigned, 2943 * so that it gets rebuilt based on bitmap 2944 * write_error - sets WriteErrorSeen 2945 * -write_error - clears WriteErrorSeen 2946 * {,-}failfast - set/clear FailFast 2947 */ 2948 2949 struct mddev *mddev = rdev->mddev; 2950 int err = -EINVAL; 2951 bool need_update_sb = false; 2952 2953 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2954 md_error(rdev->mddev, rdev); 2955 2956 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2957 err = -EBUSY; 2958 else 2959 err = 0; 2960 } else if (cmd_match(buf, "remove")) { 2961 if (rdev->mddev->pers) { 2962 clear_bit(Blocked, &rdev->flags); 2963 remove_and_add_spares(rdev->mddev, rdev); 2964 } 2965 if (rdev->raid_disk >= 0) 2966 err = -EBUSY; 2967 else { 2968 err = 0; 2969 if (mddev_is_clustered(mddev)) 2970 err = md_cluster_ops->remove_disk(mddev, rdev); 2971 2972 if (err == 0) { 2973 md_kick_rdev_from_array(rdev); 2974 if (mddev->pers) { 2975 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2976 md_wakeup_thread(mddev->thread); 2977 } 2978 md_new_event(); 2979 } 2980 } 2981 } else if (cmd_match(buf, "writemostly")) { 2982 set_bit(WriteMostly, &rdev->flags); 2983 
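		/*
		 * The serial pool is maintained together with the WriteMostly
		 * flag; the "-writemostly" branch below tears it down again
		 * before clearing the flag.
		 */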
mddev_create_serial_pool(rdev->mddev, rdev, false); 2984 need_update_sb = true; 2985 err = 0; 2986 } else if (cmd_match(buf, "-writemostly")) { 2987 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2988 clear_bit(WriteMostly, &rdev->flags); 2989 need_update_sb = true; 2990 err = 0; 2991 } else if (cmd_match(buf, "blocked")) { 2992 set_bit(Blocked, &rdev->flags); 2993 err = 0; 2994 } else if (cmd_match(buf, "-blocked")) { 2995 if (!test_bit(Faulty, &rdev->flags) && 2996 !test_bit(ExternalBbl, &rdev->flags) && 2997 rdev->badblocks.unacked_exist) { 2998 /* metadata handler doesn't understand badblocks, 2999 * so we need to fail the device 3000 */ 3001 md_error(rdev->mddev, rdev); 3002 } 3003 clear_bit(Blocked, &rdev->flags); 3004 clear_bit(BlockedBadBlocks, &rdev->flags); 3005 wake_up(&rdev->blocked_wait); 3006 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3007 md_wakeup_thread(rdev->mddev->thread); 3008 3009 err = 0; 3010 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3011 set_bit(In_sync, &rdev->flags); 3012 err = 0; 3013 } else if (cmd_match(buf, "failfast")) { 3014 set_bit(FailFast, &rdev->flags); 3015 need_update_sb = true; 3016 err = 0; 3017 } else if (cmd_match(buf, "-failfast")) { 3018 clear_bit(FailFast, &rdev->flags); 3019 need_update_sb = true; 3020 err = 0; 3021 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3022 !test_bit(Journal, &rdev->flags)) { 3023 if (rdev->mddev->pers == NULL) { 3024 clear_bit(In_sync, &rdev->flags); 3025 rdev->saved_raid_disk = rdev->raid_disk; 3026 rdev->raid_disk = -1; 3027 err = 0; 3028 } 3029 } else if (cmd_match(buf, "write_error")) { 3030 set_bit(WriteErrorSeen, &rdev->flags); 3031 err = 0; 3032 } else if (cmd_match(buf, "-write_error")) { 3033 clear_bit(WriteErrorSeen, &rdev->flags); 3034 err = 0; 3035 } else if (cmd_match(buf, "want_replacement")) { 3036 /* Any non-spare device that is not a replacement can 3037 * become want_replacement at any time, but we then need to 3038 * check if recovery is needed. 3039 */ 3040 if (rdev->raid_disk >= 0 && 3041 !test_bit(Journal, &rdev->flags) && 3042 !test_bit(Replacement, &rdev->flags)) 3043 set_bit(WantReplacement, &rdev->flags); 3044 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3045 md_wakeup_thread(rdev->mddev->thread); 3046 err = 0; 3047 } else if (cmd_match(buf, "-want_replacement")) { 3048 /* Clearing 'want_replacement' is always allowed. 3049 * Once replacements starts it is too late though. 3050 */ 3051 err = 0; 3052 clear_bit(WantReplacement, &rdev->flags); 3053 } else if (cmd_match(buf, "replacement")) { 3054 /* Can only set a device as a replacement when array has not 3055 * yet been started. Once running, replacement is automatic 3056 * from spares, or by assigning 'slot'. 3057 */ 3058 if (rdev->mddev->pers) 3059 err = -EBUSY; 3060 else { 3061 set_bit(Replacement, &rdev->flags); 3062 err = 0; 3063 } 3064 } else if (cmd_match(buf, "-replacement")) { 3065 /* Similarly, can only clear Replacement before start */ 3066 if (rdev->mddev->pers) 3067 err = -EBUSY; 3068 else { 3069 clear_bit(Replacement, &rdev->flags); 3070 err = 0; 3071 } 3072 } else if (cmd_match(buf, "re-add")) { 3073 if (!rdev->mddev->pers) 3074 err = -EINVAL; 3075 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3076 rdev->saved_raid_disk >= 0) { 3077 /* clear_bit is performed _after_ all the devices 3078 * have their local Faulty bit cleared. 
If any writes 3079 * happen in the meantime in the local node, they 3080 * will land in the local bitmap, which will be synced 3081 * by this node eventually 3082 */ 3083 if (!mddev_is_clustered(rdev->mddev) || 3084 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3085 clear_bit(Faulty, &rdev->flags); 3086 err = add_bound_rdev(rdev); 3087 } 3088 } else 3089 err = -EBUSY; 3090 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3091 set_bit(ExternalBbl, &rdev->flags); 3092 rdev->badblocks.shift = 0; 3093 err = 0; 3094 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3095 clear_bit(ExternalBbl, &rdev->flags); 3096 err = 0; 3097 } 3098 if (need_update_sb) 3099 md_update_sb(mddev, 1); 3100 if (!err) 3101 sysfs_notify_dirent_safe(rdev->sysfs_state); 3102 return err ? err : len; 3103 } 3104 static struct rdev_sysfs_entry rdev_state = 3105 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3106 3107 static ssize_t 3108 errors_show(struct md_rdev *rdev, char *page) 3109 { 3110 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3111 } 3112 3113 static ssize_t 3114 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3115 { 3116 unsigned int n; 3117 int rv; 3118 3119 rv = kstrtouint(buf, 10, &n); 3120 if (rv < 0) 3121 return rv; 3122 atomic_set(&rdev->corrected_errors, n); 3123 return len; 3124 } 3125 static struct rdev_sysfs_entry rdev_errors = 3126 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3127 3128 static ssize_t 3129 slot_show(struct md_rdev *rdev, char *page) 3130 { 3131 if (test_bit(Journal, &rdev->flags)) 3132 return sprintf(page, "journal\n"); 3133 else if (rdev->raid_disk < 0) 3134 return sprintf(page, "none\n"); 3135 else 3136 return sprintf(page, "%d\n", rdev->raid_disk); 3137 } 3138 3139 static ssize_t 3140 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3141 { 3142 int slot; 3143 int err; 3144 3145 if (test_bit(Journal, &rdev->flags)) 3146 return -EBUSY; 3147 if (strncmp(buf, "none", 4)==0) 3148 slot = -1; 3149 else { 3150 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3151 if (err < 0) 3152 return err; 3153 if (slot < 0) 3154 /* overflow */ 3155 return -ENOSPC; 3156 } 3157 if (rdev->mddev->pers && slot == -1) { 3158 /* Setting 'slot' on an active array requires also 3159 * updating the 'rd%d' link, and communicating 3160 * with the personality with ->hot_*_disk. 3161 * For now we only support removing 3162 * failed/spare devices. This normally happens automatically, 3163 * but not when the metadata is externally managed. 3164 */ 3165 if (rdev->raid_disk == -1) 3166 return -EEXIST; 3167 /* personality does all needed checks */ 3168 if (rdev->mddev->pers->hot_remove_disk == NULL) 3169 return -EINVAL; 3170 clear_bit(Blocked, &rdev->flags); 3171 remove_and_add_spares(rdev->mddev, rdev); 3172 if (rdev->raid_disk >= 0) 3173 return -EBUSY; 3174 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3175 md_wakeup_thread(rdev->mddev->thread); 3176 } else if (rdev->mddev->pers) { 3177 /* Activating a spare .. or possibly reactivating 3178 * if we ever get bitmaps working here. 
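 * The device must not already own a slot and no resync/recovery may be
 * running; both conditions are checked below before ->hot_add_disk()
 * is tried.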
3179 */ 3180 int err; 3181 3182 if (rdev->raid_disk != -1) 3183 return -EBUSY; 3184 3185 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3186 return -EBUSY; 3187 3188 if (rdev->mddev->pers->hot_add_disk == NULL) 3189 return -EINVAL; 3190 3191 if (slot >= rdev->mddev->raid_disks && 3192 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3193 return -ENOSPC; 3194 3195 rdev->raid_disk = slot; 3196 if (test_bit(In_sync, &rdev->flags)) 3197 rdev->saved_raid_disk = slot; 3198 else 3199 rdev->saved_raid_disk = -1; 3200 clear_bit(In_sync, &rdev->flags); 3201 clear_bit(Bitmap_sync, &rdev->flags); 3202 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3203 if (err) { 3204 rdev->raid_disk = -1; 3205 return err; 3206 } else 3207 sysfs_notify_dirent_safe(rdev->sysfs_state); 3208 /* failure here is OK */; 3209 sysfs_link_rdev(rdev->mddev, rdev); 3210 /* don't wakeup anyone, leave that to userspace. */ 3211 } else { 3212 if (slot >= rdev->mddev->raid_disks && 3213 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3214 return -ENOSPC; 3215 rdev->raid_disk = slot; 3216 /* assume it is working */ 3217 clear_bit(Faulty, &rdev->flags); 3218 clear_bit(WriteMostly, &rdev->flags); 3219 set_bit(In_sync, &rdev->flags); 3220 sysfs_notify_dirent_safe(rdev->sysfs_state); 3221 } 3222 return len; 3223 } 3224 3225 static struct rdev_sysfs_entry rdev_slot = 3226 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3227 3228 static ssize_t 3229 offset_show(struct md_rdev *rdev, char *page) 3230 { 3231 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3232 } 3233 3234 static ssize_t 3235 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3236 { 3237 unsigned long long offset; 3238 if (kstrtoull(buf, 10, &offset) < 0) 3239 return -EINVAL; 3240 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3241 return -EBUSY; 3242 if (rdev->sectors && rdev->mddev->external) 3243 /* Must set offset before size, so overlap checks 3244 * can be sane */ 3245 return -EBUSY; 3246 rdev->data_offset = offset; 3247 rdev->new_data_offset = offset; 3248 return len; 3249 } 3250 3251 static struct rdev_sysfs_entry rdev_offset = 3252 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3253 3254 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3255 { 3256 return sprintf(page, "%llu\n", 3257 (unsigned long long)rdev->new_data_offset); 3258 } 3259 3260 static ssize_t new_offset_store(struct md_rdev *rdev, 3261 const char *buf, size_t len) 3262 { 3263 unsigned long long new_offset; 3264 struct mddev *mddev = rdev->mddev; 3265 3266 if (kstrtoull(buf, 10, &new_offset) < 0) 3267 return -EINVAL; 3268 3269 if (mddev->sync_thread || 3270 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3271 return -EBUSY; 3272 if (new_offset == rdev->data_offset) 3273 /* reset is always permitted */ 3274 ; 3275 else if (new_offset > rdev->data_offset) { 3276 /* must not push array size beyond rdev_sectors */ 3277 if (new_offset - rdev->data_offset 3278 + mddev->dev_sectors > rdev->sectors) 3279 return -E2BIG; 3280 } 3281 /* Metadata worries about other space details. */ 3282 3283 /* decreasing the offset is inconsistent with a backwards 3284 * reshape. 3285 */ 3286 if (new_offset < rdev->data_offset && 3287 mddev->reshape_backwards) 3288 return -EINVAL; 3289 /* Increasing offset is inconsistent with forwards 3290 * reshape. reshape_direction should be set to 3291 * 'backwards' first. 
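 * (i.e. the array's reshape_direction needs to be switched to
 * 'backwards' before a member's data offset may be increased)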
3292 */ 3293 if (new_offset > rdev->data_offset && 3294 !mddev->reshape_backwards) 3295 return -EINVAL; 3296 3297 if (mddev->pers && mddev->persistent && 3298 !super_types[mddev->major_version] 3299 .allow_new_offset(rdev, new_offset)) 3300 return -E2BIG; 3301 rdev->new_data_offset = new_offset; 3302 if (new_offset > rdev->data_offset) 3303 mddev->reshape_backwards = 1; 3304 else if (new_offset < rdev->data_offset) 3305 mddev->reshape_backwards = 0; 3306 3307 return len; 3308 } 3309 static struct rdev_sysfs_entry rdev_new_offset = 3310 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3311 3312 static ssize_t 3313 rdev_size_show(struct md_rdev *rdev, char *page) 3314 { 3315 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3316 } 3317 3318 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3319 { 3320 /* check if two start/length pairs overlap */ 3321 if (a->data_offset + a->sectors <= b->data_offset) 3322 return false; 3323 if (b->data_offset + b->sectors <= a->data_offset) 3324 return false; 3325 return true; 3326 } 3327 3328 static bool md_rdev_overlaps(struct md_rdev *rdev) 3329 { 3330 struct mddev *mddev; 3331 struct md_rdev *rdev2; 3332 3333 spin_lock(&all_mddevs_lock); 3334 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3335 if (test_bit(MD_DELETED, &mddev->flags)) 3336 continue; 3337 rdev_for_each(rdev2, mddev) { 3338 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3339 md_rdevs_overlap(rdev, rdev2)) { 3340 spin_unlock(&all_mddevs_lock); 3341 return true; 3342 } 3343 } 3344 } 3345 spin_unlock(&all_mddevs_lock); 3346 return false; 3347 } 3348 3349 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3350 { 3351 unsigned long long blocks; 3352 sector_t new; 3353 3354 if (kstrtoull(buf, 10, &blocks) < 0) 3355 return -EINVAL; 3356 3357 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3358 return -EINVAL; /* sector conversion overflow */ 3359 3360 new = blocks * 2; 3361 if (new != blocks * 2) 3362 return -EINVAL; /* unsigned long long to sector_t overflow */ 3363 3364 *sectors = new; 3365 return 0; 3366 } 3367 3368 static ssize_t 3369 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3370 { 3371 struct mddev *my_mddev = rdev->mddev; 3372 sector_t oldsectors = rdev->sectors; 3373 sector_t sectors; 3374 3375 if (test_bit(Journal, &rdev->flags)) 3376 return -EBUSY; 3377 if (strict_blocks_to_sectors(buf, §ors) < 0) 3378 return -EINVAL; 3379 if (rdev->data_offset != rdev->new_data_offset) 3380 return -EINVAL; /* too confusing */ 3381 if (my_mddev->pers && rdev->raid_disk >= 0) { 3382 if (my_mddev->persistent) { 3383 sectors = super_types[my_mddev->major_version]. 3384 rdev_size_change(rdev, sectors); 3385 if (!sectors) 3386 return -EBUSY; 3387 } else if (!sectors) 3388 sectors = bdev_nr_sectors(rdev->bdev) - 3389 rdev->data_offset; 3390 if (!my_mddev->pers->resize) 3391 /* Cannot change size for RAID0 or Linear etc */ 3392 return -EINVAL; 3393 } 3394 if (sectors < my_mddev->dev_sectors) 3395 return -EINVAL; /* component must fit device */ 3396 3397 rdev->sectors = sectors; 3398 3399 /* 3400 * Check that all other rdevs with the same bdev do not overlap. This 3401 * check does not provide a hard guarantee, it just helps avoid 3402 * dangerous mistakes. 3403 */ 3404 if (sectors > oldsectors && my_mddev->external && 3405 md_rdev_overlaps(rdev)) { 3406 /* 3407 * Someone else could have slipped in a size change here, but 3408 * doing so is just silly. 
We put oldsectors back because we 3409 * know it is safe, and trust userspace not to race with itself. 3410 */ 3411 rdev->sectors = oldsectors; 3412 return -EBUSY; 3413 } 3414 return len; 3415 } 3416 3417 static struct rdev_sysfs_entry rdev_size = 3418 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3419 3420 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3421 { 3422 unsigned long long recovery_start = rdev->recovery_offset; 3423 3424 if (test_bit(In_sync, &rdev->flags) || 3425 recovery_start == MaxSector) 3426 return sprintf(page, "none\n"); 3427 3428 return sprintf(page, "%llu\n", recovery_start); 3429 } 3430 3431 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3432 { 3433 unsigned long long recovery_start; 3434 3435 if (cmd_match(buf, "none")) 3436 recovery_start = MaxSector; 3437 else if (kstrtoull(buf, 10, &recovery_start)) 3438 return -EINVAL; 3439 3440 if (rdev->mddev->pers && 3441 rdev->raid_disk >= 0) 3442 return -EBUSY; 3443 3444 rdev->recovery_offset = recovery_start; 3445 if (recovery_start == MaxSector) 3446 set_bit(In_sync, &rdev->flags); 3447 else 3448 clear_bit(In_sync, &rdev->flags); 3449 return len; 3450 } 3451 3452 static struct rdev_sysfs_entry rdev_recovery_start = 3453 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3454 3455 /* sysfs access to bad-blocks list. 3456 * We present two files. 3457 * 'bad-blocks' lists sector numbers and lengths of ranges that 3458 * are recorded as bad. The list is truncated to fit within 3459 * the one-page limit of sysfs. 3460 * Writing "sector length" to this file adds an acknowledged 3461 * bad block list. 3462 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3463 * been acknowledged. Writing to this file adds bad blocks 3464 * without acknowledging them. This is largely for testing. 
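 * For example, writing "4656 8" to 'bad-blocks' records an acknowledged
 * range of 8 sectors starting at sector 4656; the same write to
 * 'unacknowledged-bad-blocks' records the range without acknowledging it.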
3465 */ 3466 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3467 { 3468 return badblocks_show(&rdev->badblocks, page, 0); 3469 } 3470 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3471 { 3472 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3473 /* Maybe that ack was all we needed */ 3474 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3475 wake_up(&rdev->blocked_wait); 3476 return rv; 3477 } 3478 static struct rdev_sysfs_entry rdev_bad_blocks = 3479 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3480 3481 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3482 { 3483 return badblocks_show(&rdev->badblocks, page, 1); 3484 } 3485 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3486 { 3487 return badblocks_store(&rdev->badblocks, page, len, 1); 3488 } 3489 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3490 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3491 3492 static ssize_t 3493 ppl_sector_show(struct md_rdev *rdev, char *page) 3494 { 3495 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3496 } 3497 3498 static ssize_t 3499 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3500 { 3501 unsigned long long sector; 3502 3503 if (kstrtoull(buf, 10, §or) < 0) 3504 return -EINVAL; 3505 if (sector != (sector_t)sector) 3506 return -EINVAL; 3507 3508 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3509 rdev->raid_disk >= 0) 3510 return -EBUSY; 3511 3512 if (rdev->mddev->persistent) { 3513 if (rdev->mddev->major_version == 0) 3514 return -EINVAL; 3515 if ((sector > rdev->sb_start && 3516 sector - rdev->sb_start > S16_MAX) || 3517 (sector < rdev->sb_start && 3518 rdev->sb_start - sector > -S16_MIN)) 3519 return -EINVAL; 3520 rdev->ppl.offset = sector - rdev->sb_start; 3521 } else if (!rdev->mddev->external) { 3522 return -EBUSY; 3523 } 3524 rdev->ppl.sector = sector; 3525 return len; 3526 } 3527 3528 static struct rdev_sysfs_entry rdev_ppl_sector = 3529 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3530 3531 static ssize_t 3532 ppl_size_show(struct md_rdev *rdev, char *page) 3533 { 3534 return sprintf(page, "%u\n", rdev->ppl.size); 3535 } 3536 3537 static ssize_t 3538 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3539 { 3540 unsigned int size; 3541 3542 if (kstrtouint(buf, 10, &size) < 0) 3543 return -EINVAL; 3544 3545 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3546 rdev->raid_disk >= 0) 3547 return -EBUSY; 3548 3549 if (rdev->mddev->persistent) { 3550 if (rdev->mddev->major_version == 0) 3551 return -EINVAL; 3552 if (size > U16_MAX) 3553 return -EINVAL; 3554 } else if (!rdev->mddev->external) { 3555 return -EBUSY; 3556 } 3557 rdev->ppl.size = size; 3558 return len; 3559 } 3560 3561 static struct rdev_sysfs_entry rdev_ppl_size = 3562 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3563 3564 static struct attribute *rdev_default_attrs[] = { 3565 &rdev_state.attr, 3566 &rdev_errors.attr, 3567 &rdev_slot.attr, 3568 &rdev_offset.attr, 3569 &rdev_new_offset.attr, 3570 &rdev_size.attr, 3571 &rdev_recovery_start.attr, 3572 &rdev_bad_blocks.attr, 3573 &rdev_unack_bad_blocks.attr, 3574 &rdev_ppl_sector.attr, 3575 &rdev_ppl_size.attr, 3576 NULL, 3577 }; 3578 ATTRIBUTE_GROUPS(rdev_default); 3579 static ssize_t 3580 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3581 { 3582 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3583 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3584 3585 if (!entry->show) 3586 return -EIO; 3587 if (!rdev->mddev) 3588 return -ENODEV; 3589 return entry->show(rdev, page); 3590 } 3591 3592 static ssize_t 3593 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3594 const char *page, size_t length) 3595 { 3596 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3597 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3598 struct kernfs_node *kn = NULL; 3599 ssize_t rv; 3600 struct mddev *mddev = rdev->mddev; 3601 3602 if (!entry->store) 3603 return -EIO; 3604 if (!capable(CAP_SYS_ADMIN)) 3605 return -EACCES; 3606 3607 if (entry->store == state_store && cmd_match(page, "remove")) 3608 kn = sysfs_break_active_protection(kobj, attr); 3609 3610 rv = mddev ? mddev_lock(mddev) : -ENODEV; 3611 if (!rv) { 3612 if (rdev->mddev == NULL) 3613 rv = -ENODEV; 3614 else 3615 rv = entry->store(rdev, page, length); 3616 mddev_unlock(mddev); 3617 } 3618 3619 if (kn) 3620 sysfs_unbreak_active_protection(kn); 3621 3622 return rv; 3623 } 3624 3625 static void rdev_free(struct kobject *ko) 3626 { 3627 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3628 kfree(rdev); 3629 } 3630 static const struct sysfs_ops rdev_sysfs_ops = { 3631 .show = rdev_attr_show, 3632 .store = rdev_attr_store, 3633 }; 3634 static const struct kobj_type rdev_ktype = { 3635 .release = rdev_free, 3636 .sysfs_ops = &rdev_sysfs_ops, 3637 .default_groups = rdev_default_groups, 3638 }; 3639 3640 int md_rdev_init(struct md_rdev *rdev) 3641 { 3642 rdev->desc_nr = -1; 3643 rdev->saved_raid_disk = -1; 3644 rdev->raid_disk = -1; 3645 rdev->flags = 0; 3646 rdev->data_offset = 0; 3647 rdev->new_data_offset = 0; 3648 rdev->sb_events = 0; 3649 rdev->last_read_error = 0; 3650 rdev->sb_loaded = 0; 3651 rdev->bb_page = NULL; 3652 atomic_set(&rdev->nr_pending, 0); 3653 atomic_set(&rdev->read_errors, 0); 3654 atomic_set(&rdev->corrected_errors, 0); 3655 3656 INIT_LIST_HEAD(&rdev->same_set); 3657 init_waitqueue_head(&rdev->blocked_wait); 3658 3659 /* Add space to store bad block list. 3660 * This reserves the space even on arrays where it cannot 3661 * be used - I wonder if that matters 3662 */ 3663 return badblocks_init(&rdev->badblocks, 0); 3664 } 3665 EXPORT_SYMBOL_GPL(md_rdev_init); 3666 3667 /* 3668 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3669 * 3670 * mark the device faulty if: 3671 * 3672 * - the device is nonexistent (zero size) 3673 * - the device has no valid superblock 3674 * 3675 * a faulty rdev _never_ has rdev->sb set. 
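 * On success the rdev is returned with its block device claimed and, for
 * super_format >= 0, its superblock loaded; on any failure everything is
 * torn down again and an ERR_PTR() is returned instead.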
3676 */ 3677 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3678 { 3679 struct md_rdev *rdev; 3680 struct md_rdev *holder; 3681 sector_t size; 3682 int err; 3683 3684 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3685 if (!rdev) 3686 return ERR_PTR(-ENOMEM); 3687 3688 err = md_rdev_init(rdev); 3689 if (err) 3690 goto out_free_rdev; 3691 err = alloc_disk_sb(rdev); 3692 if (err) 3693 goto out_clear_rdev; 3694 3695 if (super_format == -2) { 3696 holder = &claim_rdev; 3697 } else { 3698 holder = rdev; 3699 set_bit(Holder, &rdev->flags); 3700 } 3701 3702 rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, 3703 holder, NULL); 3704 if (IS_ERR(rdev->bdev)) { 3705 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3706 MAJOR(newdev), MINOR(newdev)); 3707 err = PTR_ERR(rdev->bdev); 3708 goto out_clear_rdev; 3709 } 3710 3711 kobject_init(&rdev->kobj, &rdev_ktype); 3712 3713 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3714 if (!size) { 3715 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3716 rdev->bdev); 3717 err = -EINVAL; 3718 goto out_blkdev_put; 3719 } 3720 3721 if (super_format >= 0) { 3722 err = super_types[super_format]. 3723 load_super(rdev, NULL, super_minor); 3724 if (err == -EINVAL) { 3725 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3726 rdev->bdev, 3727 super_format, super_minor); 3728 goto out_blkdev_put; 3729 } 3730 if (err < 0) { 3731 pr_warn("md: could not read %pg's sb, not importing!\n", 3732 rdev->bdev); 3733 goto out_blkdev_put; 3734 } 3735 } 3736 3737 return rdev; 3738 3739 out_blkdev_put: 3740 blkdev_put(rdev->bdev, holder); 3741 out_clear_rdev: 3742 md_rdev_clear(rdev); 3743 out_free_rdev: 3744 kfree(rdev); 3745 return ERR_PTR(err); 3746 } 3747 3748 /* 3749 * Check a full RAID array for plausibility 3750 */ 3751 3752 static int analyze_sbs(struct mddev *mddev) 3753 { 3754 int i; 3755 struct md_rdev *rdev, *freshest, *tmp; 3756 3757 freshest = NULL; 3758 rdev_for_each_safe(rdev, tmp, mddev) 3759 switch (super_types[mddev->major_version]. 3760 load_super(rdev, freshest, mddev->minor_version)) { 3761 case 1: 3762 freshest = rdev; 3763 break; 3764 case 0: 3765 break; 3766 default: 3767 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3768 rdev->bdev); 3769 md_kick_rdev_from_array(rdev); 3770 } 3771 3772 /* Cannot find a valid fresh disk */ 3773 if (!freshest) { 3774 pr_warn("md: cannot find a valid disk\n"); 3775 return -EINVAL; 3776 } 3777 3778 super_types[mddev->major_version]. 3779 validate_super(mddev, NULL/*freshest*/, freshest); 3780 3781 i = 0; 3782 rdev_for_each_safe(rdev, tmp, mddev) { 3783 if (mddev->max_disks && 3784 (rdev->desc_nr >= mddev->max_disks || 3785 i > mddev->max_disks)) { 3786 pr_warn("md: %s: %pg: only %d devices permitted\n", 3787 mdname(mddev), rdev->bdev, 3788 mddev->max_disks); 3789 md_kick_rdev_from_array(rdev); 3790 continue; 3791 } 3792 if (rdev != freshest) { 3793 if (super_types[mddev->major_version]. 
3794 validate_super(mddev, freshest, rdev)) { 3795 pr_warn("md: kicking non-fresh %pg from array!\n", 3796 rdev->bdev); 3797 md_kick_rdev_from_array(rdev); 3798 continue; 3799 } 3800 } 3801 if (mddev->level == LEVEL_MULTIPATH) { 3802 rdev->desc_nr = i++; 3803 rdev->raid_disk = rdev->desc_nr; 3804 set_bit(In_sync, &rdev->flags); 3805 } else if (rdev->raid_disk >= 3806 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3807 !test_bit(Journal, &rdev->flags)) { 3808 rdev->raid_disk = -1; 3809 clear_bit(In_sync, &rdev->flags); 3810 } 3811 } 3812 3813 return 0; 3814 } 3815 3816 /* Read a fixed-point number. 3817 * Numbers in sysfs attributes should be in "standard" units where 3818 * possible, so time should be in seconds. 3819 * However we internally use a a much smaller unit such as 3820 * milliseconds or jiffies. 3821 * This function takes a decimal number with a possible fractional 3822 * component, and produces an integer which is the result of 3823 * multiplying that number by 10^'scale'. 3824 * all without any floating-point arithmetic. 3825 */ 3826 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3827 { 3828 unsigned long result = 0; 3829 long decimals = -1; 3830 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3831 if (*cp == '.') 3832 decimals = 0; 3833 else if (decimals < scale) { 3834 unsigned int value; 3835 value = *cp - '0'; 3836 result = result * 10 + value; 3837 if (decimals >= 0) 3838 decimals++; 3839 } 3840 cp++; 3841 } 3842 if (*cp == '\n') 3843 cp++; 3844 if (*cp) 3845 return -EINVAL; 3846 if (decimals < 0) 3847 decimals = 0; 3848 *res = result * int_pow(10, scale - decimals); 3849 return 0; 3850 } 3851 3852 static ssize_t 3853 safe_delay_show(struct mddev *mddev, char *page) 3854 { 3855 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3856 3857 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3858 } 3859 static ssize_t 3860 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3861 { 3862 unsigned long msec; 3863 3864 if (mddev_is_clustered(mddev)) { 3865 pr_warn("md: Safemode is disabled for clustered mode\n"); 3866 return -EINVAL; 3867 } 3868 3869 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3870 return -EINVAL; 3871 if (msec == 0) 3872 mddev->safemode_delay = 0; 3873 else { 3874 unsigned long old_delay = mddev->safemode_delay; 3875 unsigned long new_delay = (msec*HZ)/1000; 3876 3877 if (new_delay == 0) 3878 new_delay = 1; 3879 mddev->safemode_delay = new_delay; 3880 if (new_delay < old_delay || old_delay == 0) 3881 mod_timer(&mddev->safemode_timer, jiffies+1); 3882 } 3883 return len; 3884 } 3885 static struct md_sysfs_entry md_safe_delay = 3886 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3887 3888 static ssize_t 3889 level_show(struct mddev *mddev, char *page) 3890 { 3891 struct md_personality *p; 3892 int ret; 3893 spin_lock(&mddev->lock); 3894 p = mddev->pers; 3895 if (p) 3896 ret = sprintf(page, "%s\n", p->name); 3897 else if (mddev->clevel[0]) 3898 ret = sprintf(page, "%s\n", mddev->clevel); 3899 else if (mddev->level != LEVEL_NONE) 3900 ret = sprintf(page, "%d\n", mddev->level); 3901 else 3902 ret = 0; 3903 spin_unlock(&mddev->lock); 3904 return ret; 3905 } 3906 3907 static ssize_t 3908 level_store(struct mddev *mddev, const char *buf, size_t len) 3909 { 3910 char clevel[16]; 3911 ssize_t rv; 3912 size_t slen = len; 3913 struct md_personality *pers, *oldpers; 3914 long level; 3915 void *priv, *oldpriv; 3916 struct md_rdev *rdev; 3917 3918 
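	/* The requested level name must fit in clevel[16], including the trailing NUL */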
if (slen == 0 || slen >= sizeof(clevel)) 3919 return -EINVAL; 3920 3921 rv = mddev_lock(mddev); 3922 if (rv) 3923 return rv; 3924 3925 if (mddev->pers == NULL) { 3926 strncpy(mddev->clevel, buf, slen); 3927 if (mddev->clevel[slen-1] == '\n') 3928 slen--; 3929 mddev->clevel[slen] = 0; 3930 mddev->level = LEVEL_NONE; 3931 rv = len; 3932 goto out_unlock; 3933 } 3934 rv = -EROFS; 3935 if (!md_is_rdwr(mddev)) 3936 goto out_unlock; 3937 3938 /* request to change the personality. Need to ensure: 3939 * - array is not engaged in resync/recovery/reshape 3940 * - old personality can be suspended 3941 * - new personality will access other array. 3942 */ 3943 3944 rv = -EBUSY; 3945 if (mddev->sync_thread || 3946 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3947 mddev->reshape_position != MaxSector || 3948 mddev->sysfs_active) 3949 goto out_unlock; 3950 3951 rv = -EINVAL; 3952 if (!mddev->pers->quiesce) { 3953 pr_warn("md: %s: %s does not support online personality change\n", 3954 mdname(mddev), mddev->pers->name); 3955 goto out_unlock; 3956 } 3957 3958 /* Now find the new personality */ 3959 strncpy(clevel, buf, slen); 3960 if (clevel[slen-1] == '\n') 3961 slen--; 3962 clevel[slen] = 0; 3963 if (kstrtol(clevel, 10, &level)) 3964 level = LEVEL_NONE; 3965 3966 if (request_module("md-%s", clevel) != 0) 3967 request_module("md-level-%s", clevel); 3968 spin_lock(&pers_lock); 3969 pers = find_pers(level, clevel); 3970 if (!pers || !try_module_get(pers->owner)) { 3971 spin_unlock(&pers_lock); 3972 pr_warn("md: personality %s not loaded\n", clevel); 3973 rv = -EINVAL; 3974 goto out_unlock; 3975 } 3976 spin_unlock(&pers_lock); 3977 3978 if (pers == mddev->pers) { 3979 /* Nothing to do! */ 3980 module_put(pers->owner); 3981 rv = len; 3982 goto out_unlock; 3983 } 3984 if (!pers->takeover) { 3985 module_put(pers->owner); 3986 pr_warn("md: %s: %s does not support personality takeover\n", 3987 mdname(mddev), clevel); 3988 rv = -EINVAL; 3989 goto out_unlock; 3990 } 3991 3992 rdev_for_each(rdev, mddev) 3993 rdev->new_raid_disk = rdev->raid_disk; 3994 3995 /* ->takeover must set new_* and/or delta_disks 3996 * if it succeeds, and may set them when it fails. 3997 */ 3998 priv = pers->takeover(mddev); 3999 if (IS_ERR(priv)) { 4000 mddev->new_level = mddev->level; 4001 mddev->new_layout = mddev->layout; 4002 mddev->new_chunk_sectors = mddev->chunk_sectors; 4003 mddev->raid_disks -= mddev->delta_disks; 4004 mddev->delta_disks = 0; 4005 mddev->reshape_backwards = 0; 4006 module_put(pers->owner); 4007 pr_warn("md: %s: %s would not accept array\n", 4008 mdname(mddev), clevel); 4009 rv = PTR_ERR(priv); 4010 goto out_unlock; 4011 } 4012 4013 /* Looks like we have a winner */ 4014 mddev_suspend(mddev); 4015 mddev_detach(mddev); 4016 4017 spin_lock(&mddev->lock); 4018 oldpers = mddev->pers; 4019 oldpriv = mddev->private; 4020 mddev->pers = pers; 4021 mddev->private = priv; 4022 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4023 mddev->level = mddev->new_level; 4024 mddev->layout = mddev->new_layout; 4025 mddev->chunk_sectors = mddev->new_chunk_sectors; 4026 mddev->delta_disks = 0; 4027 mddev->reshape_backwards = 0; 4028 mddev->degraded = 0; 4029 spin_unlock(&mddev->lock); 4030 4031 if (oldpers->sync_request == NULL && 4032 mddev->external) { 4033 /* We are converting from a no-redundancy array 4034 * to a redundancy array and metadata is managed 4035 * externally so we need to be sure that writes 4036 * won't block due to a need to transition 4037 * clean->dirty 4038 * until external management is started. 
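	 * Disabling safemode below also stops the array from switching
	 * itself back to 'clean' before that happens.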
4039 */ 4040 mddev->in_sync = 0; 4041 mddev->safemode_delay = 0; 4042 mddev->safemode = 0; 4043 } 4044 4045 oldpers->free(mddev, oldpriv); 4046 4047 if (oldpers->sync_request == NULL && 4048 pers->sync_request != NULL) { 4049 /* need to add the md_redundancy_group */ 4050 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4051 pr_warn("md: cannot register extra attributes for %s\n", 4052 mdname(mddev)); 4053 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4054 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4055 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4056 } 4057 if (oldpers->sync_request != NULL && 4058 pers->sync_request == NULL) { 4059 /* need to remove the md_redundancy_group */ 4060 if (mddev->to_remove == NULL) 4061 mddev->to_remove = &md_redundancy_group; 4062 } 4063 4064 module_put(oldpers->owner); 4065 4066 rdev_for_each(rdev, mddev) { 4067 if (rdev->raid_disk < 0) 4068 continue; 4069 if (rdev->new_raid_disk >= mddev->raid_disks) 4070 rdev->new_raid_disk = -1; 4071 if (rdev->new_raid_disk == rdev->raid_disk) 4072 continue; 4073 sysfs_unlink_rdev(mddev, rdev); 4074 } 4075 rdev_for_each(rdev, mddev) { 4076 if (rdev->raid_disk < 0) 4077 continue; 4078 if (rdev->new_raid_disk == rdev->raid_disk) 4079 continue; 4080 rdev->raid_disk = rdev->new_raid_disk; 4081 if (rdev->raid_disk < 0) 4082 clear_bit(In_sync, &rdev->flags); 4083 else { 4084 if (sysfs_link_rdev(mddev, rdev)) 4085 pr_warn("md: cannot register rd%d for %s after level change\n", 4086 rdev->raid_disk, mdname(mddev)); 4087 } 4088 } 4089 4090 if (pers->sync_request == NULL) { 4091 /* this is now an array without redundancy, so 4092 * it must always be in_sync 4093 */ 4094 mddev->in_sync = 1; 4095 del_timer_sync(&mddev->safemode_timer); 4096 } 4097 blk_set_stacking_limits(&mddev->queue->limits); 4098 pers->run(mddev); 4099 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4100 mddev_resume(mddev); 4101 if (!mddev->thread) 4102 md_update_sb(mddev, 1); 4103 sysfs_notify_dirent_safe(mddev->sysfs_level); 4104 md_new_event(); 4105 rv = len; 4106 out_unlock: 4107 mddev_unlock(mddev); 4108 return rv; 4109 } 4110 4111 static struct md_sysfs_entry md_level = 4112 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4113 4114 static ssize_t 4115 layout_show(struct mddev *mddev, char *page) 4116 { 4117 /* just a number, not meaningful for all levels */ 4118 if (mddev->reshape_position != MaxSector && 4119 mddev->layout != mddev->new_layout) 4120 return sprintf(page, "%d (%d)\n", 4121 mddev->new_layout, mddev->layout); 4122 return sprintf(page, "%d\n", mddev->layout); 4123 } 4124 4125 static ssize_t 4126 layout_store(struct mddev *mddev, const char *buf, size_t len) 4127 { 4128 unsigned int n; 4129 int err; 4130 4131 err = kstrtouint(buf, 10, &n); 4132 if (err < 0) 4133 return err; 4134 err = mddev_lock(mddev); 4135 if (err) 4136 return err; 4137 4138 if (mddev->pers) { 4139 if (mddev->pers->check_reshape == NULL) 4140 err = -EBUSY; 4141 else if (!md_is_rdwr(mddev)) 4142 err = -EROFS; 4143 else { 4144 mddev->new_layout = n; 4145 err = mddev->pers->check_reshape(mddev); 4146 if (err) 4147 mddev->new_layout = mddev->layout; 4148 } 4149 } else { 4150 mddev->new_layout = n; 4151 if (mddev->reshape_position == MaxSector) 4152 mddev->layout = n; 4153 } 4154 mddev_unlock(mddev); 4155 return err ?: len; 4156 } 4157 static struct md_sysfs_entry md_layout = 4158 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4159 4160 static ssize_t 4161 
raid_disks_show(struct mddev *mddev, char *page) 4162 { 4163 if (mddev->raid_disks == 0) 4164 return 0; 4165 if (mddev->reshape_position != MaxSector && 4166 mddev->delta_disks != 0) 4167 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4168 mddev->raid_disks - mddev->delta_disks); 4169 return sprintf(page, "%d\n", mddev->raid_disks); 4170 } 4171 4172 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4173 4174 static ssize_t 4175 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4176 { 4177 unsigned int n; 4178 int err; 4179 4180 err = kstrtouint(buf, 10, &n); 4181 if (err < 0) 4182 return err; 4183 4184 err = mddev_lock(mddev); 4185 if (err) 4186 return err; 4187 if (mddev->pers) 4188 err = update_raid_disks(mddev, n); 4189 else if (mddev->reshape_position != MaxSector) { 4190 struct md_rdev *rdev; 4191 int olddisks = mddev->raid_disks - mddev->delta_disks; 4192 4193 err = -EINVAL; 4194 rdev_for_each(rdev, mddev) { 4195 if (olddisks < n && 4196 rdev->data_offset < rdev->new_data_offset) 4197 goto out_unlock; 4198 if (olddisks > n && 4199 rdev->data_offset > rdev->new_data_offset) 4200 goto out_unlock; 4201 } 4202 err = 0; 4203 mddev->delta_disks = n - olddisks; 4204 mddev->raid_disks = n; 4205 mddev->reshape_backwards = (mddev->delta_disks < 0); 4206 } else 4207 mddev->raid_disks = n; 4208 out_unlock: 4209 mddev_unlock(mddev); 4210 return err ? err : len; 4211 } 4212 static struct md_sysfs_entry md_raid_disks = 4213 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4214 4215 static ssize_t 4216 uuid_show(struct mddev *mddev, char *page) 4217 { 4218 return sprintf(page, "%pU\n", mddev->uuid); 4219 } 4220 static struct md_sysfs_entry md_uuid = 4221 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4222 4223 static ssize_t 4224 chunk_size_show(struct mddev *mddev, char *page) 4225 { 4226 if (mddev->reshape_position != MaxSector && 4227 mddev->chunk_sectors != mddev->new_chunk_sectors) 4228 return sprintf(page, "%d (%d)\n", 4229 mddev->new_chunk_sectors << 9, 4230 mddev->chunk_sectors << 9); 4231 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4232 } 4233 4234 static ssize_t 4235 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4236 { 4237 unsigned long n; 4238 int err; 4239 4240 err = kstrtoul(buf, 10, &n); 4241 if (err < 0) 4242 return err; 4243 4244 err = mddev_lock(mddev); 4245 if (err) 4246 return err; 4247 if (mddev->pers) { 4248 if (mddev->pers->check_reshape == NULL) 4249 err = -EBUSY; 4250 else if (!md_is_rdwr(mddev)) 4251 err = -EROFS; 4252 else { 4253 mddev->new_chunk_sectors = n >> 9; 4254 err = mddev->pers->check_reshape(mddev); 4255 if (err) 4256 mddev->new_chunk_sectors = mddev->chunk_sectors; 4257 } 4258 } else { 4259 mddev->new_chunk_sectors = n >> 9; 4260 if (mddev->reshape_position == MaxSector) 4261 mddev->chunk_sectors = n >> 9; 4262 } 4263 mddev_unlock(mddev); 4264 return err ?: len; 4265 } 4266 static struct md_sysfs_entry md_chunk_size = 4267 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4268 4269 static ssize_t 4270 resync_start_show(struct mddev *mddev, char *page) 4271 { 4272 if (mddev->recovery_cp == MaxSector) 4273 return sprintf(page, "none\n"); 4274 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4275 } 4276 4277 static ssize_t 4278 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4279 { 4280 unsigned long long n; 4281 int err; 4282 4283 if (cmd_match(buf, "none")) 4284 n = MaxSector; 4285 else { 4286 err = 
kstrtoull(buf, 10, &n); 4287 if (err < 0) 4288 return err; 4289 if (n != (sector_t)n) 4290 return -EINVAL; 4291 } 4292 4293 err = mddev_lock(mddev); 4294 if (err) 4295 return err; 4296 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4297 err = -EBUSY; 4298 4299 if (!err) { 4300 mddev->recovery_cp = n; 4301 if (mddev->pers) 4302 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4303 } 4304 mddev_unlock(mddev); 4305 return err ?: len; 4306 } 4307 static struct md_sysfs_entry md_resync_start = 4308 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4309 resync_start_show, resync_start_store); 4310 4311 /* 4312 * The array state can be: 4313 * 4314 * clear 4315 * No devices, no size, no level 4316 * Equivalent to STOP_ARRAY ioctl 4317 * inactive 4318 * May have some settings, but array is not active 4319 * all IO results in error 4320 * When written, doesn't tear down array, but just stops it 4321 * suspended (not supported yet) 4322 * All IO requests will block. The array can be reconfigured. 4323 * Writing this, if accepted, will block until array is quiescent 4324 * readonly 4325 * no resync can happen. no superblocks get written. 4326 * write requests fail 4327 * read-auto 4328 * like readonly, but behaves like 'clean' on a write request. 4329 * 4330 * clean - no pending writes, but otherwise active. 4331 * When written to inactive array, starts without resync 4332 * If a write request arrives then 4333 * if metadata is known, mark 'dirty' and switch to 'active'. 4334 * if not known, block and switch to write-pending 4335 * If written to an active array that has pending writes, then fails. 4336 * active 4337 * fully active: IO and resync can be happening. 4338 * When written to inactive array, starts with resync 4339 * 4340 * write-pending 4341 * clean, but writes are blocked waiting for 'active' to be written. 4342 * 4343 * active-idle 4344 * like active, but no writes have been seen for a while (100msec). 4345 * 4346 * broken 4347 * Array is failed. It's useful because mounted-arrays aren't stopped 4348 * when array is failed, so this state will at least alert the user that 4349 * something is wrong. 
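 *
 * write-pending, active-idle and broken are only ever reported here;
 * attempts to write them to this attribute fail with -EINVAL.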
4350 */ 4351 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4352 write_pending, active_idle, broken, bad_word}; 4353 static char *array_states[] = { 4354 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4355 "write-pending", "active-idle", "broken", NULL }; 4356 4357 static int match_word(const char *word, char **list) 4358 { 4359 int n; 4360 for (n=0; list[n]; n++) 4361 if (cmd_match(word, list[n])) 4362 break; 4363 return n; 4364 } 4365 4366 static ssize_t 4367 array_state_show(struct mddev *mddev, char *page) 4368 { 4369 enum array_state st = inactive; 4370 4371 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4372 switch(mddev->ro) { 4373 case MD_RDONLY: 4374 st = readonly; 4375 break; 4376 case MD_AUTO_READ: 4377 st = read_auto; 4378 break; 4379 case MD_RDWR: 4380 spin_lock(&mddev->lock); 4381 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4382 st = write_pending; 4383 else if (mddev->in_sync) 4384 st = clean; 4385 else if (mddev->safemode) 4386 st = active_idle; 4387 else 4388 st = active; 4389 spin_unlock(&mddev->lock); 4390 } 4391 4392 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4393 st = broken; 4394 } else { 4395 if (list_empty(&mddev->disks) && 4396 mddev->raid_disks == 0 && 4397 mddev->dev_sectors == 0) 4398 st = clear; 4399 else 4400 st = inactive; 4401 } 4402 return sprintf(page, "%s\n", array_states[st]); 4403 } 4404 4405 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4406 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4407 static int restart_array(struct mddev *mddev); 4408 4409 static ssize_t 4410 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4411 { 4412 int err = 0; 4413 enum array_state st = match_word(buf, array_states); 4414 4415 if (mddev->pers && (st == active || st == clean) && 4416 mddev->ro != MD_RDONLY) { 4417 /* don't take reconfig_mutex when toggling between 4418 * clean and active 4419 */ 4420 spin_lock(&mddev->lock); 4421 if (st == active) { 4422 restart_array(mddev); 4423 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4424 md_wakeup_thread(mddev->thread); 4425 wake_up(&mddev->sb_wait); 4426 } else /* st == clean */ { 4427 restart_array(mddev); 4428 if (!set_in_sync(mddev)) 4429 err = -EBUSY; 4430 } 4431 if (!err) 4432 sysfs_notify_dirent_safe(mddev->sysfs_state); 4433 spin_unlock(&mddev->lock); 4434 return err ?: len; 4435 } 4436 err = mddev_lock(mddev); 4437 if (err) 4438 return err; 4439 err = -EINVAL; 4440 switch(st) { 4441 case bad_word: 4442 break; 4443 case clear: 4444 /* stopping an active array */ 4445 err = do_md_stop(mddev, 0, NULL); 4446 break; 4447 case inactive: 4448 /* stopping an active array */ 4449 if (mddev->pers) 4450 err = do_md_stop(mddev, 2, NULL); 4451 else 4452 err = 0; /* already inactive */ 4453 break; 4454 case suspended: 4455 break; /* not supported yet */ 4456 case readonly: 4457 if (mddev->pers) 4458 err = md_set_readonly(mddev, NULL); 4459 else { 4460 mddev->ro = MD_RDONLY; 4461 set_disk_ro(mddev->gendisk, 1); 4462 err = do_md_run(mddev); 4463 } 4464 break; 4465 case read_auto: 4466 if (mddev->pers) { 4467 if (md_is_rdwr(mddev)) 4468 err = md_set_readonly(mddev, NULL); 4469 else if (mddev->ro == MD_RDONLY) 4470 err = restart_array(mddev); 4471 if (err == 0) { 4472 mddev->ro = MD_AUTO_READ; 4473 set_disk_ro(mddev->gendisk, 0); 4474 } 4475 } else { 4476 mddev->ro = MD_AUTO_READ; 4477 err = do_md_run(mddev); 4478 } 4479 break; 4480 case clean: 4481 if 
(mddev->pers) { 4482 err = restart_array(mddev); 4483 if (err) 4484 break; 4485 spin_lock(&mddev->lock); 4486 if (!set_in_sync(mddev)) 4487 err = -EBUSY; 4488 spin_unlock(&mddev->lock); 4489 } else 4490 err = -EINVAL; 4491 break; 4492 case active: 4493 if (mddev->pers) { 4494 err = restart_array(mddev); 4495 if (err) 4496 break; 4497 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4498 wake_up(&mddev->sb_wait); 4499 err = 0; 4500 } else { 4501 mddev->ro = MD_RDWR; 4502 set_disk_ro(mddev->gendisk, 0); 4503 err = do_md_run(mddev); 4504 } 4505 break; 4506 case write_pending: 4507 case active_idle: 4508 case broken: 4509 /* these cannot be set */ 4510 break; 4511 } 4512 4513 if (!err) { 4514 if (mddev->hold_active == UNTIL_IOCTL) 4515 mddev->hold_active = 0; 4516 sysfs_notify_dirent_safe(mddev->sysfs_state); 4517 } 4518 mddev_unlock(mddev); 4519 return err ?: len; 4520 } 4521 static struct md_sysfs_entry md_array_state = 4522 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4523 4524 static ssize_t 4525 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4526 return sprintf(page, "%d\n", 4527 atomic_read(&mddev->max_corr_read_errors)); 4528 } 4529 4530 static ssize_t 4531 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4532 { 4533 unsigned int n; 4534 int rv; 4535 4536 rv = kstrtouint(buf, 10, &n); 4537 if (rv < 0) 4538 return rv; 4539 if (n > INT_MAX) 4540 return -EINVAL; 4541 atomic_set(&mddev->max_corr_read_errors, n); 4542 return len; 4543 } 4544 4545 static struct md_sysfs_entry max_corr_read_errors = 4546 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4547 max_corrected_read_errors_store); 4548 4549 static ssize_t 4550 null_show(struct mddev *mddev, char *page) 4551 { 4552 return -EINVAL; 4553 } 4554 4555 static ssize_t 4556 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4557 { 4558 /* buf must be %d:%d\n? giving major and minor numbers */ 4559 /* The new device is added to the array. 4560 * If the array has a persistent superblock, we read the 4561 * superblock to initialise info and check validity. 4562 * Otherwise, only checking done is that in bind_rdev_to_array, 4563 * which mainly checks size. 
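	 * For example, writing "8:16" (echo 8:16 > /sys/block/md0/md/new_dev)
	 * adds the device with major 8, minor 16 to the array.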
4564 */ 4565 char *e; 4566 int major = simple_strtoul(buf, &e, 10); 4567 int minor; 4568 dev_t dev; 4569 struct md_rdev *rdev; 4570 int err; 4571 4572 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4573 return -EINVAL; 4574 minor = simple_strtoul(e+1, &e, 10); 4575 if (*e && *e != '\n') 4576 return -EINVAL; 4577 dev = MKDEV(major, minor); 4578 if (major != MAJOR(dev) || 4579 minor != MINOR(dev)) 4580 return -EOVERFLOW; 4581 4582 err = mddev_lock(mddev); 4583 if (err) 4584 return err; 4585 if (mddev->persistent) { 4586 rdev = md_import_device(dev, mddev->major_version, 4587 mddev->minor_version); 4588 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4589 struct md_rdev *rdev0 4590 = list_entry(mddev->disks.next, 4591 struct md_rdev, same_set); 4592 err = super_types[mddev->major_version] 4593 .load_super(rdev, rdev0, mddev->minor_version); 4594 if (err < 0) 4595 goto out; 4596 } 4597 } else if (mddev->external) 4598 rdev = md_import_device(dev, -2, -1); 4599 else 4600 rdev = md_import_device(dev, -1, -1); 4601 4602 if (IS_ERR(rdev)) { 4603 mddev_unlock(mddev); 4604 return PTR_ERR(rdev); 4605 } 4606 err = bind_rdev_to_array(rdev, mddev); 4607 out: 4608 if (err) 4609 export_rdev(rdev, mddev); 4610 mddev_unlock(mddev); 4611 if (!err) 4612 md_new_event(); 4613 return err ? err : len; 4614 } 4615 4616 static struct md_sysfs_entry md_new_device = 4617 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4618 4619 static ssize_t 4620 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4621 { 4622 char *end; 4623 unsigned long chunk, end_chunk; 4624 int err; 4625 4626 err = mddev_lock(mddev); 4627 if (err) 4628 return err; 4629 if (!mddev->bitmap) 4630 goto out; 4631 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4632 while (*buf) { 4633 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4634 if (buf == end) break; 4635 if (*end == '-') { /* range */ 4636 buf = end + 1; 4637 end_chunk = simple_strtoul(buf, &end, 0); 4638 if (buf == end) break; 4639 } 4640 if (*end && !isspace(*end)) break; 4641 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4642 buf = skip_spaces(end); 4643 } 4644 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4645 out: 4646 mddev_unlock(mddev); 4647 return len; 4648 } 4649 4650 static struct md_sysfs_entry md_bitmap = 4651 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4652 4653 static ssize_t 4654 size_show(struct mddev *mddev, char *page) 4655 { 4656 return sprintf(page, "%llu\n", 4657 (unsigned long long)mddev->dev_sectors / 2); 4658 } 4659 4660 static int update_size(struct mddev *mddev, sector_t num_sectors); 4661 4662 static ssize_t 4663 size_store(struct mddev *mddev, const char *buf, size_t len) 4664 { 4665 /* If array is inactive, we can reduce the component size, but 4666 * not increase it (except from 0). 4667 * If array is active, we can try an on-line resize 4668 */ 4669 sector_t sectors; 4670 int err = strict_blocks_to_sectors(buf, §ors); 4671 4672 if (err < 0) 4673 return err; 4674 err = mddev_lock(mddev); 4675 if (err) 4676 return err; 4677 if (mddev->pers) { 4678 err = update_size(mddev, sectors); 4679 if (err == 0) 4680 md_update_sb(mddev, 1); 4681 } else { 4682 if (mddev->dev_sectors == 0 || 4683 mddev->dev_sectors > sectors) 4684 mddev->dev_sectors = sectors; 4685 else 4686 err = -ENOSPC; 4687 } 4688 mddev_unlock(mddev); 4689 return err ? 
err : len; 4690 } 4691 4692 static struct md_sysfs_entry md_size = 4693 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4694 4695 /* Metadata version. 4696 * This is one of 4697 * 'none' for arrays with no metadata (good luck...) 4698 * 'external' for arrays with externally managed metadata, 4699 * or N.M for internally known formats 4700 */ 4701 static ssize_t 4702 metadata_show(struct mddev *mddev, char *page) 4703 { 4704 if (mddev->persistent) 4705 return sprintf(page, "%d.%d\n", 4706 mddev->major_version, mddev->minor_version); 4707 else if (mddev->external) 4708 return sprintf(page, "external:%s\n", mddev->metadata_type); 4709 else 4710 return sprintf(page, "none\n"); 4711 } 4712 4713 static ssize_t 4714 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4715 { 4716 int major, minor; 4717 char *e; 4718 int err; 4719 /* Changing the details of 'external' metadata is 4720 * always permitted. Otherwise there must be 4721 * no devices attached to the array. 4722 */ 4723 4724 err = mddev_lock(mddev); 4725 if (err) 4726 return err; 4727 err = -EBUSY; 4728 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4729 ; 4730 else if (!list_empty(&mddev->disks)) 4731 goto out_unlock; 4732 4733 err = 0; 4734 if (cmd_match(buf, "none")) { 4735 mddev->persistent = 0; 4736 mddev->external = 0; 4737 mddev->major_version = 0; 4738 mddev->minor_version = 90; 4739 goto out_unlock; 4740 } 4741 if (strncmp(buf, "external:", 9) == 0) { 4742 size_t namelen = len-9; 4743 if (namelen >= sizeof(mddev->metadata_type)) 4744 namelen = sizeof(mddev->metadata_type)-1; 4745 strncpy(mddev->metadata_type, buf+9, namelen); 4746 mddev->metadata_type[namelen] = 0; 4747 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4748 mddev->metadata_type[--namelen] = 0; 4749 mddev->persistent = 0; 4750 mddev->external = 1; 4751 mddev->major_version = 0; 4752 mddev->minor_version = 90; 4753 goto out_unlock; 4754 } 4755 major = simple_strtoul(buf, &e, 10); 4756 err = -EINVAL; 4757 if (e==buf || *e != '.') 4758 goto out_unlock; 4759 buf = e+1; 4760 minor = simple_strtoul(buf, &e, 10); 4761 if (e==buf || (*e && *e != '\n') ) 4762 goto out_unlock; 4763 err = -ENOENT; 4764 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4765 goto out_unlock; 4766 mddev->major_version = major; 4767 mddev->minor_version = minor; 4768 mddev->persistent = 1; 4769 mddev->external = 0; 4770 err = 0; 4771 out_unlock: 4772 mddev_unlock(mddev); 4773 return err ?: len; 4774 } 4775 4776 static struct md_sysfs_entry md_metadata = 4777 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4778 4779 static ssize_t 4780 action_show(struct mddev *mddev, char *page) 4781 { 4782 char *type = "idle"; 4783 unsigned long recovery = mddev->recovery; 4784 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4785 type = "frozen"; 4786 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4787 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4788 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4789 type = "reshape"; 4790 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4791 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4792 type = "resync"; 4793 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4794 type = "check"; 4795 else 4796 type = "repair"; 4797 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4798 type = "recover"; 4799 else if (mddev->reshape_position != MaxSector) 4800 type = "reshape"; 4801 } 4802 return sprintf(page, "%s\n", type); 4803 } 4804 4805 static void 
stop_sync_thread(struct mddev *mddev) 4806 { 4807 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4808 return; 4809 4810 if (mddev_lock(mddev)) 4811 return; 4812 4813 /* 4814 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4815 * held. 4816 */ 4817 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4818 mddev_unlock(mddev); 4819 return; 4820 } 4821 4822 if (work_pending(&mddev->del_work)) 4823 flush_workqueue(md_misc_wq); 4824 4825 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4826 /* 4827 * Thread might be blocked waiting for metadata update which will now 4828 * never happen 4829 */ 4830 md_wakeup_thread_directly(mddev->sync_thread); 4831 4832 mddev_unlock(mddev); 4833 } 4834 4835 static void idle_sync_thread(struct mddev *mddev) 4836 { 4837 int sync_seq = atomic_read(&mddev->sync_seq); 4838 4839 mutex_lock(&mddev->sync_mutex); 4840 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4841 stop_sync_thread(mddev); 4842 4843 wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4844 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4845 4846 mutex_unlock(&mddev->sync_mutex); 4847 } 4848 4849 static void frozen_sync_thread(struct mddev *mddev) 4850 { 4851 mutex_lock(&mddev->sync_mutex); 4852 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4853 stop_sync_thread(mddev); 4854 4855 wait_event(resync_wait, mddev->sync_thread == NULL && 4856 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4857 4858 mutex_unlock(&mddev->sync_mutex); 4859 } 4860 4861 static ssize_t 4862 action_store(struct mddev *mddev, const char *page, size_t len) 4863 { 4864 if (!mddev->pers || !mddev->pers->sync_request) 4865 return -EINVAL; 4866 4867 4868 if (cmd_match(page, "idle")) 4869 idle_sync_thread(mddev); 4870 else if (cmd_match(page, "frozen")) 4871 frozen_sync_thread(mddev); 4872 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4873 return -EBUSY; 4874 else if (cmd_match(page, "resync")) 4875 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4876 else if (cmd_match(page, "recover")) { 4877 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4878 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4879 } else if (cmd_match(page, "reshape")) { 4880 int err; 4881 if (mddev->pers->start_reshape == NULL) 4882 return -EINVAL; 4883 err = mddev_lock(mddev); 4884 if (!err) { 4885 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4886 err = -EBUSY; 4887 } else if (mddev->reshape_position == MaxSector || 4888 mddev->pers->check_reshape == NULL || 4889 mddev->pers->check_reshape(mddev)) { 4890 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4891 err = mddev->pers->start_reshape(mddev); 4892 } else { 4893 /* 4894 * If reshape is still in progress, and 4895 * md_check_recovery() can continue to reshape, 4896 * don't restart reshape because data can be 4897 * corrupted for raid456. 
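				 * Only MD_RECOVERY_FROZEN is cleared here so
				 * that the reshape already in progress can
				 * simply be resumed.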
4898 */ 4899 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4900 } 4901 mddev_unlock(mddev); 4902 } 4903 if (err) 4904 return err; 4905 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4906 } else { 4907 if (cmd_match(page, "check")) 4908 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4909 else if (!cmd_match(page, "repair")) 4910 return -EINVAL; 4911 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4912 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4913 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4914 } 4915 if (mddev->ro == MD_AUTO_READ) { 4916 /* A write to sync_action is enough to justify 4917 * canceling read-auto mode 4918 */ 4919 mddev->ro = MD_RDWR; 4920 md_wakeup_thread(mddev->sync_thread); 4921 } 4922 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4923 md_wakeup_thread(mddev->thread); 4924 sysfs_notify_dirent_safe(mddev->sysfs_action); 4925 return len; 4926 } 4927 4928 static struct md_sysfs_entry md_scan_mode = 4929 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4930 4931 static ssize_t 4932 last_sync_action_show(struct mddev *mddev, char *page) 4933 { 4934 return sprintf(page, "%s\n", mddev->last_sync_action); 4935 } 4936 4937 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4938 4939 static ssize_t 4940 mismatch_cnt_show(struct mddev *mddev, char *page) 4941 { 4942 return sprintf(page, "%llu\n", 4943 (unsigned long long) 4944 atomic64_read(&mddev->resync_mismatches)); 4945 } 4946 4947 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4948 4949 static ssize_t 4950 sync_min_show(struct mddev *mddev, char *page) 4951 { 4952 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4953 mddev->sync_speed_min ? "local": "system"); 4954 } 4955 4956 static ssize_t 4957 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4958 { 4959 unsigned int min; 4960 int rv; 4961 4962 if (strncmp(buf, "system", 6)==0) { 4963 min = 0; 4964 } else { 4965 rv = kstrtouint(buf, 10, &min); 4966 if (rv < 0) 4967 return rv; 4968 if (min == 0) 4969 return -EINVAL; 4970 } 4971 mddev->sync_speed_min = min; 4972 return len; 4973 } 4974 4975 static struct md_sysfs_entry md_sync_min = 4976 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4977 4978 static ssize_t 4979 sync_max_show(struct mddev *mddev, char *page) 4980 { 4981 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4982 mddev->sync_speed_max ? 
"local": "system"); 4983 } 4984 4985 static ssize_t 4986 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4987 { 4988 unsigned int max; 4989 int rv; 4990 4991 if (strncmp(buf, "system", 6)==0) { 4992 max = 0; 4993 } else { 4994 rv = kstrtouint(buf, 10, &max); 4995 if (rv < 0) 4996 return rv; 4997 if (max == 0) 4998 return -EINVAL; 4999 } 5000 mddev->sync_speed_max = max; 5001 return len; 5002 } 5003 5004 static struct md_sysfs_entry md_sync_max = 5005 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5006 5007 static ssize_t 5008 degraded_show(struct mddev *mddev, char *page) 5009 { 5010 return sprintf(page, "%d\n", mddev->degraded); 5011 } 5012 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5013 5014 static ssize_t 5015 sync_force_parallel_show(struct mddev *mddev, char *page) 5016 { 5017 return sprintf(page, "%d\n", mddev->parallel_resync); 5018 } 5019 5020 static ssize_t 5021 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5022 { 5023 long n; 5024 5025 if (kstrtol(buf, 10, &n)) 5026 return -EINVAL; 5027 5028 if (n != 0 && n != 1) 5029 return -EINVAL; 5030 5031 mddev->parallel_resync = n; 5032 5033 if (mddev->sync_thread) 5034 wake_up(&resync_wait); 5035 5036 return len; 5037 } 5038 5039 /* force parallel resync, even with shared block devices */ 5040 static struct md_sysfs_entry md_sync_force_parallel = 5041 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5042 sync_force_parallel_show, sync_force_parallel_store); 5043 5044 static ssize_t 5045 sync_speed_show(struct mddev *mddev, char *page) 5046 { 5047 unsigned long resync, dt, db; 5048 if (mddev->curr_resync == MD_RESYNC_NONE) 5049 return sprintf(page, "none\n"); 5050 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5051 dt = (jiffies - mddev->resync_mark) / HZ; 5052 if (!dt) dt++; 5053 db = resync - mddev->resync_mark_cnt; 5054 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5055 } 5056 5057 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5058 5059 static ssize_t 5060 sync_completed_show(struct mddev *mddev, char *page) 5061 { 5062 unsigned long long max_sectors, resync; 5063 5064 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5065 return sprintf(page, "none\n"); 5066 5067 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5068 mddev->curr_resync == MD_RESYNC_DELAYED) 5069 return sprintf(page, "delayed\n"); 5070 5071 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5072 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5073 max_sectors = mddev->resync_max_sectors; 5074 else 5075 max_sectors = mddev->dev_sectors; 5076 5077 resync = mddev->curr_resync_completed; 5078 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5079 } 5080 5081 static struct md_sysfs_entry md_sync_completed = 5082 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5083 5084 static ssize_t 5085 min_sync_show(struct mddev *mddev, char *page) 5086 { 5087 return sprintf(page, "%llu\n", 5088 (unsigned long long)mddev->resync_min); 5089 } 5090 static ssize_t 5091 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5092 { 5093 unsigned long long min; 5094 int err; 5095 5096 if (kstrtoull(buf, 10, &min)) 5097 return -EINVAL; 5098 5099 spin_lock(&mddev->lock); 5100 err = -EINVAL; 5101 if (min > mddev->resync_max) 5102 goto out_unlock; 5103 5104 err = -EBUSY; 5105 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5106 goto out_unlock; 5107 5108 /* Round down to multiple of 4K for safety */ 5109 
mddev->resync_min = round_down(min, 8); 5110 err = 0; 5111 5112 out_unlock: 5113 spin_unlock(&mddev->lock); 5114 return err ?: len; 5115 } 5116 5117 static struct md_sysfs_entry md_min_sync = 5118 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5119 5120 static ssize_t 5121 max_sync_show(struct mddev *mddev, char *page) 5122 { 5123 if (mddev->resync_max == MaxSector) 5124 return sprintf(page, "max\n"); 5125 else 5126 return sprintf(page, "%llu\n", 5127 (unsigned long long)mddev->resync_max); 5128 } 5129 static ssize_t 5130 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5131 { 5132 int err; 5133 spin_lock(&mddev->lock); 5134 if (strncmp(buf, "max", 3) == 0) 5135 mddev->resync_max = MaxSector; 5136 else { 5137 unsigned long long max; 5138 int chunk; 5139 5140 err = -EINVAL; 5141 if (kstrtoull(buf, 10, &max)) 5142 goto out_unlock; 5143 if (max < mddev->resync_min) 5144 goto out_unlock; 5145 5146 err = -EBUSY; 5147 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5148 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5149 goto out_unlock; 5150 5151 /* Must be a multiple of chunk_size */ 5152 chunk = mddev->chunk_sectors; 5153 if (chunk) { 5154 sector_t temp = max; 5155 5156 err = -EINVAL; 5157 if (sector_div(temp, chunk)) 5158 goto out_unlock; 5159 } 5160 mddev->resync_max = max; 5161 } 5162 wake_up(&mddev->recovery_wait); 5163 err = 0; 5164 out_unlock: 5165 spin_unlock(&mddev->lock); 5166 return err ?: len; 5167 } 5168 5169 static struct md_sysfs_entry md_max_sync = 5170 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5171 5172 static ssize_t 5173 suspend_lo_show(struct mddev *mddev, char *page) 5174 { 5175 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 5176 } 5177 5178 static ssize_t 5179 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5180 { 5181 unsigned long long new; 5182 int err; 5183 5184 err = kstrtoull(buf, 10, &new); 5185 if (err < 0) 5186 return err; 5187 if (new != (sector_t)new) 5188 return -EINVAL; 5189 5190 err = mddev_lock(mddev); 5191 if (err) 5192 return err; 5193 err = -EINVAL; 5194 if (mddev->pers == NULL || 5195 mddev->pers->quiesce == NULL) 5196 goto unlock; 5197 mddev_suspend(mddev); 5198 mddev->suspend_lo = new; 5199 mddev_resume(mddev); 5200 5201 err = 0; 5202 unlock: 5203 mddev_unlock(mddev); 5204 return err ?: len; 5205 } 5206 static struct md_sysfs_entry md_suspend_lo = 5207 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5208 5209 static ssize_t 5210 suspend_hi_show(struct mddev *mddev, char *page) 5211 { 5212 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 5213 } 5214 5215 static ssize_t 5216 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5217 { 5218 unsigned long long new; 5219 int err; 5220 5221 err = kstrtoull(buf, 10, &new); 5222 if (err < 0) 5223 return err; 5224 if (new != (sector_t)new) 5225 return -EINVAL; 5226 5227 err = mddev_lock(mddev); 5228 if (err) 5229 return err; 5230 err = -EINVAL; 5231 if (mddev->pers == NULL) 5232 goto unlock; 5233 5234 mddev_suspend(mddev); 5235 mddev->suspend_hi = new; 5236 mddev_resume(mddev); 5237 5238 err = 0; 5239 unlock: 5240 mddev_unlock(mddev); 5241 return err ?: len; 5242 } 5243 static struct md_sysfs_entry md_suspend_hi = 5244 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5245 5246 static ssize_t 5247 reshape_position_show(struct mddev *mddev, char *page) 5248 { 5249 if (mddev->reshape_position != MaxSector) 5250 return 
sprintf(page, "%llu\n", 5251 (unsigned long long)mddev->reshape_position); 5252 strcpy(page, "none\n"); 5253 return 5; 5254 } 5255 5256 static ssize_t 5257 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5258 { 5259 struct md_rdev *rdev; 5260 unsigned long long new; 5261 int err; 5262 5263 err = kstrtoull(buf, 10, &new); 5264 if (err < 0) 5265 return err; 5266 if (new != (sector_t)new) 5267 return -EINVAL; 5268 err = mddev_lock(mddev); 5269 if (err) 5270 return err; 5271 err = -EBUSY; 5272 if (mddev->pers) 5273 goto unlock; 5274 mddev->reshape_position = new; 5275 mddev->delta_disks = 0; 5276 mddev->reshape_backwards = 0; 5277 mddev->new_level = mddev->level; 5278 mddev->new_layout = mddev->layout; 5279 mddev->new_chunk_sectors = mddev->chunk_sectors; 5280 rdev_for_each(rdev, mddev) 5281 rdev->new_data_offset = rdev->data_offset; 5282 err = 0; 5283 unlock: 5284 mddev_unlock(mddev); 5285 return err ?: len; 5286 } 5287 5288 static struct md_sysfs_entry md_reshape_position = 5289 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5290 reshape_position_store); 5291 5292 static ssize_t 5293 reshape_direction_show(struct mddev *mddev, char *page) 5294 { 5295 return sprintf(page, "%s\n", 5296 mddev->reshape_backwards ? "backwards" : "forwards"); 5297 } 5298 5299 static ssize_t 5300 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5301 { 5302 int backwards = 0; 5303 int err; 5304 5305 if (cmd_match(buf, "forwards")) 5306 backwards = 0; 5307 else if (cmd_match(buf, "backwards")) 5308 backwards = 1; 5309 else 5310 return -EINVAL; 5311 if (mddev->reshape_backwards == backwards) 5312 return len; 5313 5314 err = mddev_lock(mddev); 5315 if (err) 5316 return err; 5317 /* check if we are allowed to change */ 5318 if (mddev->delta_disks) 5319 err = -EBUSY; 5320 else if (mddev->persistent && 5321 mddev->major_version == 0) 5322 err = -EINVAL; 5323 else 5324 mddev->reshape_backwards = backwards; 5325 mddev_unlock(mddev); 5326 return err ?: len; 5327 } 5328 5329 static struct md_sysfs_entry md_reshape_direction = 5330 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5331 reshape_direction_store); 5332 5333 static ssize_t 5334 array_size_show(struct mddev *mddev, char *page) 5335 { 5336 if (mddev->external_size) 5337 return sprintf(page, "%llu\n", 5338 (unsigned long long)mddev->array_sectors/2); 5339 else 5340 return sprintf(page, "default\n"); 5341 } 5342 5343 static ssize_t 5344 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5345 { 5346 sector_t sectors; 5347 int err; 5348 5349 err = mddev_lock(mddev); 5350 if (err) 5351 return err; 5352 5353 /* cluster raid doesn't support change array_sectors */ 5354 if (mddev_is_clustered(mddev)) { 5355 mddev_unlock(mddev); 5356 return -EINVAL; 5357 } 5358 5359 if (strncmp(buf, "default", 7) == 0) { 5360 if (mddev->pers) 5361 sectors = mddev->pers->size(mddev, 0, 0); 5362 else 5363 sectors = mddev->array_sectors; 5364 5365 mddev->external_size = 0; 5366 } else { 5367 if (strict_blocks_to_sectors(buf, §ors) < 0) 5368 err = -EINVAL; 5369 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5370 err = -E2BIG; 5371 else 5372 mddev->external_size = 1; 5373 } 5374 5375 if (!err) { 5376 mddev->array_sectors = sectors; 5377 if (mddev->pers) 5378 set_capacity_and_notify(mddev->gendisk, 5379 mddev->array_sectors); 5380 } 5381 mddev_unlock(mddev); 5382 return err ?: len; 5383 } 5384 5385 static struct md_sysfs_entry md_array_size = 5386 __ATTR(array_size, 
S_IRUGO|S_IWUSR, array_size_show, 5387 array_size_store); 5388 5389 static ssize_t 5390 consistency_policy_show(struct mddev *mddev, char *page) 5391 { 5392 int ret; 5393 5394 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5395 ret = sprintf(page, "journal\n"); 5396 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5397 ret = sprintf(page, "ppl\n"); 5398 } else if (mddev->bitmap) { 5399 ret = sprintf(page, "bitmap\n"); 5400 } else if (mddev->pers) { 5401 if (mddev->pers->sync_request) 5402 ret = sprintf(page, "resync\n"); 5403 else 5404 ret = sprintf(page, "none\n"); 5405 } else { 5406 ret = sprintf(page, "unknown\n"); 5407 } 5408 5409 return ret; 5410 } 5411 5412 static ssize_t 5413 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5414 { 5415 int err = 0; 5416 5417 if (mddev->pers) { 5418 if (mddev->pers->change_consistency_policy) 5419 err = mddev->pers->change_consistency_policy(mddev, buf); 5420 else 5421 err = -EBUSY; 5422 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5423 set_bit(MD_HAS_PPL, &mddev->flags); 5424 } else { 5425 err = -EINVAL; 5426 } 5427 5428 return err ? err : len; 5429 } 5430 5431 static struct md_sysfs_entry md_consistency_policy = 5432 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5433 consistency_policy_store); 5434 5435 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5436 { 5437 return sprintf(page, "%d\n", mddev->fail_last_dev); 5438 } 5439 5440 /* 5441 * Setting fail_last_dev to true to allow last device to be forcibly removed 5442 * from RAID1/RAID10. 5443 */ 5444 static ssize_t 5445 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5446 { 5447 int ret; 5448 bool value; 5449 5450 ret = kstrtobool(buf, &value); 5451 if (ret) 5452 return ret; 5453 5454 if (value != mddev->fail_last_dev) 5455 mddev->fail_last_dev = value; 5456 5457 return len; 5458 } 5459 static struct md_sysfs_entry md_fail_last_dev = 5460 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5461 fail_last_dev_store); 5462 5463 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5464 { 5465 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5466 return sprintf(page, "n/a\n"); 5467 else 5468 return sprintf(page, "%d\n", mddev->serialize_policy); 5469 } 5470 5471 /* 5472 * Setting serialize_policy to true to enforce write IO is not reordered 5473 * for raid1. 
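 * When enabled, overlapping writes to a member device are serialized
 * through the per-rdev serial info pool set up by
 * mddev_create_serial_pool() below.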
5474 */ 5475 static ssize_t 5476 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5477 { 5478 int err; 5479 bool value; 5480 5481 err = kstrtobool(buf, &value); 5482 if (err) 5483 return err; 5484 5485 if (value == mddev->serialize_policy) 5486 return len; 5487 5488 err = mddev_lock(mddev); 5489 if (err) 5490 return err; 5491 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5492 pr_err("md: serialize_policy is only effective for raid1\n"); 5493 err = -EINVAL; 5494 goto unlock; 5495 } 5496 5497 mddev_suspend(mddev); 5498 if (value) 5499 mddev_create_serial_pool(mddev, NULL, true); 5500 else 5501 mddev_destroy_serial_pool(mddev, NULL, true); 5502 mddev->serialize_policy = value; 5503 mddev_resume(mddev); 5504 unlock: 5505 mddev_unlock(mddev); 5506 return err ?: len; 5507 } 5508 5509 static struct md_sysfs_entry md_serialize_policy = 5510 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5511 serialize_policy_store); 5512 5513 5514 static struct attribute *md_default_attrs[] = { 5515 &md_level.attr, 5516 &md_layout.attr, 5517 &md_raid_disks.attr, 5518 &md_uuid.attr, 5519 &md_chunk_size.attr, 5520 &md_size.attr, 5521 &md_resync_start.attr, 5522 &md_metadata.attr, 5523 &md_new_device.attr, 5524 &md_safe_delay.attr, 5525 &md_array_state.attr, 5526 &md_reshape_position.attr, 5527 &md_reshape_direction.attr, 5528 &md_array_size.attr, 5529 &max_corr_read_errors.attr, 5530 &md_consistency_policy.attr, 5531 &md_fail_last_dev.attr, 5532 &md_serialize_policy.attr, 5533 NULL, 5534 }; 5535 5536 static const struct attribute_group md_default_group = { 5537 .attrs = md_default_attrs, 5538 }; 5539 5540 static struct attribute *md_redundancy_attrs[] = { 5541 &md_scan_mode.attr, 5542 &md_last_scan_mode.attr, 5543 &md_mismatches.attr, 5544 &md_sync_min.attr, 5545 &md_sync_max.attr, 5546 &md_sync_speed.attr, 5547 &md_sync_force_parallel.attr, 5548 &md_sync_completed.attr, 5549 &md_min_sync.attr, 5550 &md_max_sync.attr, 5551 &md_suspend_lo.attr, 5552 &md_suspend_hi.attr, 5553 &md_bitmap.attr, 5554 &md_degraded.attr, 5555 NULL, 5556 }; 5557 static const struct attribute_group md_redundancy_group = { 5558 .name = NULL, 5559 .attrs = md_redundancy_attrs, 5560 }; 5561 5562 static const struct attribute_group *md_attr_groups[] = { 5563 &md_default_group, 5564 &md_bitmap_group, 5565 NULL, 5566 }; 5567 5568 static ssize_t 5569 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5570 { 5571 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5572 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5573 ssize_t rv; 5574 5575 if (!entry->show) 5576 return -EIO; 5577 spin_lock(&all_mddevs_lock); 5578 if (!mddev_get(mddev)) { 5579 spin_unlock(&all_mddevs_lock); 5580 return -EBUSY; 5581 } 5582 spin_unlock(&all_mddevs_lock); 5583 5584 rv = entry->show(mddev, page); 5585 mddev_put(mddev); 5586 return rv; 5587 } 5588 5589 static ssize_t 5590 md_attr_store(struct kobject *kobj, struct attribute *attr, 5591 const char *page, size_t length) 5592 { 5593 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5594 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5595 ssize_t rv; 5596 5597 if (!entry->store) 5598 return -EIO; 5599 if (!capable(CAP_SYS_ADMIN)) 5600 return -EACCES; 5601 spin_lock(&all_mddevs_lock); 5602 if (!mddev_get(mddev)) { 5603 spin_unlock(&all_mddevs_lock); 5604 return -EBUSY; 5605 } 5606 spin_unlock(&all_mddevs_lock); 5607 rv = entry->store(mddev, page, length); 5608 
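	/* drop the reference taken under all_mddevs_lock above */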
mddev_put(mddev); 5609 return rv; 5610 } 5611 5612 static void md_kobj_release(struct kobject *ko) 5613 { 5614 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5615 5616 if (mddev->sysfs_state) 5617 sysfs_put(mddev->sysfs_state); 5618 if (mddev->sysfs_level) 5619 sysfs_put(mddev->sysfs_level); 5620 5621 del_gendisk(mddev->gendisk); 5622 put_disk(mddev->gendisk); 5623 } 5624 5625 static const struct sysfs_ops md_sysfs_ops = { 5626 .show = md_attr_show, 5627 .store = md_attr_store, 5628 }; 5629 static const struct kobj_type md_ktype = { 5630 .release = md_kobj_release, 5631 .sysfs_ops = &md_sysfs_ops, 5632 .default_groups = md_attr_groups, 5633 }; 5634 5635 int mdp_major = 0; 5636 5637 static void mddev_delayed_delete(struct work_struct *ws) 5638 { 5639 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5640 5641 kobject_put(&mddev->kobj); 5642 } 5643 5644 static void no_op(struct percpu_ref *r) {} 5645 5646 int mddev_init_writes_pending(struct mddev *mddev) 5647 { 5648 if (mddev->writes_pending.percpu_count_ptr) 5649 return 0; 5650 if (percpu_ref_init(&mddev->writes_pending, no_op, 5651 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) 5652 return -ENOMEM; 5653 /* We want to start with the refcount at zero */ 5654 percpu_ref_put(&mddev->writes_pending); 5655 return 0; 5656 } 5657 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5658 5659 struct mddev *md_alloc(dev_t dev, char *name) 5660 { 5661 /* 5662 * If dev is zero, name is the name of a device to allocate with 5663 * an arbitrary minor number. It will be "md_???" 5664 * If dev is non-zero it must be a device number with a MAJOR of 5665 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5666 * the device is being created by opening a node in /dev. 5667 * If "name" is not NULL, the device is being created by 5668 * writing to /sys/module/md_mod/parameters/new_array. 5669 */ 5670 static DEFINE_MUTEX(disks_mutex); 5671 struct mddev *mddev; 5672 struct gendisk *disk; 5673 int partitioned; 5674 int shift; 5675 int unit; 5676 int error ; 5677 5678 /* 5679 * Wait for any previous instance of this device to be completely 5680 * removed (mddev_delayed_delete). 5681 */ 5682 flush_workqueue(md_misc_wq); 5683 5684 mutex_lock(&disks_mutex); 5685 mddev = mddev_alloc(dev); 5686 if (IS_ERR(mddev)) { 5687 error = PTR_ERR(mddev); 5688 goto out_unlock; 5689 } 5690 5691 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5692 shift = partitioned ? MdpMinorShift : 0; 5693 unit = MINOR(mddev->unit) >> shift; 5694 5695 if (name && !dev) { 5696 /* Need to ensure that 'name' is not a duplicate. 5697 */ 5698 struct mddev *mddev2; 5699 spin_lock(&all_mddevs_lock); 5700 5701 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5702 if (mddev2->gendisk && 5703 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5704 spin_unlock(&all_mddevs_lock); 5705 error = -EEXIST; 5706 goto out_free_mddev; 5707 } 5708 spin_unlock(&all_mddevs_lock); 5709 } 5710 if (name && dev) 5711 /* 5712 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 
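		 * UNTIL_STOP keeps the mddev allocated until it is explicitly
		 * stopped instead of being released on last close.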
5713 */ 5714 mddev->hold_active = UNTIL_STOP; 5715 5716 error = -ENOMEM; 5717 disk = blk_alloc_disk(NUMA_NO_NODE); 5718 if (!disk) 5719 goto out_free_mddev; 5720 5721 disk->major = MAJOR(mddev->unit); 5722 disk->first_minor = unit << shift; 5723 disk->minors = 1 << shift; 5724 if (name) 5725 strcpy(disk->disk_name, name); 5726 else if (partitioned) 5727 sprintf(disk->disk_name, "md_d%d", unit); 5728 else 5729 sprintf(disk->disk_name, "md%d", unit); 5730 disk->fops = &md_fops; 5731 disk->private_data = mddev; 5732 5733 mddev->queue = disk->queue; 5734 blk_set_stacking_limits(&mddev->queue->limits); 5735 blk_queue_write_cache(mddev->queue, true, true); 5736 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5737 mddev->gendisk = disk; 5738 error = add_disk(disk); 5739 if (error) 5740 goto out_put_disk; 5741 5742 kobject_init(&mddev->kobj, &md_ktype); 5743 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5744 if (error) { 5745 /* 5746 * The disk is already live at this point. Clear the hold flag 5747 * and let mddev_put take care of the deletion, as it isn't any 5748 * different from a normal close on last release now. 5749 */ 5750 mddev->hold_active = 0; 5751 mutex_unlock(&disks_mutex); 5752 mddev_put(mddev); 5753 return ERR_PTR(error); 5754 } 5755 5756 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5757 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5758 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5759 mutex_unlock(&disks_mutex); 5760 return mddev; 5761 5762 out_put_disk: 5763 put_disk(disk); 5764 out_free_mddev: 5765 mddev_free(mddev); 5766 out_unlock: 5767 mutex_unlock(&disks_mutex); 5768 return ERR_PTR(error); 5769 } 5770 5771 static int md_alloc_and_put(dev_t dev, char *name) 5772 { 5773 struct mddev *mddev = md_alloc(dev, name); 5774 5775 if (IS_ERR(mddev)) 5776 return PTR_ERR(mddev); 5777 mddev_put(mddev); 5778 return 0; 5779 } 5780 5781 static void md_probe(dev_t dev) 5782 { 5783 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5784 return; 5785 if (create_on_open) 5786 md_alloc_and_put(dev, NULL); 5787 } 5788 5789 static int add_named_array(const char *val, const struct kernel_param *kp) 5790 { 5791 /* 5792 * val must be "md_*" or "mdNNN". 5793 * For "md_*" we allocate an array with a large free minor number, and 5794 * set the name to val. val must not already be an active name. 5795 * For "mdNNN" we allocate an array with the minor number NNN 5796 * which must not already be in use. 
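 * For example: echo md_home > /sys/module/md_mod/parameters/new_array
 * creates an array named "md_home", while echo md12 creates md12 with
 * minor number 12.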
5797 */ 5798 int len = strlen(val); 5799 char buf[DISK_NAME_LEN]; 5800 unsigned long devnum; 5801 5802 while (len && val[len-1] == '\n') 5803 len--; 5804 if (len >= DISK_NAME_LEN) 5805 return -E2BIG; 5806 strscpy(buf, val, len+1); 5807 if (strncmp(buf, "md_", 3) == 0) 5808 return md_alloc_and_put(0, buf); 5809 if (strncmp(buf, "md", 2) == 0 && 5810 isdigit(buf[2]) && 5811 kstrtoul(buf+2, 10, &devnum) == 0 && 5812 devnum <= MINORMASK) 5813 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5814 5815 return -EINVAL; 5816 } 5817 5818 static void md_safemode_timeout(struct timer_list *t) 5819 { 5820 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5821 5822 mddev->safemode = 1; 5823 if (mddev->external) 5824 sysfs_notify_dirent_safe(mddev->sysfs_state); 5825 5826 md_wakeup_thread(mddev->thread); 5827 } 5828 5829 static int start_dirty_degraded; 5830 static void active_io_release(struct percpu_ref *ref) 5831 { 5832 struct mddev *mddev = container_of(ref, struct mddev, active_io); 5833 5834 wake_up(&mddev->sb_wait); 5835 } 5836 5837 int md_run(struct mddev *mddev) 5838 { 5839 int err; 5840 struct md_rdev *rdev; 5841 struct md_personality *pers; 5842 bool nowait = true; 5843 5844 if (list_empty(&mddev->disks)) 5845 /* cannot run an array with no devices.. */ 5846 return -EINVAL; 5847 5848 if (mddev->pers) 5849 return -EBUSY; 5850 /* Cannot run until previous stop completes properly */ 5851 if (mddev->sysfs_active) 5852 return -EBUSY; 5853 5854 /* 5855 * Analyze all RAID superblock(s) 5856 */ 5857 if (!mddev->raid_disks) { 5858 if (!mddev->persistent) 5859 return -EINVAL; 5860 err = analyze_sbs(mddev); 5861 if (err) 5862 return -EINVAL; 5863 } 5864 5865 if (mddev->level != LEVEL_NONE) 5866 request_module("md-level-%d", mddev->level); 5867 else if (mddev->clevel[0]) 5868 request_module("md-%s", mddev->clevel); 5869 5870 /* 5871 * Drop all container device buffers, from now on 5872 * the only valid external interface is through the md 5873 * device. 5874 */ 5875 mddev->has_superblocks = false; 5876 rdev_for_each(rdev, mddev) { 5877 if (test_bit(Faulty, &rdev->flags)) 5878 continue; 5879 sync_blockdev(rdev->bdev); 5880 invalidate_bdev(rdev->bdev); 5881 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5882 mddev->ro = MD_RDONLY; 5883 if (mddev->gendisk) 5884 set_disk_ro(mddev->gendisk, 1); 5885 } 5886 5887 if (rdev->sb_page) 5888 mddev->has_superblocks = true; 5889 5890 /* perform some consistency tests on the device. 5891 * We don't want the data to overlap the metadata, 5892 * Internal Bitmap issues have been handled elsewhere. 
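		 * The checks below reject layouts where data placed before
		 * the superblock would run into sb_start, or a superblock
		 * placed before the data would run into data_offset.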
5893 */ 5894 if (rdev->meta_bdev) { 5895 /* Nothing to check */; 5896 } else if (rdev->data_offset < rdev->sb_start) { 5897 if (mddev->dev_sectors && 5898 rdev->data_offset + mddev->dev_sectors 5899 > rdev->sb_start) { 5900 pr_warn("md: %s: data overlaps metadata\n", 5901 mdname(mddev)); 5902 return -EINVAL; 5903 } 5904 } else { 5905 if (rdev->sb_start + rdev->sb_size/512 5906 > rdev->data_offset) { 5907 pr_warn("md: %s: metadata overlaps data\n", 5908 mdname(mddev)); 5909 return -EINVAL; 5910 } 5911 } 5912 sysfs_notify_dirent_safe(rdev->sysfs_state); 5913 nowait = nowait && bdev_nowait(rdev->bdev); 5914 } 5915 5916 err = percpu_ref_init(&mddev->active_io, active_io_release, 5917 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); 5918 if (err) 5919 return err; 5920 5921 if (!bioset_initialized(&mddev->bio_set)) { 5922 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5923 if (err) 5924 goto exit_active_io; 5925 } 5926 if (!bioset_initialized(&mddev->sync_set)) { 5927 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5928 if (err) 5929 goto exit_bio_set; 5930 } 5931 5932 if (!bioset_initialized(&mddev->io_clone_set)) { 5933 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5934 offsetof(struct md_io_clone, bio_clone), 0); 5935 if (err) 5936 goto exit_sync_set; 5937 } 5938 5939 spin_lock(&pers_lock); 5940 pers = find_pers(mddev->level, mddev->clevel); 5941 if (!pers || !try_module_get(pers->owner)) { 5942 spin_unlock(&pers_lock); 5943 if (mddev->level != LEVEL_NONE) 5944 pr_warn("md: personality for level %d is not loaded!\n", 5945 mddev->level); 5946 else 5947 pr_warn("md: personality for level %s is not loaded!\n", 5948 mddev->clevel); 5949 err = -EINVAL; 5950 goto abort; 5951 } 5952 spin_unlock(&pers_lock); 5953 if (mddev->level != pers->level) { 5954 mddev->level = pers->level; 5955 mddev->new_level = pers->level; 5956 } 5957 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5958 5959 if (mddev->reshape_position != MaxSector && 5960 pers->start_reshape == NULL) { 5961 /* This personality cannot handle reshaping... */ 5962 module_put(pers->owner); 5963 err = -EINVAL; 5964 goto abort; 5965 } 5966 5967 if (pers->sync_request) { 5968 /* Warn if this is a potentially silly 5969 * configuration. 
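		 * A typical example is two members that are partitions of the
		 * same underlying disk; the nested scan below flags any pair
		 * of rdevs sharing a bd_disk.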
5970 */ 5971 struct md_rdev *rdev2; 5972 int warned = 0; 5973 5974 rdev_for_each(rdev, mddev) 5975 rdev_for_each(rdev2, mddev) { 5976 if (rdev < rdev2 && 5977 rdev->bdev->bd_disk == 5978 rdev2->bdev->bd_disk) { 5979 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5980 mdname(mddev), 5981 rdev->bdev, 5982 rdev2->bdev); 5983 warned = 1; 5984 } 5985 } 5986 5987 if (warned) 5988 pr_warn("True protection against single-disk failure might be compromised.\n"); 5989 } 5990 5991 mddev->recovery = 0; 5992 /* may be over-ridden by personality */ 5993 mddev->resync_max_sectors = mddev->dev_sectors; 5994 5995 mddev->ok_start_degraded = start_dirty_degraded; 5996 5997 if (start_readonly && md_is_rdwr(mddev)) 5998 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 5999 6000 err = pers->run(mddev); 6001 if (err) 6002 pr_warn("md: pers->run() failed ...\n"); 6003 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6004 WARN_ONCE(!mddev->external_size, 6005 "%s: default size too small, but 'external_size' not in effect?\n", 6006 __func__); 6007 pr_warn("md: invalid array_size %llu > default size %llu\n", 6008 (unsigned long long)mddev->array_sectors / 2, 6009 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6010 err = -EINVAL; 6011 } 6012 if (err == 0 && pers->sync_request && 6013 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6014 struct bitmap *bitmap; 6015 6016 bitmap = md_bitmap_create(mddev, -1); 6017 if (IS_ERR(bitmap)) { 6018 err = PTR_ERR(bitmap); 6019 pr_warn("%s: failed to create bitmap (%d)\n", 6020 mdname(mddev), err); 6021 } else 6022 mddev->bitmap = bitmap; 6023 6024 } 6025 if (err) 6026 goto bitmap_abort; 6027 6028 if (mddev->bitmap_info.max_write_behind > 0) { 6029 bool create_pool = false; 6030 6031 rdev_for_each(rdev, mddev) { 6032 if (test_bit(WriteMostly, &rdev->flags) && 6033 rdev_init_serial(rdev)) 6034 create_pool = true; 6035 } 6036 if (create_pool && mddev->serial_info_pool == NULL) { 6037 mddev->serial_info_pool = 6038 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6039 sizeof(struct serial_info)); 6040 if (!mddev->serial_info_pool) { 6041 err = -ENOMEM; 6042 goto bitmap_abort; 6043 } 6044 } 6045 } 6046 6047 if (mddev->queue) { 6048 bool nonrot = true; 6049 6050 rdev_for_each(rdev, mddev) { 6051 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6052 nonrot = false; 6053 break; 6054 } 6055 } 6056 if (mddev->degraded) 6057 nonrot = false; 6058 if (nonrot) 6059 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6060 else 6061 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6062 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6063 6064 /* Set the NOWAIT flags if all underlying devices support it */ 6065 if (nowait) 6066 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6067 } 6068 if (pers->sync_request) { 6069 if (mddev->kobj.sd && 6070 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6071 pr_warn("md: cannot register extra attributes for %s\n", 6072 mdname(mddev)); 6073 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6074 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6075 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6076 } else if (mddev->ro == MD_AUTO_READ) 6077 mddev->ro = MD_RDWR; 6078 6079 atomic_set(&mddev->max_corr_read_errors, 6080 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6081 mddev->safemode = 0; 6082 if (mddev_is_clustered(mddev)) 6083 mddev->safemode_delay = 0; 6084 else 6085 
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6086 mddev->in_sync = 1; 6087 smp_wmb(); 6088 spin_lock(&mddev->lock); 6089 mddev->pers = pers; 6090 spin_unlock(&mddev->lock); 6091 rdev_for_each(rdev, mddev) 6092 if (rdev->raid_disk >= 0) 6093 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6094 6095 if (mddev->degraded && md_is_rdwr(mddev)) 6096 /* This ensures that recovering status is reported immediately 6097 * via sysfs - until a lack of spares is confirmed. 6098 */ 6099 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6100 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6101 6102 if (mddev->sb_flags) 6103 md_update_sb(mddev, 0); 6104 6105 md_new_event(); 6106 return 0; 6107 6108 bitmap_abort: 6109 mddev_detach(mddev); 6110 if (mddev->private) 6111 pers->free(mddev, mddev->private); 6112 mddev->private = NULL; 6113 module_put(pers->owner); 6114 md_bitmap_destroy(mddev); 6115 abort: 6116 bioset_exit(&mddev->io_clone_set); 6117 exit_sync_set: 6118 bioset_exit(&mddev->sync_set); 6119 exit_bio_set: 6120 bioset_exit(&mddev->bio_set); 6121 exit_active_io: 6122 percpu_ref_exit(&mddev->active_io); 6123 return err; 6124 } 6125 EXPORT_SYMBOL_GPL(md_run); 6126 6127 int do_md_run(struct mddev *mddev) 6128 { 6129 int err; 6130 6131 set_bit(MD_NOT_READY, &mddev->flags); 6132 err = md_run(mddev); 6133 if (err) 6134 goto out; 6135 err = md_bitmap_load(mddev); 6136 if (err) { 6137 md_bitmap_destroy(mddev); 6138 goto out; 6139 } 6140 6141 if (mddev_is_clustered(mddev)) 6142 md_allow_write(mddev); 6143 6144 /* run start up tasks that require md_thread */ 6145 md_start(mddev); 6146 6147 md_wakeup_thread(mddev->thread); 6148 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6149 6150 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6151 clear_bit(MD_NOT_READY, &mddev->flags); 6152 mddev->changed = 1; 6153 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6154 sysfs_notify_dirent_safe(mddev->sysfs_state); 6155 sysfs_notify_dirent_safe(mddev->sysfs_action); 6156 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6157 out: 6158 clear_bit(MD_NOT_READY, &mddev->flags); 6159 return err; 6160 } 6161 6162 int md_start(struct mddev *mddev) 6163 { 6164 int ret = 0; 6165 6166 if (mddev->pers->start) { 6167 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6168 md_wakeup_thread(mddev->thread); 6169 ret = mddev->pers->start(mddev); 6170 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6171 md_wakeup_thread(mddev->sync_thread); 6172 } 6173 return ret; 6174 } 6175 EXPORT_SYMBOL_GPL(md_start); 6176 6177 static int restart_array(struct mddev *mddev) 6178 { 6179 struct gendisk *disk = mddev->gendisk; 6180 struct md_rdev *rdev; 6181 bool has_journal = false; 6182 bool has_readonly = false; 6183 6184 /* Complain if it has no devices */ 6185 if (list_empty(&mddev->disks)) 6186 return -ENXIO; 6187 if (!mddev->pers) 6188 return -EINVAL; 6189 if (md_is_rdwr(mddev)) 6190 return -EBUSY; 6191 6192 rcu_read_lock(); 6193 rdev_for_each_rcu(rdev, mddev) { 6194 if (test_bit(Journal, &rdev->flags) && 6195 !test_bit(Faulty, &rdev->flags)) 6196 has_journal = true; 6197 if (rdev_read_only(rdev)) 6198 has_readonly = true; 6199 } 6200 rcu_read_unlock(); 6201 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6202 /* Don't restart rw with journal missing/faulty */ 6203 return -EINVAL; 6204 if (has_readonly) 6205 return -EROFS; 6206 6207 mddev->safemode = 0; 6208 mddev->ro = MD_RDWR; 6209 set_disk_ro(disk, 0); 6210 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6211 /* 
Kick recovery or resync if necessary */ 6212 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6213 md_wakeup_thread(mddev->thread); 6214 md_wakeup_thread(mddev->sync_thread); 6215 sysfs_notify_dirent_safe(mddev->sysfs_state); 6216 return 0; 6217 } 6218 6219 static void md_clean(struct mddev *mddev) 6220 { 6221 mddev->array_sectors = 0; 6222 mddev->external_size = 0; 6223 mddev->dev_sectors = 0; 6224 mddev->raid_disks = 0; 6225 mddev->recovery_cp = 0; 6226 mddev->resync_min = 0; 6227 mddev->resync_max = MaxSector; 6228 mddev->reshape_position = MaxSector; 6229 /* we still need mddev->external in export_rdev, do not clear it yet */ 6230 mddev->persistent = 0; 6231 mddev->level = LEVEL_NONE; 6232 mddev->clevel[0] = 0; 6233 mddev->flags = 0; 6234 mddev->sb_flags = 0; 6235 mddev->ro = MD_RDWR; 6236 mddev->metadata_type[0] = 0; 6237 mddev->chunk_sectors = 0; 6238 mddev->ctime = mddev->utime = 0; 6239 mddev->layout = 0; 6240 mddev->max_disks = 0; 6241 mddev->events = 0; 6242 mddev->can_decrease_events = 0; 6243 mddev->delta_disks = 0; 6244 mddev->reshape_backwards = 0; 6245 mddev->new_level = LEVEL_NONE; 6246 mddev->new_layout = 0; 6247 mddev->new_chunk_sectors = 0; 6248 mddev->curr_resync = MD_RESYNC_NONE; 6249 atomic64_set(&mddev->resync_mismatches, 0); 6250 mddev->suspend_lo = mddev->suspend_hi = 0; 6251 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6252 mddev->recovery = 0; 6253 mddev->in_sync = 0; 6254 mddev->changed = 0; 6255 mddev->degraded = 0; 6256 mddev->safemode = 0; 6257 mddev->private = NULL; 6258 mddev->cluster_info = NULL; 6259 mddev->bitmap_info.offset = 0; 6260 mddev->bitmap_info.default_offset = 0; 6261 mddev->bitmap_info.default_space = 0; 6262 mddev->bitmap_info.chunksize = 0; 6263 mddev->bitmap_info.daemon_sleep = 0; 6264 mddev->bitmap_info.max_write_behind = 0; 6265 mddev->bitmap_info.nodes = 0; 6266 } 6267 6268 static void __md_stop_writes(struct mddev *mddev) 6269 { 6270 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6271 if (work_pending(&mddev->del_work)) 6272 flush_workqueue(md_misc_wq); 6273 if (mddev->sync_thread) { 6274 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6275 md_reap_sync_thread(mddev); 6276 } 6277 6278 del_timer_sync(&mddev->safemode_timer); 6279 6280 if (mddev->pers && mddev->pers->quiesce) { 6281 mddev->pers->quiesce(mddev, 1); 6282 mddev->pers->quiesce(mddev, 0); 6283 } 6284 md_bitmap_flush(mddev); 6285 6286 if (md_is_rdwr(mddev) && 6287 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6288 mddev->sb_flags)) { 6289 /* mark array as shutdown cleanly */ 6290 if (!mddev_is_clustered(mddev)) 6291 mddev->in_sync = 1; 6292 md_update_sb(mddev, 1); 6293 } 6294 /* disable policy to guarantee rdevs free resources for serialization */ 6295 mddev->serialize_policy = 0; 6296 mddev_destroy_serial_pool(mddev, NULL, true); 6297 } 6298 6299 void md_stop_writes(struct mddev *mddev) 6300 { 6301 mddev_lock_nointr(mddev); 6302 __md_stop_writes(mddev); 6303 mddev_unlock(mddev); 6304 } 6305 EXPORT_SYMBOL_GPL(md_stop_writes); 6306 6307 static void mddev_detach(struct mddev *mddev) 6308 { 6309 md_bitmap_wait_behind_writes(mddev); 6310 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6311 mddev->pers->quiesce(mddev, 1); 6312 mddev->pers->quiesce(mddev, 0); 6313 } 6314 md_unregister_thread(mddev, &mddev->thread); 6315 if (mddev->queue) 6316 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6317 } 6318 6319 static void __md_stop(struct mddev *mddev) 6320 { 6321 struct md_personality *pers = mddev->pers; 6322 md_bitmap_destroy(mddev); 
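	/*
	 * Quiesce the array and stop its thread before the personality is
	 * freed below; mddev_detach() also syncs the request queue so the
	 * unplug callback cannot run against a freed conf.
	 */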
6323 mddev_detach(mddev); 6324 /* Ensure ->event_work is done */ 6325 if (mddev->event_work.func) 6326 flush_workqueue(md_misc_wq); 6327 spin_lock(&mddev->lock); 6328 mddev->pers = NULL; 6329 spin_unlock(&mddev->lock); 6330 if (mddev->private) 6331 pers->free(mddev, mddev->private); 6332 mddev->private = NULL; 6333 if (pers->sync_request && mddev->to_remove == NULL) 6334 mddev->to_remove = &md_redundancy_group; 6335 module_put(pers->owner); 6336 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6337 6338 percpu_ref_exit(&mddev->active_io); 6339 bioset_exit(&mddev->bio_set); 6340 bioset_exit(&mddev->sync_set); 6341 bioset_exit(&mddev->io_clone_set); 6342 } 6343 6344 void md_stop(struct mddev *mddev) 6345 { 6346 lockdep_assert_held(&mddev->reconfig_mutex); 6347 6348 /* stop the array and free an attached data structures. 6349 * This is called from dm-raid 6350 */ 6351 __md_stop_writes(mddev); 6352 __md_stop(mddev); 6353 percpu_ref_exit(&mddev->writes_pending); 6354 } 6355 6356 EXPORT_SYMBOL_GPL(md_stop); 6357 6358 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6359 { 6360 int err = 0; 6361 int did_freeze = 0; 6362 6363 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6364 return -EBUSY; 6365 6366 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6367 did_freeze = 1; 6368 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6369 md_wakeup_thread(mddev->thread); 6370 } 6371 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6372 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6373 6374 /* 6375 * Thread might be blocked waiting for metadata update which will now 6376 * never happen 6377 */ 6378 md_wakeup_thread_directly(mddev->sync_thread); 6379 6380 mddev_unlock(mddev); 6381 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 6382 &mddev->recovery)); 6383 wait_event(mddev->sb_wait, 6384 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6385 mddev_lock_nointr(mddev); 6386 6387 mutex_lock(&mddev->open_mutex); 6388 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6389 mddev->sync_thread || 6390 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6391 pr_warn("md: %s still in use.\n",mdname(mddev)); 6392 err = -EBUSY; 6393 goto out; 6394 } 6395 6396 if (mddev->pers) { 6397 __md_stop_writes(mddev); 6398 6399 if (mddev->ro == MD_RDONLY) { 6400 err = -ENXIO; 6401 goto out; 6402 } 6403 6404 mddev->ro = MD_RDONLY; 6405 set_disk_ro(mddev->gendisk, 1); 6406 } 6407 6408 out: 6409 if ((mddev->pers && !err) || did_freeze) { 6410 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6411 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6412 md_wakeup_thread(mddev->thread); 6413 sysfs_notify_dirent_safe(mddev->sysfs_state); 6414 } 6415 6416 mutex_unlock(&mddev->open_mutex); 6417 return err; 6418 } 6419 6420 /* mode: 6421 * 0 - completely stop and dis-assemble array 6422 * 2 - stop but do not disassemble array 6423 */ 6424 static int do_md_stop(struct mddev *mddev, int mode, 6425 struct block_device *bdev) 6426 { 6427 struct gendisk *disk = mddev->gendisk; 6428 struct md_rdev *rdev; 6429 int did_freeze = 0; 6430 6431 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6432 did_freeze = 1; 6433 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6434 md_wakeup_thread(mddev->thread); 6435 } 6436 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6437 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6438 6439 /* 6440 * Thread might be blocked waiting for metadata update which will now 6441 * never happen 6442 */ 6443 
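	/*
	 * Note that reconfig_mutex is dropped across the wait below; the
	 * sync thread is normally reaped with that mutex held, so waiting
	 * for it while still holding the lock could never finish. The
	 * relevant state is re-checked under open_mutex once it is retaken.
	 */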
md_wakeup_thread_directly(mddev->sync_thread); 6444 6445 mddev_unlock(mddev); 6446 wait_event(resync_wait, (mddev->sync_thread == NULL && 6447 !test_bit(MD_RECOVERY_RUNNING, 6448 &mddev->recovery))); 6449 mddev_lock_nointr(mddev); 6450 6451 mutex_lock(&mddev->open_mutex); 6452 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6453 mddev->sysfs_active || 6454 mddev->sync_thread || 6455 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6456 pr_warn("md: %s still in use.\n",mdname(mddev)); 6457 mutex_unlock(&mddev->open_mutex); 6458 if (did_freeze) { 6459 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6460 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6461 md_wakeup_thread(mddev->thread); 6462 } 6463 return -EBUSY; 6464 } 6465 if (mddev->pers) { 6466 if (!md_is_rdwr(mddev)) 6467 set_disk_ro(disk, 0); 6468 6469 __md_stop_writes(mddev); 6470 __md_stop(mddev); 6471 6472 /* tell userspace to handle 'inactive' */ 6473 sysfs_notify_dirent_safe(mddev->sysfs_state); 6474 6475 rdev_for_each(rdev, mddev) 6476 if (rdev->raid_disk >= 0) 6477 sysfs_unlink_rdev(mddev, rdev); 6478 6479 set_capacity_and_notify(disk, 0); 6480 mutex_unlock(&mddev->open_mutex); 6481 mddev->changed = 1; 6482 6483 if (!md_is_rdwr(mddev)) 6484 mddev->ro = MD_RDWR; 6485 } else 6486 mutex_unlock(&mddev->open_mutex); 6487 /* 6488 * Free resources if final stop 6489 */ 6490 if (mode == 0) { 6491 pr_info("md: %s stopped.\n", mdname(mddev)); 6492 6493 if (mddev->bitmap_info.file) { 6494 struct file *f = mddev->bitmap_info.file; 6495 spin_lock(&mddev->lock); 6496 mddev->bitmap_info.file = NULL; 6497 spin_unlock(&mddev->lock); 6498 fput(f); 6499 } 6500 mddev->bitmap_info.offset = 0; 6501 6502 export_array(mddev); 6503 6504 md_clean(mddev); 6505 if (mddev->hold_active == UNTIL_STOP) 6506 mddev->hold_active = 0; 6507 } 6508 md_new_event(); 6509 sysfs_notify_dirent_safe(mddev->sysfs_state); 6510 return 0; 6511 } 6512 6513 #ifndef MODULE 6514 static void autorun_array(struct mddev *mddev) 6515 { 6516 struct md_rdev *rdev; 6517 int err; 6518 6519 if (list_empty(&mddev->disks)) 6520 return; 6521 6522 pr_info("md: running: "); 6523 6524 rdev_for_each(rdev, mddev) { 6525 pr_cont("<%pg>", rdev->bdev); 6526 } 6527 pr_cont("\n"); 6528 6529 err = do_md_run(mddev); 6530 if (err) { 6531 pr_warn("md: do_md_run() returned %d\n", err); 6532 do_md_stop(mddev, 0, NULL); 6533 } 6534 } 6535 6536 /* 6537 * lets try to run arrays based on all disks that have arrived 6538 * until now. (those are in pending_raid_disks) 6539 * 6540 * the method: pick the first pending disk, collect all disks with 6541 * the same UUID, remove all from the pending list and put them into 6542 * the 'same_array' list. Then order this list based on superblock 6543 * update time (freshest comes first), kick out 'old' disks and 6544 * compare superblocks. If everything's fine then run it. 
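 * For example, if sda1 and sdb1 carry one 0.90 superblock UUID and sdc1
 * a different one, the first pass moves sda1 and sdb1 onto 'candidates'
 * and tries to assemble them, leaving sdc1 on pending_raid_disks for the
 * next pass.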
6545 * 6546 * If "unit" is allocated, then bump its reference count 6547 */ 6548 static void autorun_devices(int part) 6549 { 6550 struct md_rdev *rdev0, *rdev, *tmp; 6551 struct mddev *mddev; 6552 6553 pr_info("md: autorun ...\n"); 6554 while (!list_empty(&pending_raid_disks)) { 6555 int unit; 6556 dev_t dev; 6557 LIST_HEAD(candidates); 6558 rdev0 = list_entry(pending_raid_disks.next, 6559 struct md_rdev, same_set); 6560 6561 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6562 INIT_LIST_HEAD(&candidates); 6563 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6564 if (super_90_load(rdev, rdev0, 0) >= 0) { 6565 pr_debug("md: adding %pg ...\n", 6566 rdev->bdev); 6567 list_move(&rdev->same_set, &candidates); 6568 } 6569 /* 6570 * now we have a set of devices, with all of them having 6571 * mostly sane superblocks. It's time to allocate the 6572 * mddev. 6573 */ 6574 if (part) { 6575 dev = MKDEV(mdp_major, 6576 rdev0->preferred_minor << MdpMinorShift); 6577 unit = MINOR(dev) >> MdpMinorShift; 6578 } else { 6579 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6580 unit = MINOR(dev); 6581 } 6582 if (rdev0->preferred_minor != unit) { 6583 pr_warn("md: unit number in %pg is bad: %d\n", 6584 rdev0->bdev, rdev0->preferred_minor); 6585 break; 6586 } 6587 6588 mddev = md_alloc(dev, NULL); 6589 if (IS_ERR(mddev)) 6590 break; 6591 6592 if (mddev_lock(mddev)) 6593 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6594 else if (mddev->raid_disks || mddev->major_version 6595 || !list_empty(&mddev->disks)) { 6596 pr_warn("md: %s already running, cannot run %pg\n", 6597 mdname(mddev), rdev0->bdev); 6598 mddev_unlock(mddev); 6599 } else { 6600 pr_debug("md: created %s\n", mdname(mddev)); 6601 mddev->persistent = 1; 6602 rdev_for_each_list(rdev, tmp, &candidates) { 6603 list_del_init(&rdev->same_set); 6604 if (bind_rdev_to_array(rdev, mddev)) 6605 export_rdev(rdev, mddev); 6606 } 6607 autorun_array(mddev); 6608 mddev_unlock(mddev); 6609 } 6610 /* on success, candidates will be empty, on error 6611 * it won't... 6612 */ 6613 rdev_for_each_list(rdev, tmp, &candidates) { 6614 list_del_init(&rdev->same_set); 6615 export_rdev(rdev, mddev); 6616 } 6617 mddev_put(mddev); 6618 } 6619 pr_info("md: ... 
autorun DONE.\n"); 6620 } 6621 #endif /* !MODULE */ 6622 6623 static int get_version(void __user *arg) 6624 { 6625 mdu_version_t ver; 6626 6627 ver.major = MD_MAJOR_VERSION; 6628 ver.minor = MD_MINOR_VERSION; 6629 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6630 6631 if (copy_to_user(arg, &ver, sizeof(ver))) 6632 return -EFAULT; 6633 6634 return 0; 6635 } 6636 6637 static int get_array_info(struct mddev *mddev, void __user *arg) 6638 { 6639 mdu_array_info_t info; 6640 int nr,working,insync,failed,spare; 6641 struct md_rdev *rdev; 6642 6643 nr = working = insync = failed = spare = 0; 6644 rcu_read_lock(); 6645 rdev_for_each_rcu(rdev, mddev) { 6646 nr++; 6647 if (test_bit(Faulty, &rdev->flags)) 6648 failed++; 6649 else { 6650 working++; 6651 if (test_bit(In_sync, &rdev->flags)) 6652 insync++; 6653 else if (test_bit(Journal, &rdev->flags)) 6654 /* TODO: add journal count to md_u.h */ 6655 ; 6656 else 6657 spare++; 6658 } 6659 } 6660 rcu_read_unlock(); 6661 6662 info.major_version = mddev->major_version; 6663 info.minor_version = mddev->minor_version; 6664 info.patch_version = MD_PATCHLEVEL_VERSION; 6665 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6666 info.level = mddev->level; 6667 info.size = mddev->dev_sectors / 2; 6668 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6669 info.size = -1; 6670 info.nr_disks = nr; 6671 info.raid_disks = mddev->raid_disks; 6672 info.md_minor = mddev->md_minor; 6673 info.not_persistent= !mddev->persistent; 6674 6675 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6676 info.state = 0; 6677 if (mddev->in_sync) 6678 info.state = (1<<MD_SB_CLEAN); 6679 if (mddev->bitmap && mddev->bitmap_info.offset) 6680 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6681 if (mddev_is_clustered(mddev)) 6682 info.state |= (1<<MD_SB_CLUSTERED); 6683 info.active_disks = insync; 6684 info.working_disks = working; 6685 info.failed_disks = failed; 6686 info.spare_disks = spare; 6687 6688 info.layout = mddev->layout; 6689 info.chunk_size = mddev->chunk_sectors << 9; 6690 6691 if (copy_to_user(arg, &info, sizeof(info))) 6692 return -EFAULT; 6693 6694 return 0; 6695 } 6696 6697 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6698 { 6699 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6700 char *ptr; 6701 int err; 6702 6703 file = kzalloc(sizeof(*file), GFP_NOIO); 6704 if (!file) 6705 return -ENOMEM; 6706 6707 err = 0; 6708 spin_lock(&mddev->lock); 6709 /* bitmap enabled */ 6710 if (mddev->bitmap_info.file) { 6711 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6712 sizeof(file->pathname)); 6713 if (IS_ERR(ptr)) 6714 err = PTR_ERR(ptr); 6715 else 6716 memmove(file->pathname, ptr, 6717 sizeof(file->pathname)-(ptr-file->pathname)); 6718 } 6719 spin_unlock(&mddev->lock); 6720 6721 if (err == 0 && 6722 copy_to_user(arg, file, sizeof(*file))) 6723 err = -EFAULT; 6724 6725 kfree(file); 6726 return err; 6727 } 6728 6729 static int get_disk_info(struct mddev *mddev, void __user * arg) 6730 { 6731 mdu_disk_info_t info; 6732 struct md_rdev *rdev; 6733 6734 if (copy_from_user(&info, arg, sizeof(info))) 6735 return -EFAULT; 6736 6737 rcu_read_lock(); 6738 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6739 if (rdev) { 6740 info.major = MAJOR(rdev->bdev->bd_dev); 6741 info.minor = MINOR(rdev->bdev->bd_dev); 6742 info.raid_disk = rdev->raid_disk; 6743 info.state = 0; 6744 if (test_bit(Faulty, &rdev->flags)) 6745 info.state |= (1<<MD_DISK_FAULTY); 6746 else if (test_bit(In_sync, &rdev->flags)) { 6747 info.state |= (1<<MD_DISK_ACTIVE); 
6748 info.state |= (1<<MD_DISK_SYNC); 6749 } 6750 if (test_bit(Journal, &rdev->flags)) 6751 info.state |= (1<<MD_DISK_JOURNAL); 6752 if (test_bit(WriteMostly, &rdev->flags)) 6753 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6754 if (test_bit(FailFast, &rdev->flags)) 6755 info.state |= (1<<MD_DISK_FAILFAST); 6756 } else { 6757 info.major = info.minor = 0; 6758 info.raid_disk = -1; 6759 info.state = (1<<MD_DISK_REMOVED); 6760 } 6761 rcu_read_unlock(); 6762 6763 if (copy_to_user(arg, &info, sizeof(info))) 6764 return -EFAULT; 6765 6766 return 0; 6767 } 6768 6769 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6770 { 6771 struct md_rdev *rdev; 6772 dev_t dev = MKDEV(info->major,info->minor); 6773 6774 if (mddev_is_clustered(mddev) && 6775 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6776 pr_warn("%s: Cannot add to clustered mddev.\n", 6777 mdname(mddev)); 6778 return -EINVAL; 6779 } 6780 6781 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6782 return -EOVERFLOW; 6783 6784 if (!mddev->raid_disks) { 6785 int err; 6786 /* expecting a device which has a superblock */ 6787 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6788 if (IS_ERR(rdev)) { 6789 pr_warn("md: md_import_device returned %ld\n", 6790 PTR_ERR(rdev)); 6791 return PTR_ERR(rdev); 6792 } 6793 if (!list_empty(&mddev->disks)) { 6794 struct md_rdev *rdev0 6795 = list_entry(mddev->disks.next, 6796 struct md_rdev, same_set); 6797 err = super_types[mddev->major_version] 6798 .load_super(rdev, rdev0, mddev->minor_version); 6799 if (err < 0) { 6800 pr_warn("md: %pg has different UUID to %pg\n", 6801 rdev->bdev, 6802 rdev0->bdev); 6803 export_rdev(rdev, mddev); 6804 return -EINVAL; 6805 } 6806 } 6807 err = bind_rdev_to_array(rdev, mddev); 6808 if (err) 6809 export_rdev(rdev, mddev); 6810 return err; 6811 } 6812 6813 /* 6814 * md_add_new_disk can be used once the array is assembled 6815 * to add "hot spares". They must already have a superblock 6816 * written 6817 */ 6818 if (mddev->pers) { 6819 int err; 6820 if (!mddev->pers->hot_add_disk) { 6821 pr_warn("%s: personality does not support diskops!\n", 6822 mdname(mddev)); 6823 return -EINVAL; 6824 } 6825 if (mddev->persistent) 6826 rdev = md_import_device(dev, mddev->major_version, 6827 mddev->minor_version); 6828 else 6829 rdev = md_import_device(dev, -1, -1); 6830 if (IS_ERR(rdev)) { 6831 pr_warn("md: md_import_device returned %ld\n", 6832 PTR_ERR(rdev)); 6833 return PTR_ERR(rdev); 6834 } 6835 /* set saved_raid_disk if appropriate */ 6836 if (!mddev->persistent) { 6837 if (info->state & (1<<MD_DISK_SYNC) && 6838 info->raid_disk < mddev->raid_disks) { 6839 rdev->raid_disk = info->raid_disk; 6840 clear_bit(Bitmap_sync, &rdev->flags); 6841 } else 6842 rdev->raid_disk = -1; 6843 rdev->saved_raid_disk = rdev->raid_disk; 6844 } else 6845 super_types[mddev->major_version]. 6846 validate_super(mddev, NULL/*freshest*/, rdev); 6847 if ((info->state & (1<<MD_DISK_SYNC)) && 6848 rdev->raid_disk != info->raid_disk) { 6849 /* This was a hot-add request, but events doesn't 6850 * match, so reject it. 
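		 * (For a persistent array, validate_super() has just set
		 * rdev->raid_disk from the on-disk metadata; if that does not
		 * agree with the slot userspace asked for, the superblock is
		 * out of date.)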
6851 */ 6852 export_rdev(rdev, mddev); 6853 return -EINVAL; 6854 } 6855 6856 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6857 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6858 set_bit(WriteMostly, &rdev->flags); 6859 else 6860 clear_bit(WriteMostly, &rdev->flags); 6861 if (info->state & (1<<MD_DISK_FAILFAST)) 6862 set_bit(FailFast, &rdev->flags); 6863 else 6864 clear_bit(FailFast, &rdev->flags); 6865 6866 if (info->state & (1<<MD_DISK_JOURNAL)) { 6867 struct md_rdev *rdev2; 6868 bool has_journal = false; 6869 6870 /* make sure no existing journal disk */ 6871 rdev_for_each(rdev2, mddev) { 6872 if (test_bit(Journal, &rdev2->flags)) { 6873 has_journal = true; 6874 break; 6875 } 6876 } 6877 if (has_journal || mddev->bitmap) { 6878 export_rdev(rdev, mddev); 6879 return -EBUSY; 6880 } 6881 set_bit(Journal, &rdev->flags); 6882 } 6883 /* 6884 * check whether the device shows up in other nodes 6885 */ 6886 if (mddev_is_clustered(mddev)) { 6887 if (info->state & (1 << MD_DISK_CANDIDATE)) 6888 set_bit(Candidate, &rdev->flags); 6889 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6890 /* --add initiated by this node */ 6891 err = md_cluster_ops->add_new_disk(mddev, rdev); 6892 if (err) { 6893 export_rdev(rdev, mddev); 6894 return err; 6895 } 6896 } 6897 } 6898 6899 rdev->raid_disk = -1; 6900 err = bind_rdev_to_array(rdev, mddev); 6901 6902 if (err) 6903 export_rdev(rdev, mddev); 6904 6905 if (mddev_is_clustered(mddev)) { 6906 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6907 if (!err) { 6908 err = md_cluster_ops->new_disk_ack(mddev, 6909 err == 0); 6910 if (err) 6911 md_kick_rdev_from_array(rdev); 6912 } 6913 } else { 6914 if (err) 6915 md_cluster_ops->add_new_disk_cancel(mddev); 6916 else 6917 err = add_bound_rdev(rdev); 6918 } 6919 6920 } else if (!err) 6921 err = add_bound_rdev(rdev); 6922 6923 return err; 6924 } 6925 6926 /* otherwise, md_add_new_disk is only allowed 6927 * for major_version==0 superblocks 6928 */ 6929 if (mddev->major_version != 0) { 6930 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6931 return -EINVAL; 6932 } 6933 6934 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6935 int err; 6936 rdev = md_import_device(dev, -1, 0); 6937 if (IS_ERR(rdev)) { 6938 pr_warn("md: error, md_import_device() returned %ld\n", 6939 PTR_ERR(rdev)); 6940 return PTR_ERR(rdev); 6941 } 6942 rdev->desc_nr = info->number; 6943 if (info->raid_disk < mddev->raid_disks) 6944 rdev->raid_disk = info->raid_disk; 6945 else 6946 rdev->raid_disk = -1; 6947 6948 if (rdev->raid_disk < mddev->raid_disks) 6949 if (info->state & (1<<MD_DISK_SYNC)) 6950 set_bit(In_sync, &rdev->flags); 6951 6952 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6953 set_bit(WriteMostly, &rdev->flags); 6954 if (info->state & (1<<MD_DISK_FAILFAST)) 6955 set_bit(FailFast, &rdev->flags); 6956 6957 if (!mddev->persistent) { 6958 pr_debug("md: nonpersistent superblock ...\n"); 6959 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6960 } else 6961 rdev->sb_start = calc_dev_sboffset(rdev); 6962 rdev->sectors = rdev->sb_start; 6963 6964 err = bind_rdev_to_array(rdev, mddev); 6965 if (err) { 6966 export_rdev(rdev, mddev); 6967 return err; 6968 } 6969 } 6970 6971 return 0; 6972 } 6973 6974 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6975 { 6976 struct md_rdev *rdev; 6977 6978 if (!mddev->pers) 6979 return -ENODEV; 6980 6981 rdev = find_rdev(mddev, dev); 6982 if (!rdev) 6983 return -ENXIO; 6984 6985 if (rdev->raid_disk < 0) 6986 goto kick_rdev; 6987 6988 clear_bit(Blocked, &rdev->flags); 6989 
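	/*
	 * Try to detach just this device; if it still holds a raid_disk
	 * slot afterwards it could not be released and the request is
	 * rejected as busy below.
	 */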
remove_and_add_spares(mddev, rdev); 6990 6991 if (rdev->raid_disk >= 0) 6992 goto busy; 6993 6994 kick_rdev: 6995 if (mddev_is_clustered(mddev)) { 6996 if (md_cluster_ops->remove_disk(mddev, rdev)) 6997 goto busy; 6998 } 6999 7000 md_kick_rdev_from_array(rdev); 7001 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7002 if (mddev->thread) 7003 md_wakeup_thread(mddev->thread); 7004 else 7005 md_update_sb(mddev, 1); 7006 md_new_event(); 7007 7008 return 0; 7009 busy: 7010 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7011 rdev->bdev, mdname(mddev)); 7012 return -EBUSY; 7013 } 7014 7015 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7016 { 7017 int err; 7018 struct md_rdev *rdev; 7019 7020 if (!mddev->pers) 7021 return -ENODEV; 7022 7023 if (mddev->major_version != 0) { 7024 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7025 mdname(mddev)); 7026 return -EINVAL; 7027 } 7028 if (!mddev->pers->hot_add_disk) { 7029 pr_warn("%s: personality does not support diskops!\n", 7030 mdname(mddev)); 7031 return -EINVAL; 7032 } 7033 7034 rdev = md_import_device(dev, -1, 0); 7035 if (IS_ERR(rdev)) { 7036 pr_warn("md: error, md_import_device() returned %ld\n", 7037 PTR_ERR(rdev)); 7038 return -EINVAL; 7039 } 7040 7041 if (mddev->persistent) 7042 rdev->sb_start = calc_dev_sboffset(rdev); 7043 else 7044 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7045 7046 rdev->sectors = rdev->sb_start; 7047 7048 if (test_bit(Faulty, &rdev->flags)) { 7049 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7050 rdev->bdev, mdname(mddev)); 7051 err = -EINVAL; 7052 goto abort_export; 7053 } 7054 7055 clear_bit(In_sync, &rdev->flags); 7056 rdev->desc_nr = -1; 7057 rdev->saved_raid_disk = -1; 7058 err = bind_rdev_to_array(rdev, mddev); 7059 if (err) 7060 goto abort_export; 7061 7062 /* 7063 * The rest should better be atomic, we can have disk failures 7064 * noticed in interrupt contexts ... 7065 */ 7066 7067 rdev->raid_disk = -1; 7068 7069 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7070 if (!mddev->thread) 7071 md_update_sb(mddev, 1); 7072 /* 7073 * If the new disk does not support REQ_NOWAIT, 7074 * disable on the whole MD. 7075 */ 7076 if (!bdev_nowait(rdev->bdev)) { 7077 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7078 mdname(mddev), rdev->bdev); 7079 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7080 } 7081 /* 7082 * Kick recovery, maybe this spare has to be added to the 7083 * array immediately. 7084 */ 7085 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7086 md_wakeup_thread(mddev->thread); 7087 md_new_event(); 7088 return 0; 7089 7090 abort_export: 7091 export_rdev(rdev, mddev); 7092 return err; 7093 } 7094 7095 static int set_bitmap_file(struct mddev *mddev, int fd) 7096 { 7097 int err = 0; 7098 7099 if (mddev->pers) { 7100 if (!mddev->pers->quiesce || !mddev->thread) 7101 return -EBUSY; 7102 if (mddev->recovery || mddev->sync_thread) 7103 return -EBUSY; 7104 /* we should be able to change the bitmap.. 
*/ 7105 } 7106 7107 if (fd >= 0) { 7108 struct inode *inode; 7109 struct file *f; 7110 7111 if (mddev->bitmap || mddev->bitmap_info.file) 7112 return -EEXIST; /* cannot add when bitmap is present */ 7113 7114 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7115 pr_warn("%s: bitmap files not supported by this kernel\n", 7116 mdname(mddev)); 7117 return -EINVAL; 7118 } 7119 pr_warn("%s: using deprecated bitmap file support\n", 7120 mdname(mddev)); 7121 7122 f = fget(fd); 7123 7124 if (f == NULL) { 7125 pr_warn("%s: error: failed to get bitmap file\n", 7126 mdname(mddev)); 7127 return -EBADF; 7128 } 7129 7130 inode = f->f_mapping->host; 7131 if (!S_ISREG(inode->i_mode)) { 7132 pr_warn("%s: error: bitmap file must be a regular file\n", 7133 mdname(mddev)); 7134 err = -EBADF; 7135 } else if (!(f->f_mode & FMODE_WRITE)) { 7136 pr_warn("%s: error: bitmap file must open for write\n", 7137 mdname(mddev)); 7138 err = -EBADF; 7139 } else if (atomic_read(&inode->i_writecount) != 1) { 7140 pr_warn("%s: error: bitmap file is already in use\n", 7141 mdname(mddev)); 7142 err = -EBUSY; 7143 } 7144 if (err) { 7145 fput(f); 7146 return err; 7147 } 7148 mddev->bitmap_info.file = f; 7149 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7150 } else if (mddev->bitmap == NULL) 7151 return -ENOENT; /* cannot remove what isn't there */ 7152 err = 0; 7153 if (mddev->pers) { 7154 if (fd >= 0) { 7155 struct bitmap *bitmap; 7156 7157 bitmap = md_bitmap_create(mddev, -1); 7158 mddev_suspend(mddev); 7159 if (!IS_ERR(bitmap)) { 7160 mddev->bitmap = bitmap; 7161 err = md_bitmap_load(mddev); 7162 } else 7163 err = PTR_ERR(bitmap); 7164 if (err) { 7165 md_bitmap_destroy(mddev); 7166 fd = -1; 7167 } 7168 mddev_resume(mddev); 7169 } else if (fd < 0) { 7170 mddev_suspend(mddev); 7171 md_bitmap_destroy(mddev); 7172 mddev_resume(mddev); 7173 } 7174 } 7175 if (fd < 0) { 7176 struct file *f = mddev->bitmap_info.file; 7177 if (f) { 7178 spin_lock(&mddev->lock); 7179 mddev->bitmap_info.file = NULL; 7180 spin_unlock(&mddev->lock); 7181 fput(f); 7182 } 7183 } 7184 7185 return err; 7186 } 7187 7188 /* 7189 * md_set_array_info is used two different ways 7190 * The original usage is when creating a new array. 7191 * In this usage, raid_disks is > 0 and it together with 7192 * level, size, not_persistent,layout,chunksize determine the 7193 * shape of the array. 7194 * This will always create an array with a type-0.90.0 superblock. 7195 * The newer usage is when assembling an array. 7196 * In this case raid_disks will be 0, and the major_version field is 7197 * use to determine which style super-blocks are to be found on the devices. 7198 * The minor and patch _version numbers are also kept incase the 7199 * super_block handler wishes to interpret them. 7200 */ 7201 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7202 { 7203 if (info->raid_disks == 0) { 7204 /* just setting version number for superblock loading */ 7205 if (info->major_version < 0 || 7206 info->major_version >= ARRAY_SIZE(super_types) || 7207 super_types[info->major_version].name == NULL) { 7208 /* maybe try to auto-load a module? */ 7209 pr_warn("md: superblock version %d not known\n", 7210 info->major_version); 7211 return -EINVAL; 7212 } 7213 mddev->major_version = info->major_version; 7214 mddev->minor_version = info->minor_version; 7215 mddev->patch_version = info->patch_version; 7216 mddev->persistent = !info->not_persistent; 7217 /* ensure mddev_put doesn't delete this now that there 7218 * is some minimal configuration. 
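		 * (mddev_put() only schedules deletion of an mddev that has a
		 * zero ctime, no raid_disks and no member devices, so setting
		 * ctime here is enough to keep it alive.)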
7219 */ 7220 mddev->ctime = ktime_get_real_seconds(); 7221 return 0; 7222 } 7223 mddev->major_version = MD_MAJOR_VERSION; 7224 mddev->minor_version = MD_MINOR_VERSION; 7225 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7226 mddev->ctime = ktime_get_real_seconds(); 7227 7228 mddev->level = info->level; 7229 mddev->clevel[0] = 0; 7230 mddev->dev_sectors = 2 * (sector_t)info->size; 7231 mddev->raid_disks = info->raid_disks; 7232 /* don't set md_minor, it is determined by which /dev/md* was 7233 * openned 7234 */ 7235 if (info->state & (1<<MD_SB_CLEAN)) 7236 mddev->recovery_cp = MaxSector; 7237 else 7238 mddev->recovery_cp = 0; 7239 mddev->persistent = ! info->not_persistent; 7240 mddev->external = 0; 7241 7242 mddev->layout = info->layout; 7243 if (mddev->level == 0) 7244 /* Cannot trust RAID0 layout info here */ 7245 mddev->layout = -1; 7246 mddev->chunk_sectors = info->chunk_size >> 9; 7247 7248 if (mddev->persistent) { 7249 mddev->max_disks = MD_SB_DISKS; 7250 mddev->flags = 0; 7251 mddev->sb_flags = 0; 7252 } 7253 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7254 7255 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7256 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7257 mddev->bitmap_info.offset = 0; 7258 7259 mddev->reshape_position = MaxSector; 7260 7261 /* 7262 * Generate a 128 bit UUID 7263 */ 7264 get_random_bytes(mddev->uuid, 16); 7265 7266 mddev->new_level = mddev->level; 7267 mddev->new_chunk_sectors = mddev->chunk_sectors; 7268 mddev->new_layout = mddev->layout; 7269 mddev->delta_disks = 0; 7270 mddev->reshape_backwards = 0; 7271 7272 return 0; 7273 } 7274 7275 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7276 { 7277 lockdep_assert_held(&mddev->reconfig_mutex); 7278 7279 if (mddev->external_size) 7280 return; 7281 7282 mddev->array_sectors = array_sectors; 7283 } 7284 EXPORT_SYMBOL(md_set_array_sectors); 7285 7286 static int update_size(struct mddev *mddev, sector_t num_sectors) 7287 { 7288 struct md_rdev *rdev; 7289 int rv; 7290 int fit = (num_sectors == 0); 7291 sector_t old_dev_sectors = mddev->dev_sectors; 7292 7293 if (mddev->pers->resize == NULL) 7294 return -EINVAL; 7295 /* The "num_sectors" is the number of sectors of each device that 7296 * is used. This can only make sense for arrays with redundancy. 7297 * linear and raid0 always use whatever space is available. We can only 7298 * consider changing this number if no resync or reconstruction is 7299 * happening, and if the new size is acceptable. It must fit before the 7300 * sb_start or, if that is <data_offset, it must fit before the size 7301 * of each device. If num_sectors is zero, we find the largest size 7302 * that fits. 
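	 *
	 * A zero num_sectors (the "grow to the largest size that fits"
	 * case) makes the loop below settle on the smallest rdev->sectors
	 * among the component devices.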
7303 */ 7304 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7305 mddev->sync_thread) 7306 return -EBUSY; 7307 if (!md_is_rdwr(mddev)) 7308 return -EROFS; 7309 7310 rdev_for_each(rdev, mddev) { 7311 sector_t avail = rdev->sectors; 7312 7313 if (fit && (num_sectors == 0 || num_sectors > avail)) 7314 num_sectors = avail; 7315 if (avail < num_sectors) 7316 return -ENOSPC; 7317 } 7318 rv = mddev->pers->resize(mddev, num_sectors); 7319 if (!rv) { 7320 if (mddev_is_clustered(mddev)) 7321 md_cluster_ops->update_size(mddev, old_dev_sectors); 7322 else if (mddev->queue) { 7323 set_capacity_and_notify(mddev->gendisk, 7324 mddev->array_sectors); 7325 } 7326 } 7327 return rv; 7328 } 7329 7330 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7331 { 7332 int rv; 7333 struct md_rdev *rdev; 7334 /* change the number of raid disks */ 7335 if (mddev->pers->check_reshape == NULL) 7336 return -EINVAL; 7337 if (!md_is_rdwr(mddev)) 7338 return -EROFS; 7339 if (raid_disks <= 0 || 7340 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7341 return -EINVAL; 7342 if (mddev->sync_thread || 7343 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7344 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7345 mddev->reshape_position != MaxSector) 7346 return -EBUSY; 7347 7348 rdev_for_each(rdev, mddev) { 7349 if (mddev->raid_disks < raid_disks && 7350 rdev->data_offset < rdev->new_data_offset) 7351 return -EINVAL; 7352 if (mddev->raid_disks > raid_disks && 7353 rdev->data_offset > rdev->new_data_offset) 7354 return -EINVAL; 7355 } 7356 7357 mddev->delta_disks = raid_disks - mddev->raid_disks; 7358 if (mddev->delta_disks < 0) 7359 mddev->reshape_backwards = 1; 7360 else if (mddev->delta_disks > 0) 7361 mddev->reshape_backwards = 0; 7362 7363 rv = mddev->pers->check_reshape(mddev); 7364 if (rv < 0) { 7365 mddev->delta_disks = 0; 7366 mddev->reshape_backwards = 0; 7367 } 7368 return rv; 7369 } 7370 7371 /* 7372 * update_array_info is used to change the configuration of an 7373 * on-line array. 7374 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7375 * fields in the info are checked against the array. 7376 * Any differences that cannot be handled will cause an error. 7377 * Normally, only one change can be managed at a time. 
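 * For example, a single call that changes both raid_disks and the layout
 * is rejected with -EINVAL; the two changes have to be issued one at a
 * time.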
7378 */ 7379 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7380 { 7381 int rv = 0; 7382 int cnt = 0; 7383 int state = 0; 7384 7385 /* calculate expected state,ignoring low bits */ 7386 if (mddev->bitmap && mddev->bitmap_info.offset) 7387 state |= (1 << MD_SB_BITMAP_PRESENT); 7388 7389 if (mddev->major_version != info->major_version || 7390 mddev->minor_version != info->minor_version || 7391 /* mddev->patch_version != info->patch_version || */ 7392 mddev->ctime != info->ctime || 7393 mddev->level != info->level || 7394 /* mddev->layout != info->layout || */ 7395 mddev->persistent != !info->not_persistent || 7396 mddev->chunk_sectors != info->chunk_size >> 9 || 7397 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7398 ((state^info->state) & 0xfffffe00) 7399 ) 7400 return -EINVAL; 7401 /* Check there is only one change */ 7402 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7403 cnt++; 7404 if (mddev->raid_disks != info->raid_disks) 7405 cnt++; 7406 if (mddev->layout != info->layout) 7407 cnt++; 7408 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7409 cnt++; 7410 if (cnt == 0) 7411 return 0; 7412 if (cnt > 1) 7413 return -EINVAL; 7414 7415 if (mddev->layout != info->layout) { 7416 /* Change layout 7417 * we don't need to do anything at the md level, the 7418 * personality will take care of it all. 7419 */ 7420 if (mddev->pers->check_reshape == NULL) 7421 return -EINVAL; 7422 else { 7423 mddev->new_layout = info->layout; 7424 rv = mddev->pers->check_reshape(mddev); 7425 if (rv) 7426 mddev->new_layout = mddev->layout; 7427 return rv; 7428 } 7429 } 7430 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7431 rv = update_size(mddev, (sector_t)info->size * 2); 7432 7433 if (mddev->raid_disks != info->raid_disks) 7434 rv = update_raid_disks(mddev, info->raid_disks); 7435 7436 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7437 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7438 rv = -EINVAL; 7439 goto err; 7440 } 7441 if (mddev->recovery || mddev->sync_thread) { 7442 rv = -EBUSY; 7443 goto err; 7444 } 7445 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7446 struct bitmap *bitmap; 7447 /* add the bitmap */ 7448 if (mddev->bitmap) { 7449 rv = -EEXIST; 7450 goto err; 7451 } 7452 if (mddev->bitmap_info.default_offset == 0) { 7453 rv = -EINVAL; 7454 goto err; 7455 } 7456 mddev->bitmap_info.offset = 7457 mddev->bitmap_info.default_offset; 7458 mddev->bitmap_info.space = 7459 mddev->bitmap_info.default_space; 7460 bitmap = md_bitmap_create(mddev, -1); 7461 mddev_suspend(mddev); 7462 if (!IS_ERR(bitmap)) { 7463 mddev->bitmap = bitmap; 7464 rv = md_bitmap_load(mddev); 7465 } else 7466 rv = PTR_ERR(bitmap); 7467 if (rv) 7468 md_bitmap_destroy(mddev); 7469 mddev_resume(mddev); 7470 } else { 7471 /* remove the bitmap */ 7472 if (!mddev->bitmap) { 7473 rv = -ENOENT; 7474 goto err; 7475 } 7476 if (mddev->bitmap->storage.file) { 7477 rv = -EINVAL; 7478 goto err; 7479 } 7480 if (mddev->bitmap_info.nodes) { 7481 /* hold PW on all the bitmap lock */ 7482 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7483 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7484 rv = -EPERM; 7485 md_cluster_ops->unlock_all_bitmaps(mddev); 7486 goto err; 7487 } 7488 7489 mddev->bitmap_info.nodes = 0; 7490 md_cluster_ops->leave(mddev); 7491 module_put(md_cluster_mod); 7492 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7493 } 7494 mddev_suspend(mddev); 7495 
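			/*
			 * The array is held quiescent across the teardown so
			 * no write is updating bitmap pages while they are
			 * being freed.
			 */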
md_bitmap_destroy(mddev); 7496 mddev_resume(mddev); 7497 mddev->bitmap_info.offset = 0; 7498 } 7499 } 7500 md_update_sb(mddev, 1); 7501 return rv; 7502 err: 7503 return rv; 7504 } 7505 7506 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7507 { 7508 struct md_rdev *rdev; 7509 int err = 0; 7510 7511 if (mddev->pers == NULL) 7512 return -ENODEV; 7513 7514 rcu_read_lock(); 7515 rdev = md_find_rdev_rcu(mddev, dev); 7516 if (!rdev) 7517 err = -ENODEV; 7518 else { 7519 md_error(mddev, rdev); 7520 if (test_bit(MD_BROKEN, &mddev->flags)) 7521 err = -EBUSY; 7522 } 7523 rcu_read_unlock(); 7524 return err; 7525 } 7526 7527 /* 7528 * We have a problem here : there is no easy way to give a CHS 7529 * virtual geometry. We currently pretend that we have a 2 heads 7530 * 4 sectors (with a BIG number of cylinders...). This drives 7531 * dosfs just mad... ;-) 7532 */ 7533 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7534 { 7535 struct mddev *mddev = bdev->bd_disk->private_data; 7536 7537 geo->heads = 2; 7538 geo->sectors = 4; 7539 geo->cylinders = mddev->array_sectors / 8; 7540 return 0; 7541 } 7542 7543 static inline bool md_ioctl_valid(unsigned int cmd) 7544 { 7545 switch (cmd) { 7546 case ADD_NEW_DISK: 7547 case GET_ARRAY_INFO: 7548 case GET_BITMAP_FILE: 7549 case GET_DISK_INFO: 7550 case HOT_ADD_DISK: 7551 case HOT_REMOVE_DISK: 7552 case RAID_VERSION: 7553 case RESTART_ARRAY_RW: 7554 case RUN_ARRAY: 7555 case SET_ARRAY_INFO: 7556 case SET_BITMAP_FILE: 7557 case SET_DISK_FAULTY: 7558 case STOP_ARRAY: 7559 case STOP_ARRAY_RO: 7560 case CLUSTERED_DISK_NACK: 7561 return true; 7562 default: 7563 return false; 7564 } 7565 } 7566 7567 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7568 { 7569 mdu_array_info_t info; 7570 int err; 7571 7572 if (!argp) 7573 memset(&info, 0, sizeof(info)); 7574 else if (copy_from_user(&info, argp, sizeof(info))) 7575 return -EFAULT; 7576 7577 if (mddev->pers) { 7578 err = update_array_info(mddev, &info); 7579 if (err) 7580 pr_warn("md: couldn't update array info. %d\n", err); 7581 return err; 7582 } 7583 7584 if (!list_empty(&mddev->disks)) { 7585 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7586 return -EBUSY; 7587 } 7588 7589 if (mddev->raid_disks) { 7590 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7591 return -EBUSY; 7592 } 7593 7594 err = md_set_array_info(mddev, &info); 7595 if (err) 7596 pr_warn("md: couldn't set array info. 
%d\n", err); 7597 7598 return err; 7599 } 7600 7601 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7602 unsigned int cmd, unsigned long arg) 7603 { 7604 int err = 0; 7605 void __user *argp = (void __user *)arg; 7606 struct mddev *mddev = NULL; 7607 bool did_set_md_closing = false; 7608 7609 if (!md_ioctl_valid(cmd)) 7610 return -ENOTTY; 7611 7612 switch (cmd) { 7613 case RAID_VERSION: 7614 case GET_ARRAY_INFO: 7615 case GET_DISK_INFO: 7616 break; 7617 default: 7618 if (!capable(CAP_SYS_ADMIN)) 7619 return -EACCES; 7620 } 7621 7622 /* 7623 * Commands dealing with the RAID driver but not any 7624 * particular array: 7625 */ 7626 switch (cmd) { 7627 case RAID_VERSION: 7628 err = get_version(argp); 7629 goto out; 7630 default:; 7631 } 7632 7633 /* 7634 * Commands creating/starting a new array: 7635 */ 7636 7637 mddev = bdev->bd_disk->private_data; 7638 7639 if (!mddev) { 7640 BUG(); 7641 goto out; 7642 } 7643 7644 /* Some actions do not requires the mutex */ 7645 switch (cmd) { 7646 case GET_ARRAY_INFO: 7647 if (!mddev->raid_disks && !mddev->external) 7648 err = -ENODEV; 7649 else 7650 err = get_array_info(mddev, argp); 7651 goto out; 7652 7653 case GET_DISK_INFO: 7654 if (!mddev->raid_disks && !mddev->external) 7655 err = -ENODEV; 7656 else 7657 err = get_disk_info(mddev, argp); 7658 goto out; 7659 7660 case SET_DISK_FAULTY: 7661 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7662 goto out; 7663 7664 case GET_BITMAP_FILE: 7665 err = get_bitmap_file(mddev, argp); 7666 goto out; 7667 7668 } 7669 7670 if (cmd == HOT_REMOVE_DISK) 7671 /* need to ensure recovery thread has run */ 7672 wait_event_interruptible_timeout(mddev->sb_wait, 7673 !test_bit(MD_RECOVERY_NEEDED, 7674 &mddev->recovery), 7675 msecs_to_jiffies(5000)); 7676 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7677 /* Need to flush page cache, and ensure no-one else opens 7678 * and writes 7679 */ 7680 mutex_lock(&mddev->open_mutex); 7681 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7682 mutex_unlock(&mddev->open_mutex); 7683 err = -EBUSY; 7684 goto out; 7685 } 7686 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7687 mutex_unlock(&mddev->open_mutex); 7688 err = -EBUSY; 7689 goto out; 7690 } 7691 did_set_md_closing = true; 7692 mutex_unlock(&mddev->open_mutex); 7693 sync_blockdev(bdev); 7694 } 7695 err = mddev_lock(mddev); 7696 if (err) { 7697 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7698 err, cmd); 7699 goto out; 7700 } 7701 7702 if (cmd == SET_ARRAY_INFO) { 7703 err = __md_set_array_info(mddev, argp); 7704 goto unlock; 7705 } 7706 7707 /* 7708 * Commands querying/configuring an existing array: 7709 */ 7710 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7711 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7712 if ((!mddev->raid_disks && !mddev->external) 7713 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7714 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7715 && cmd != GET_BITMAP_FILE) { 7716 err = -ENODEV; 7717 goto unlock; 7718 } 7719 7720 /* 7721 * Commands even a read-only array can execute: 7722 */ 7723 switch (cmd) { 7724 case RESTART_ARRAY_RW: 7725 err = restart_array(mddev); 7726 goto unlock; 7727 7728 case STOP_ARRAY: 7729 err = do_md_stop(mddev, 0, bdev); 7730 goto unlock; 7731 7732 case STOP_ARRAY_RO: 7733 err = md_set_readonly(mddev, bdev); 7734 goto unlock; 7735 7736 case HOT_REMOVE_DISK: 7737 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7738 goto unlock; 7739 7740 case ADD_NEW_DISK: 7741 /* We can support ADD_NEW_DISK on read-only arrays 
7742 * only if we are re-adding a preexisting device. 7743 * So require mddev->pers and MD_DISK_SYNC. 7744 */ 7745 if (mddev->pers) { 7746 mdu_disk_info_t info; 7747 if (copy_from_user(&info, argp, sizeof(info))) 7748 err = -EFAULT; 7749 else if (!(info.state & (1<<MD_DISK_SYNC))) 7750 /* Need to clear read-only for this */ 7751 break; 7752 else 7753 err = md_add_new_disk(mddev, &info); 7754 goto unlock; 7755 } 7756 break; 7757 } 7758 7759 /* 7760 * The remaining ioctls are changing the state of the 7761 * superblock, so we do not allow them on read-only arrays. 7762 */ 7763 if (!md_is_rdwr(mddev) && mddev->pers) { 7764 if (mddev->ro != MD_AUTO_READ) { 7765 err = -EROFS; 7766 goto unlock; 7767 } 7768 mddev->ro = MD_RDWR; 7769 sysfs_notify_dirent_safe(mddev->sysfs_state); 7770 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7771 /* mddev_unlock will wake thread */ 7772 /* If a device failed while we were read-only, we 7773 * need to make sure the metadata is updated now. 7774 */ 7775 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7776 mddev_unlock(mddev); 7777 wait_event(mddev->sb_wait, 7778 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7779 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7780 mddev_lock_nointr(mddev); 7781 } 7782 } 7783 7784 switch (cmd) { 7785 case ADD_NEW_DISK: 7786 { 7787 mdu_disk_info_t info; 7788 if (copy_from_user(&info, argp, sizeof(info))) 7789 err = -EFAULT; 7790 else 7791 err = md_add_new_disk(mddev, &info); 7792 goto unlock; 7793 } 7794 7795 case CLUSTERED_DISK_NACK: 7796 if (mddev_is_clustered(mddev)) 7797 md_cluster_ops->new_disk_ack(mddev, false); 7798 else 7799 err = -EINVAL; 7800 goto unlock; 7801 7802 case HOT_ADD_DISK: 7803 err = hot_add_disk(mddev, new_decode_dev(arg)); 7804 goto unlock; 7805 7806 case RUN_ARRAY: 7807 err = do_md_run(mddev); 7808 goto unlock; 7809 7810 case SET_BITMAP_FILE: 7811 err = set_bitmap_file(mddev, (int)arg); 7812 goto unlock; 7813 7814 default: 7815 err = -EINVAL; 7816 goto unlock; 7817 } 7818 7819 unlock: 7820 if (mddev->hold_active == UNTIL_IOCTL && 7821 err != -EINVAL) 7822 mddev->hold_active = 0; 7823 mddev_unlock(mddev); 7824 out: 7825 if(did_set_md_closing) 7826 clear_bit(MD_CLOSING, &mddev->flags); 7827 return err; 7828 } 7829 #ifdef CONFIG_COMPAT 7830 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7831 unsigned int cmd, unsigned long arg) 7832 { 7833 switch (cmd) { 7834 case HOT_REMOVE_DISK: 7835 case HOT_ADD_DISK: 7836 case SET_DISK_FAULTY: 7837 case SET_BITMAP_FILE: 7838 /* These take in integer arg, do not convert */ 7839 break; 7840 default: 7841 arg = (unsigned long)compat_ptr(arg); 7842 break; 7843 } 7844 7845 return md_ioctl(bdev, mode, cmd, arg); 7846 } 7847 #endif /* CONFIG_COMPAT */ 7848 7849 static int md_set_read_only(struct block_device *bdev, bool ro) 7850 { 7851 struct mddev *mddev = bdev->bd_disk->private_data; 7852 int err; 7853 7854 err = mddev_lock(mddev); 7855 if (err) 7856 return err; 7857 7858 if (!mddev->raid_disks && !mddev->external) { 7859 err = -ENODEV; 7860 goto out_unlock; 7861 } 7862 7863 /* 7864 * Transitioning to read-auto need only happen for arrays that call 7865 * md_write_start and which are not ready for writes yet. 
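	 * In other words, clearing the ro flag (e.g. via BLKROSET) on an
	 * array that is currently MD_RDONLY restarts it and parks it in
	 * MD_AUTO_READ, so the first write upgrades it to fully read-write.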
7866 */ 7867 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7868 err = restart_array(mddev); 7869 if (err) 7870 goto out_unlock; 7871 mddev->ro = MD_AUTO_READ; 7872 } 7873 7874 out_unlock: 7875 mddev_unlock(mddev); 7876 return err; 7877 } 7878 7879 static int md_open(struct gendisk *disk, blk_mode_t mode) 7880 { 7881 struct mddev *mddev; 7882 int err; 7883 7884 spin_lock(&all_mddevs_lock); 7885 mddev = mddev_get(disk->private_data); 7886 spin_unlock(&all_mddevs_lock); 7887 if (!mddev) 7888 return -ENODEV; 7889 7890 err = mutex_lock_interruptible(&mddev->open_mutex); 7891 if (err) 7892 goto out; 7893 7894 err = -ENODEV; 7895 if (test_bit(MD_CLOSING, &mddev->flags)) 7896 goto out_unlock; 7897 7898 atomic_inc(&mddev->openers); 7899 mutex_unlock(&mddev->open_mutex); 7900 7901 disk_check_media_change(disk); 7902 return 0; 7903 7904 out_unlock: 7905 mutex_unlock(&mddev->open_mutex); 7906 out: 7907 mddev_put(mddev); 7908 return err; 7909 } 7910 7911 static void md_release(struct gendisk *disk) 7912 { 7913 struct mddev *mddev = disk->private_data; 7914 7915 BUG_ON(!mddev); 7916 atomic_dec(&mddev->openers); 7917 mddev_put(mddev); 7918 } 7919 7920 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7921 { 7922 struct mddev *mddev = disk->private_data; 7923 unsigned int ret = 0; 7924 7925 if (mddev->changed) 7926 ret = DISK_EVENT_MEDIA_CHANGE; 7927 mddev->changed = 0; 7928 return ret; 7929 } 7930 7931 static void md_free_disk(struct gendisk *disk) 7932 { 7933 struct mddev *mddev = disk->private_data; 7934 7935 percpu_ref_exit(&mddev->writes_pending); 7936 mddev_free(mddev); 7937 } 7938 7939 const struct block_device_operations md_fops = 7940 { 7941 .owner = THIS_MODULE, 7942 .submit_bio = md_submit_bio, 7943 .open = md_open, 7944 .release = md_release, 7945 .ioctl = md_ioctl, 7946 #ifdef CONFIG_COMPAT 7947 .compat_ioctl = md_compat_ioctl, 7948 #endif 7949 .getgeo = md_getgeo, 7950 .check_events = md_check_events, 7951 .set_read_only = md_set_read_only, 7952 .free_disk = md_free_disk, 7953 }; 7954 7955 static int md_thread(void *arg) 7956 { 7957 struct md_thread *thread = arg; 7958 7959 /* 7960 * md_thread is a 'system-thread', it's priority should be very 7961 * high. We avoid resource deadlocks individually in each 7962 * raid personality. (RAID5 does preallocation) We also use RR and 7963 * the very same RT priority as kswapd, thus we will never get 7964 * into a priority inversion deadlock. 7965 * 7966 * we definitely have to have equal or higher priority than 7967 * bdflush, otherwise bdflush will deadlock if there are too 7968 * many dirty RAID5 blocks. 7969 */ 7970 7971 allow_signal(SIGKILL); 7972 while (!kthread_should_stop()) { 7973 7974 /* We need to wait INTERRUPTIBLE so that 7975 * we don't add to the load-average. 
7976 * That means we need to be sure no signals are 7977 * pending 7978 */ 7979 if (signal_pending(current)) 7980 flush_signals(current); 7981 7982 wait_event_interruptible_timeout 7983 (thread->wqueue, 7984 test_bit(THREAD_WAKEUP, &thread->flags) 7985 || kthread_should_stop() || kthread_should_park(), 7986 thread->timeout); 7987 7988 clear_bit(THREAD_WAKEUP, &thread->flags); 7989 if (kthread_should_park()) 7990 kthread_parkme(); 7991 if (!kthread_should_stop()) 7992 thread->run(thread); 7993 } 7994 7995 return 0; 7996 } 7997 7998 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 7999 { 8000 struct md_thread *t; 8001 8002 rcu_read_lock(); 8003 t = rcu_dereference(thread); 8004 if (t) 8005 wake_up_process(t->tsk); 8006 rcu_read_unlock(); 8007 } 8008 8009 void md_wakeup_thread(struct md_thread __rcu *thread) 8010 { 8011 struct md_thread *t; 8012 8013 rcu_read_lock(); 8014 t = rcu_dereference(thread); 8015 if (t) { 8016 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8017 set_bit(THREAD_WAKEUP, &t->flags); 8018 wake_up(&t->wqueue); 8019 } 8020 rcu_read_unlock(); 8021 } 8022 EXPORT_SYMBOL(md_wakeup_thread); 8023 8024 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8025 struct mddev *mddev, const char *name) 8026 { 8027 struct md_thread *thread; 8028 8029 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8030 if (!thread) 8031 return NULL; 8032 8033 init_waitqueue_head(&thread->wqueue); 8034 8035 thread->run = run; 8036 thread->mddev = mddev; 8037 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8038 thread->tsk = kthread_run(md_thread, thread, 8039 "%s_%s", 8040 mdname(thread->mddev), 8041 name); 8042 if (IS_ERR(thread->tsk)) { 8043 kfree(thread); 8044 return NULL; 8045 } 8046 return thread; 8047 } 8048 EXPORT_SYMBOL(md_register_thread); 8049 8050 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8051 { 8052 struct md_thread *thread = rcu_dereference_protected(*threadp, 8053 lockdep_is_held(&mddev->reconfig_mutex)); 8054 8055 if (!thread) 8056 return; 8057 8058 rcu_assign_pointer(*threadp, NULL); 8059 synchronize_rcu(); 8060 8061 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8062 kthread_stop(thread->tsk); 8063 kfree(thread); 8064 } 8065 EXPORT_SYMBOL(md_unregister_thread); 8066 8067 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8068 { 8069 if (!rdev || test_bit(Faulty, &rdev->flags)) 8070 return; 8071 8072 if (!mddev->pers || !mddev->pers->error_handler) 8073 return; 8074 mddev->pers->error_handler(mddev, rdev); 8075 8076 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8077 return; 8078 8079 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8080 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8081 sysfs_notify_dirent_safe(rdev->sysfs_state); 8082 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8083 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8084 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8085 md_wakeup_thread(mddev->thread); 8086 } 8087 if (mddev->event_work.func) 8088 queue_work(md_misc_wq, &mddev->event_work); 8089 md_new_event(); 8090 } 8091 EXPORT_SYMBOL(md_error); 8092 8093 /* seq_file implementation /proc/mdstat */ 8094 8095 static void status_unused(struct seq_file *seq) 8096 { 8097 int i = 0; 8098 struct md_rdev *rdev; 8099 8100 seq_printf(seq, "unused devices: "); 8101 8102 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8103 i++; 8104 seq_printf(seq, "%pg ", rdev->bdev); 8105 } 8106 if (!i) 8107 seq_printf(seq, "<none>"); 8108 
8109 seq_printf(seq, "\n"); 8110 } 8111 8112 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8113 { 8114 sector_t max_sectors, resync, res; 8115 unsigned long dt, db = 0; 8116 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8117 int scale, recovery_active; 8118 unsigned int per_milli; 8119 8120 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8121 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8122 max_sectors = mddev->resync_max_sectors; 8123 else 8124 max_sectors = mddev->dev_sectors; 8125 8126 resync = mddev->curr_resync; 8127 if (resync < MD_RESYNC_ACTIVE) { 8128 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8129 /* Still cleaning up */ 8130 resync = max_sectors; 8131 } else if (resync > max_sectors) { 8132 resync = max_sectors; 8133 } else { 8134 res = atomic_read(&mddev->recovery_active); 8135 /* 8136 * Resync has started, but the subtraction has overflowed or 8137 * yielded one of the special values. Force it to active to 8138 * ensure the status reports an active resync. 8139 */ 8140 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8141 resync = MD_RESYNC_ACTIVE; 8142 else 8143 resync -= res; 8144 } 8145 8146 if (resync == MD_RESYNC_NONE) { 8147 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8148 struct md_rdev *rdev; 8149 8150 rdev_for_each(rdev, mddev) 8151 if (rdev->raid_disk >= 0 && 8152 !test_bit(Faulty, &rdev->flags) && 8153 rdev->recovery_offset != MaxSector && 8154 rdev->recovery_offset) { 8155 seq_printf(seq, "\trecover=REMOTE"); 8156 return 1; 8157 } 8158 if (mddev->reshape_position != MaxSector) 8159 seq_printf(seq, "\treshape=REMOTE"); 8160 else 8161 seq_printf(seq, "\tresync=REMOTE"); 8162 return 1; 8163 } 8164 if (mddev->recovery_cp < MaxSector) { 8165 seq_printf(seq, "\tresync=PENDING"); 8166 return 1; 8167 } 8168 return 0; 8169 } 8170 if (resync < MD_RESYNC_ACTIVE) { 8171 seq_printf(seq, "\tresync=DELAYED"); 8172 return 1; 8173 } 8174 8175 WARN_ON(max_sectors == 0); 8176 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8177 * in a sector_t, and (max_sectors>>scale) will fit in a 8178 * u32, as those are the requirements for sector_div. 8179 * Thus 'scale' must be at least 10 8180 */ 8181 scale = 10; 8182 if (sizeof(sector_t) > sizeof(unsigned long)) { 8183 while ( max_sectors/2 > (1ULL<<(scale+32))) 8184 scale++; 8185 } 8186 res = (resync>>scale)*1000; 8187 sector_div(res, (u32)((max_sectors>>scale)+1)); 8188 8189 per_milli = res; 8190 { 8191 int i, x = per_milli/50, y = 20-x; 8192 seq_printf(seq, "["); 8193 for (i = 0; i < x; i++) 8194 seq_printf(seq, "="); 8195 seq_printf(seq, ">"); 8196 for (i = 0; i < y; i++) 8197 seq_printf(seq, "."); 8198 seq_printf(seq, "] "); 8199 } 8200 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8201 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8202 "reshape" : 8203 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8204 "check" : 8205 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 8206 "resync" : "recovery"))), 8207 per_milli/10, per_milli % 10, 8208 (unsigned long long) resync/2, 8209 (unsigned long long) max_sectors/2); 8210 8211 /* 8212 * dt: time from mark until now 8213 * db: blocks written from mark until now 8214 * rt: remaining time 8215 * 8216 * rt is a sector_t, which is always 64bit now. We are keeping 8217 * the original algorithm, but it is not really necessary. 8218 * 8219 * Original algorithm: 8220 * So we divide before multiply in case it is 32bit and close 8221 * to the limit. 
8222 * We scale the divisor (db) by 32 to avoid losing precision 8223 * near the end of resync when the number of remaining sectors 8224 * is close to 'db'. 8225 * We then divide rt by 32 after multiplying by db to compensate. 8226 * The '+1' avoids division by zero if db is very small. 8227 */ 8228 dt = ((jiffies - mddev->resync_mark) / HZ); 8229 if (!dt) dt++; 8230 8231 curr_mark_cnt = mddev->curr_mark_cnt; 8232 recovery_active = atomic_read(&mddev->recovery_active); 8233 resync_mark_cnt = mddev->resync_mark_cnt; 8234 8235 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8236 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8237 8238 rt = max_sectors - resync; /* number of remaining sectors */ 8239 rt = div64_u64(rt, db/32+1); 8240 rt *= dt; 8241 rt >>= 5; 8242 8243 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8244 ((unsigned long)rt % 60)/6); 8245 8246 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8247 return 1; 8248 } 8249 8250 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8251 { 8252 struct list_head *tmp; 8253 loff_t l = *pos; 8254 struct mddev *mddev; 8255 8256 if (l == 0x10000) { 8257 ++*pos; 8258 return (void *)2; 8259 } 8260 if (l > 0x10000) 8261 return NULL; 8262 if (!l--) 8263 /* header */ 8264 return (void*)1; 8265 8266 spin_lock(&all_mddevs_lock); 8267 list_for_each(tmp,&all_mddevs) 8268 if (!l--) { 8269 mddev = list_entry(tmp, struct mddev, all_mddevs); 8270 if (!mddev_get(mddev)) 8271 continue; 8272 spin_unlock(&all_mddevs_lock); 8273 return mddev; 8274 } 8275 spin_unlock(&all_mddevs_lock); 8276 if (!l--) 8277 return (void*)2;/* tail */ 8278 return NULL; 8279 } 8280 8281 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8282 { 8283 struct list_head *tmp; 8284 struct mddev *next_mddev, *mddev = v; 8285 struct mddev *to_put = NULL; 8286 8287 ++*pos; 8288 if (v == (void*)2) 8289 return NULL; 8290 8291 spin_lock(&all_mddevs_lock); 8292 if (v == (void*)1) { 8293 tmp = all_mddevs.next; 8294 } else { 8295 to_put = mddev; 8296 tmp = mddev->all_mddevs.next; 8297 } 8298 8299 for (;;) { 8300 if (tmp == &all_mddevs) { 8301 next_mddev = (void*)2; 8302 *pos = 0x10000; 8303 break; 8304 } 8305 next_mddev = list_entry(tmp, struct mddev, all_mddevs); 8306 if (mddev_get(next_mddev)) 8307 break; 8308 mddev = next_mddev; 8309 tmp = mddev->all_mddevs.next; 8310 } 8311 spin_unlock(&all_mddevs_lock); 8312 8313 if (to_put) 8314 mddev_put(to_put); 8315 return next_mddev; 8316 8317 } 8318 8319 static void md_seq_stop(struct seq_file *seq, void *v) 8320 { 8321 struct mddev *mddev = v; 8322 8323 if (mddev && v != (void*)1 && v != (void*)2) 8324 mddev_put(mddev); 8325 } 8326 8327 static int md_seq_show(struct seq_file *seq, void *v) 8328 { 8329 struct mddev *mddev = v; 8330 sector_t sectors; 8331 struct md_rdev *rdev; 8332 8333 if (v == (void*)1) { 8334 struct md_personality *pers; 8335 seq_printf(seq, "Personalities : "); 8336 spin_lock(&pers_lock); 8337 list_for_each_entry(pers, &pers_list, list) 8338 seq_printf(seq, "[%s] ", pers->name); 8339 8340 spin_unlock(&pers_lock); 8341 seq_printf(seq, "\n"); 8342 seq->poll_event = atomic_read(&md_event_count); 8343 return 0; 8344 } 8345 if (v == (void*)2) { 8346 status_unused(seq); 8347 return 0; 8348 } 8349 8350 spin_lock(&mddev->lock); 8351 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8352 seq_printf(seq, "%s : %sactive", mdname(mddev), 8353 mddev->pers ? 
"" : "in"); 8354 if (mddev->pers) { 8355 if (mddev->ro == MD_RDONLY) 8356 seq_printf(seq, " (read-only)"); 8357 if (mddev->ro == MD_AUTO_READ) 8358 seq_printf(seq, " (auto-read-only)"); 8359 seq_printf(seq, " %s", mddev->pers->name); 8360 } 8361 8362 sectors = 0; 8363 rcu_read_lock(); 8364 rdev_for_each_rcu(rdev, mddev) { 8365 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8366 8367 if (test_bit(WriteMostly, &rdev->flags)) 8368 seq_printf(seq, "(W)"); 8369 if (test_bit(Journal, &rdev->flags)) 8370 seq_printf(seq, "(J)"); 8371 if (test_bit(Faulty, &rdev->flags)) { 8372 seq_printf(seq, "(F)"); 8373 continue; 8374 } 8375 if (rdev->raid_disk < 0) 8376 seq_printf(seq, "(S)"); /* spare */ 8377 if (test_bit(Replacement, &rdev->flags)) 8378 seq_printf(seq, "(R)"); 8379 sectors += rdev->sectors; 8380 } 8381 rcu_read_unlock(); 8382 8383 if (!list_empty(&mddev->disks)) { 8384 if (mddev->pers) 8385 seq_printf(seq, "\n %llu blocks", 8386 (unsigned long long) 8387 mddev->array_sectors / 2); 8388 else 8389 seq_printf(seq, "\n %llu blocks", 8390 (unsigned long long)sectors / 2); 8391 } 8392 if (mddev->persistent) { 8393 if (mddev->major_version != 0 || 8394 mddev->minor_version != 90) { 8395 seq_printf(seq," super %d.%d", 8396 mddev->major_version, 8397 mddev->minor_version); 8398 } 8399 } else if (mddev->external) 8400 seq_printf(seq, " super external:%s", 8401 mddev->metadata_type); 8402 else 8403 seq_printf(seq, " super non-persistent"); 8404 8405 if (mddev->pers) { 8406 mddev->pers->status(seq, mddev); 8407 seq_printf(seq, "\n "); 8408 if (mddev->pers->sync_request) { 8409 if (status_resync(seq, mddev)) 8410 seq_printf(seq, "\n "); 8411 } 8412 } else 8413 seq_printf(seq, "\n "); 8414 8415 md_bitmap_status(seq, mddev->bitmap); 8416 8417 seq_printf(seq, "\n"); 8418 } 8419 spin_unlock(&mddev->lock); 8420 8421 return 0; 8422 } 8423 8424 static const struct seq_operations md_seq_ops = { 8425 .start = md_seq_start, 8426 .next = md_seq_next, 8427 .stop = md_seq_stop, 8428 .show = md_seq_show, 8429 }; 8430 8431 static int md_seq_open(struct inode *inode, struct file *file) 8432 { 8433 struct seq_file *seq; 8434 int error; 8435 8436 error = seq_open(file, &md_seq_ops); 8437 if (error) 8438 return error; 8439 8440 seq = file->private_data; 8441 seq->poll_event = atomic_read(&md_event_count); 8442 return error; 8443 } 8444 8445 static int md_unloading; 8446 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8447 { 8448 struct seq_file *seq = filp->private_data; 8449 __poll_t mask; 8450 8451 if (md_unloading) 8452 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8453 poll_wait(filp, &md_event_waiters, wait); 8454 8455 /* always allow read */ 8456 mask = EPOLLIN | EPOLLRDNORM; 8457 8458 if (seq->poll_event != atomic_read(&md_event_count)) 8459 mask |= EPOLLERR | EPOLLPRI; 8460 return mask; 8461 } 8462 8463 static const struct proc_ops mdstat_proc_ops = { 8464 .proc_open = md_seq_open, 8465 .proc_read = seq_read, 8466 .proc_lseek = seq_lseek, 8467 .proc_release = seq_release, 8468 .proc_poll = mdstat_poll, 8469 }; 8470 8471 int register_md_personality(struct md_personality *p) 8472 { 8473 pr_debug("md: %s personality registered for level %d\n", 8474 p->name, p->level); 8475 spin_lock(&pers_lock); 8476 list_add_tail(&p->list, &pers_list); 8477 spin_unlock(&pers_lock); 8478 return 0; 8479 } 8480 EXPORT_SYMBOL(register_md_personality); 8481 8482 int unregister_md_personality(struct md_personality *p) 8483 { 8484 pr_debug("md: %s personality unregistered\n", p->name); 8485 spin_lock(&pers_lock); 
8486 list_del_init(&p->list); 8487 spin_unlock(&pers_lock); 8488 return 0; 8489 } 8490 EXPORT_SYMBOL(unregister_md_personality); 8491 8492 int register_md_cluster_operations(struct md_cluster_operations *ops, 8493 struct module *module) 8494 { 8495 int ret = 0; 8496 spin_lock(&pers_lock); 8497 if (md_cluster_ops != NULL) 8498 ret = -EALREADY; 8499 else { 8500 md_cluster_ops = ops; 8501 md_cluster_mod = module; 8502 } 8503 spin_unlock(&pers_lock); 8504 return ret; 8505 } 8506 EXPORT_SYMBOL(register_md_cluster_operations); 8507 8508 int unregister_md_cluster_operations(void) 8509 { 8510 spin_lock(&pers_lock); 8511 md_cluster_ops = NULL; 8512 spin_unlock(&pers_lock); 8513 return 0; 8514 } 8515 EXPORT_SYMBOL(unregister_md_cluster_operations); 8516 8517 int md_setup_cluster(struct mddev *mddev, int nodes) 8518 { 8519 int ret; 8520 if (!md_cluster_ops) 8521 request_module("md-cluster"); 8522 spin_lock(&pers_lock); 8523 /* ensure module won't be unloaded */ 8524 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8525 pr_warn("can't find md-cluster module or get its reference.\n"); 8526 spin_unlock(&pers_lock); 8527 return -ENOENT; 8528 } 8529 spin_unlock(&pers_lock); 8530 8531 ret = md_cluster_ops->join(mddev, nodes); 8532 if (!ret) 8533 mddev->safemode_delay = 0; 8534 return ret; 8535 } 8536 8537 void md_cluster_stop(struct mddev *mddev) 8538 { 8539 if (!md_cluster_ops) 8540 return; 8541 md_cluster_ops->leave(mddev); 8542 module_put(md_cluster_mod); 8543 } 8544 8545 static int is_mddev_idle(struct mddev *mddev, int init) 8546 { 8547 struct md_rdev *rdev; 8548 int idle; 8549 int curr_events; 8550 8551 idle = 1; 8552 rcu_read_lock(); 8553 rdev_for_each_rcu(rdev, mddev) { 8554 struct gendisk *disk = rdev->bdev->bd_disk; 8555 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8556 atomic_read(&disk->sync_io); 8557 /* sync IO will cause sync_io to increase before the disk_stats 8558 * as sync_io is counted when a request starts, and 8559 * disk_stats is counted when it completes. 8560 * So resync activity will cause curr_events to be smaller than 8561 * when there was no such activity. 8562 * non-sync IO will cause disk_stat to increase without 8563 * increasing sync_io so curr_events will (eventually) 8564 * be larger than it was before. Once it becomes 8565 * substantially larger, the test below will cause 8566 * the array to appear non-idle, and resync will slow 8567 * down. 8568 * If there is a lot of outstanding resync activity when 8569 * we set last_event to curr_events, then all that activity 8570 * completing might cause the array to appear non-idle 8571 * and resync will be slowed down even though there might 8572 * not have been non-resync activity. This will only 8573 * happen once though. 'last_events' will soon reflect 8574 * the state where there is little or no outstanding 8575 * resync requests, and further resync activity will 8576 * always make curr_events less than last_events. 
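 *
 * A purely illustrative example of the accounting (figures made up): if
 * 1000 sectors of resync IO are issued and complete, the sectors counter
 * and sync_io both advance by roughly 1000, so curr_events barely moves;
 * 100 sectors of ordinary IO advance only the disk_stats side, leaving
 * curr_events about 100 ahead of last_events, which is enough to trip
 * the '> 64' test below and mark the array as not idle.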
8577 * 8578 */ 8579 if (init || curr_events - rdev->last_events > 64) { 8580 rdev->last_events = curr_events; 8581 idle = 0; 8582 } 8583 } 8584 rcu_read_unlock(); 8585 return idle; 8586 } 8587 8588 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8589 { 8590 /* another "blocks" (512byte) blocks have been synced */ 8591 atomic_sub(blocks, &mddev->recovery_active); 8592 wake_up(&mddev->recovery_wait); 8593 if (!ok) { 8594 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8595 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8596 md_wakeup_thread(mddev->thread); 8597 // stop recovery, signal do_sync .... 8598 } 8599 } 8600 EXPORT_SYMBOL(md_done_sync); 8601 8602 /* md_write_start(mddev, bi) 8603 * If we need to update some array metadata (e.g. 'active' flag 8604 * in superblock) before writing, schedule a superblock update 8605 * and wait for it to complete. 8606 * A return value of 'false' means that the write wasn't recorded 8607 * and cannot proceed as the array is being suspend. 8608 */ 8609 bool md_write_start(struct mddev *mddev, struct bio *bi) 8610 { 8611 int did_change = 0; 8612 8613 if (bio_data_dir(bi) != WRITE) 8614 return true; 8615 8616 BUG_ON(mddev->ro == MD_RDONLY); 8617 if (mddev->ro == MD_AUTO_READ) { 8618 /* need to switch to read/write */ 8619 mddev->ro = MD_RDWR; 8620 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8621 md_wakeup_thread(mddev->thread); 8622 md_wakeup_thread(mddev->sync_thread); 8623 did_change = 1; 8624 } 8625 rcu_read_lock(); 8626 percpu_ref_get(&mddev->writes_pending); 8627 smp_mb(); /* Match smp_mb in set_in_sync() */ 8628 if (mddev->safemode == 1) 8629 mddev->safemode = 0; 8630 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8631 if (mddev->in_sync || mddev->sync_checkers) { 8632 spin_lock(&mddev->lock); 8633 if (mddev->in_sync) { 8634 mddev->in_sync = 0; 8635 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8636 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8637 md_wakeup_thread(mddev->thread); 8638 did_change = 1; 8639 } 8640 spin_unlock(&mddev->lock); 8641 } 8642 rcu_read_unlock(); 8643 if (did_change) 8644 sysfs_notify_dirent_safe(mddev->sysfs_state); 8645 if (!mddev->has_superblocks) 8646 return true; 8647 wait_event(mddev->sb_wait, 8648 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || 8649 is_md_suspended(mddev)); 8650 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8651 percpu_ref_put(&mddev->writes_pending); 8652 return false; 8653 } 8654 return true; 8655 } 8656 EXPORT_SYMBOL(md_write_start); 8657 8658 /* md_write_inc can only be called when md_write_start() has 8659 * already been called at least once of the current request. 8660 * It increments the counter and is useful when a single request 8661 * is split into several parts. Each part causes an increment and 8662 * so needs a matching md_write_end(). 8663 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8664 * a spinlocked region. 
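 *
 * A purely illustrative pairing, not lifted from any particular
 * personality (error handling omitted):
 *
 *	if (!md_write_start(mddev, bio))
 *		return false;		the array is suspended, give up
 *	md_write_inc(mddev, bio);	once for each extra split of the bio
 *	...
 *	md_write_end(mddev);		once for every start/inc above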
8665 */ 8666 void md_write_inc(struct mddev *mddev, struct bio *bi) 8667 { 8668 if (bio_data_dir(bi) != WRITE) 8669 return; 8670 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8671 percpu_ref_get(&mddev->writes_pending); 8672 } 8673 EXPORT_SYMBOL(md_write_inc); 8674 8675 void md_write_end(struct mddev *mddev) 8676 { 8677 percpu_ref_put(&mddev->writes_pending); 8678 8679 if (mddev->safemode == 2) 8680 md_wakeup_thread(mddev->thread); 8681 else if (mddev->safemode_delay) 8682 /* The roundup() ensures this only performs locking once 8683 * every ->safemode_delay jiffies 8684 */ 8685 mod_timer(&mddev->safemode_timer, 8686 roundup(jiffies, mddev->safemode_delay) + 8687 mddev->safemode_delay); 8688 } 8689 8690 EXPORT_SYMBOL(md_write_end); 8691 8692 /* This is used by raid0 and raid10 */ 8693 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8694 struct bio *bio, sector_t start, sector_t size) 8695 { 8696 struct bio *discard_bio = NULL; 8697 8698 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8699 &discard_bio) || !discard_bio) 8700 return; 8701 8702 bio_chain(discard_bio, bio); 8703 bio_clone_blkg_association(discard_bio, bio); 8704 if (mddev->gendisk) 8705 trace_block_bio_remap(discard_bio, 8706 disk_devt(mddev->gendisk), 8707 bio->bi_iter.bi_sector); 8708 submit_bio_noacct(discard_bio); 8709 } 8710 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8711 8712 static void md_end_clone_io(struct bio *bio) 8713 { 8714 struct md_io_clone *md_io_clone = bio->bi_private; 8715 struct bio *orig_bio = md_io_clone->orig_bio; 8716 struct mddev *mddev = md_io_clone->mddev; 8717 8718 if (bio->bi_status && !orig_bio->bi_status) 8719 orig_bio->bi_status = bio->bi_status; 8720 8721 if (md_io_clone->start_time) 8722 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8723 8724 bio_put(bio); 8725 bio_endio(orig_bio); 8726 percpu_ref_put(&mddev->active_io); 8727 } 8728 8729 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8730 { 8731 struct block_device *bdev = (*bio)->bi_bdev; 8732 struct md_io_clone *md_io_clone; 8733 struct bio *clone = 8734 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8735 8736 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8737 md_io_clone->orig_bio = *bio; 8738 md_io_clone->mddev = mddev; 8739 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8740 md_io_clone->start_time = bio_start_io_acct(*bio); 8741 8742 clone->bi_end_io = md_end_clone_io; 8743 clone->bi_private = md_io_clone; 8744 *bio = clone; 8745 } 8746 8747 void md_account_bio(struct mddev *mddev, struct bio **bio) 8748 { 8749 percpu_ref_get(&mddev->active_io); 8750 md_clone_bio(mddev, bio); 8751 } 8752 EXPORT_SYMBOL_GPL(md_account_bio); 8753 8754 /* md_allow_write(mddev) 8755 * Calling this ensures that the array is marked 'active' so that writes 8756 * may proceed without blocking. It is important to call this before 8757 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8758 * Must be called with mddev_lock held. 
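 *
 * Minimal usage sketch (illustrative only; "new_conf" and "size" are
 * made-up names):
 *
 *	md_allow_write(mddev);
 *	new_conf = kzalloc(size, GFP_KERNEL);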
8759 */ 8760 void md_allow_write(struct mddev *mddev) 8761 { 8762 if (!mddev->pers) 8763 return; 8764 if (!md_is_rdwr(mddev)) 8765 return; 8766 if (!mddev->pers->sync_request) 8767 return; 8768 8769 spin_lock(&mddev->lock); 8770 if (mddev->in_sync) { 8771 mddev->in_sync = 0; 8772 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8773 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8774 if (mddev->safemode_delay && 8775 mddev->safemode == 0) 8776 mddev->safemode = 1; 8777 spin_unlock(&mddev->lock); 8778 md_update_sb(mddev, 0); 8779 sysfs_notify_dirent_safe(mddev->sysfs_state); 8780 /* wait for the dirty state to be recorded in the metadata */ 8781 wait_event(mddev->sb_wait, 8782 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8783 } else 8784 spin_unlock(&mddev->lock); 8785 } 8786 EXPORT_SYMBOL_GPL(md_allow_write); 8787 8788 #define SYNC_MARKS 10 8789 #define SYNC_MARK_STEP (3*HZ) 8790 #define UPDATE_FREQUENCY (5*60*HZ) 8791 void md_do_sync(struct md_thread *thread) 8792 { 8793 struct mddev *mddev = thread->mddev; 8794 struct mddev *mddev2; 8795 unsigned int currspeed = 0, window; 8796 sector_t max_sectors,j, io_sectors, recovery_done; 8797 unsigned long mark[SYNC_MARKS]; 8798 unsigned long update_time; 8799 sector_t mark_cnt[SYNC_MARKS]; 8800 int last_mark,m; 8801 sector_t last_check; 8802 int skipped = 0; 8803 struct md_rdev *rdev; 8804 char *desc, *action = NULL; 8805 struct blk_plug plug; 8806 int ret; 8807 8808 /* just incase thread restarts... */ 8809 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8810 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8811 return; 8812 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8813 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8814 return; 8815 } 8816 8817 if (mddev_is_clustered(mddev)) { 8818 ret = md_cluster_ops->resync_start(mddev); 8819 if (ret) 8820 goto skip; 8821 8822 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8823 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8824 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8825 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8826 && ((unsigned long long)mddev->curr_resync_completed 8827 < (unsigned long long)mddev->resync_max_sectors)) 8828 goto skip; 8829 } 8830 8831 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8832 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8833 desc = "data-check"; 8834 action = "check"; 8835 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8836 desc = "requested-resync"; 8837 action = "repair"; 8838 } else 8839 desc = "resync"; 8840 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8841 desc = "reshape"; 8842 else 8843 desc = "recovery"; 8844 8845 mddev->last_sync_action = action ?: desc; 8846 8847 /* 8848 * Before starting a resync we must have set curr_resync to 8849 * 2, and then checked that every "conflicting" array has curr_resync 8850 * less than ours. When we find one that is the same or higher 8851 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8852 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8853 * This will mean we have to start checking from the beginning again. 
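 * (In the code below MD_RESYNC_DELAYED plays the part of the '2' and
 * MD_RESYNC_YIELDED the part of the '1' referred to above.)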
8854 * 8855 */ 8856 8857 do { 8858 int mddev2_minor = -1; 8859 mddev->curr_resync = MD_RESYNC_DELAYED; 8860 8861 try_again: 8862 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8863 goto skip; 8864 spin_lock(&all_mddevs_lock); 8865 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8866 if (test_bit(MD_DELETED, &mddev2->flags)) 8867 continue; 8868 if (mddev2 == mddev) 8869 continue; 8870 if (!mddev->parallel_resync 8871 && mddev2->curr_resync 8872 && match_mddev_units(mddev, mddev2)) { 8873 DEFINE_WAIT(wq); 8874 if (mddev < mddev2 && 8875 mddev->curr_resync == MD_RESYNC_DELAYED) { 8876 /* arbitrarily yield */ 8877 mddev->curr_resync = MD_RESYNC_YIELDED; 8878 wake_up(&resync_wait); 8879 } 8880 if (mddev > mddev2 && 8881 mddev->curr_resync == MD_RESYNC_YIELDED) 8882 /* no need to wait here, we can wait the next 8883 * time 'round when curr_resync == 2 8884 */ 8885 continue; 8886 /* We need to wait 'interruptible' so as not to 8887 * contribute to the load average, and not to 8888 * be caught by 'softlockup' 8889 */ 8890 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8891 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8892 mddev2->curr_resync >= mddev->curr_resync) { 8893 if (mddev2_minor != mddev2->md_minor) { 8894 mddev2_minor = mddev2->md_minor; 8895 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8896 desc, mdname(mddev), 8897 mdname(mddev2)); 8898 } 8899 spin_unlock(&all_mddevs_lock); 8900 8901 if (signal_pending(current)) 8902 flush_signals(current); 8903 schedule(); 8904 finish_wait(&resync_wait, &wq); 8905 goto try_again; 8906 } 8907 finish_wait(&resync_wait, &wq); 8908 } 8909 } 8910 spin_unlock(&all_mddevs_lock); 8911 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8912 8913 j = 0; 8914 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8915 /* resync follows the size requested by the personality, 8916 * which defaults to physical size, but can be virtual size 8917 */ 8918 max_sectors = mddev->resync_max_sectors; 8919 atomic64_set(&mddev->resync_mismatches, 0); 8920 /* we don't use the checkpoint if there's a bitmap */ 8921 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8922 j = mddev->resync_min; 8923 else if (!mddev->bitmap) 8924 j = mddev->recovery_cp; 8925 8926 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8927 max_sectors = mddev->resync_max_sectors; 8928 /* 8929 * If the original node aborts reshaping then we continue the 8930 * reshaping, so set j again to avoid restart reshape from the 8931 * first beginning 8932 */ 8933 if (mddev_is_clustered(mddev) && 8934 mddev->reshape_position != MaxSector) 8935 j = mddev->reshape_position; 8936 } else { 8937 /* recovery follows the physical size of devices */ 8938 max_sectors = mddev->dev_sectors; 8939 j = MaxSector; 8940 rcu_read_lock(); 8941 rdev_for_each_rcu(rdev, mddev) 8942 if (rdev->raid_disk >= 0 && 8943 !test_bit(Journal, &rdev->flags) && 8944 !test_bit(Faulty, &rdev->flags) && 8945 !test_bit(In_sync, &rdev->flags) && 8946 rdev->recovery_offset < j) 8947 j = rdev->recovery_offset; 8948 rcu_read_unlock(); 8949 8950 /* If there is a bitmap, we need to make sure all 8951 * writes that started before we added a spare 8952 * complete before we start doing a recovery. 8953 * Otherwise the write might complete and (via 8954 * bitmap_endwrite) set a bit in the bitmap after the 8955 * recovery has checked that bit and skipped that 8956 * region. 
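 * The back-to-back quiesce(1)/quiesce(0) calls below act as a barrier
 * that waits for any such in-flight writes to drain before recovery
 * starts.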
8957 */ 8958 if (mddev->bitmap) { 8959 mddev->pers->quiesce(mddev, 1); 8960 mddev->pers->quiesce(mddev, 0); 8961 } 8962 } 8963 8964 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8965 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8966 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8967 speed_max(mddev), desc); 8968 8969 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8970 8971 io_sectors = 0; 8972 for (m = 0; m < SYNC_MARKS; m++) { 8973 mark[m] = jiffies; 8974 mark_cnt[m] = io_sectors; 8975 } 8976 last_mark = 0; 8977 mddev->resync_mark = mark[last_mark]; 8978 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8979 8980 /* 8981 * Tune reconstruction: 8982 */ 8983 window = 32 * (PAGE_SIZE / 512); 8984 pr_debug("md: using %dk window, over a total of %lluk.\n", 8985 window/2, (unsigned long long)max_sectors/2); 8986 8987 atomic_set(&mddev->recovery_active, 0); 8988 last_check = 0; 8989 8990 if (j >= MD_RESYNC_ACTIVE) { 8991 pr_debug("md: resuming %s of %s from checkpoint.\n", 8992 desc, mdname(mddev)); 8993 mddev->curr_resync = j; 8994 } else 8995 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8996 mddev->curr_resync_completed = j; 8997 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8998 md_new_event(); 8999 update_time = jiffies; 9000 9001 blk_start_plug(&plug); 9002 while (j < max_sectors) { 9003 sector_t sectors; 9004 9005 skipped = 0; 9006 9007 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9008 ((mddev->curr_resync > mddev->curr_resync_completed && 9009 (mddev->curr_resync - mddev->curr_resync_completed) 9010 > (max_sectors >> 4)) || 9011 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9012 (j - mddev->curr_resync_completed)*2 9013 >= mddev->resync_max - mddev->curr_resync_completed || 9014 mddev->curr_resync_completed > mddev->resync_max 9015 )) { 9016 /* time to update curr_resync_completed */ 9017 wait_event(mddev->recovery_wait, 9018 atomic_read(&mddev->recovery_active) == 0); 9019 mddev->curr_resync_completed = j; 9020 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9021 j > mddev->recovery_cp) 9022 mddev->recovery_cp = j; 9023 update_time = jiffies; 9024 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9025 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9026 } 9027 9028 while (j >= mddev->resync_max && 9029 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9030 /* As this condition is controlled by user-space, 9031 * we can block indefinitely, so use '_interruptible' 9032 * to avoid triggering warnings. 9033 */ 9034 flush_signals(current); /* just in case */ 9035 wait_event_interruptible(mddev->recovery_wait, 9036 mddev->resync_max > j 9037 || test_bit(MD_RECOVERY_INTR, 9038 &mddev->recovery)); 9039 } 9040 9041 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9042 break; 9043 9044 sectors = mddev->pers->sync_request(mddev, j, &skipped); 9045 if (sectors == 0) { 9046 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9047 break; 9048 } 9049 9050 if (!skipped) { /* actual IO requested */ 9051 io_sectors += sectors; 9052 atomic_add(sectors, &mddev->recovery_active); 9053 } 9054 9055 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9056 break; 9057 9058 j += sectors; 9059 if (j > max_sectors) 9060 /* when skipping, extra large numbers can be returned. 
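 * (sync_request() may report having skipped far beyond the sector it was
 * asked for, e.g. a whole already-in-sync region at once, which can push
 * j past max_sectors)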
*/ 9061 j = max_sectors; 9062 if (j >= MD_RESYNC_ACTIVE) 9063 mddev->curr_resync = j; 9064 mddev->curr_mark_cnt = io_sectors; 9065 if (last_check == 0) 9066 /* this is the earliest that rebuild will be 9067 * visible in /proc/mdstat 9068 */ 9069 md_new_event(); 9070 9071 if (last_check + window > io_sectors || j == max_sectors) 9072 continue; 9073 9074 last_check = io_sectors; 9075 repeat: 9076 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9077 /* step marks */ 9078 int next = (last_mark+1) % SYNC_MARKS; 9079 9080 mddev->resync_mark = mark[next]; 9081 mddev->resync_mark_cnt = mark_cnt[next]; 9082 mark[next] = jiffies; 9083 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9084 last_mark = next; 9085 } 9086 9087 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9088 break; 9089 9090 /* 9091 * this loop exits only if either when we are slower than 9092 * the 'hard' speed limit, or the system was IO-idle for 9093 * a jiffy. 9094 * the system might be non-idle CPU-wise, but we only care 9095 * about not overloading the IO subsystem. (things like an 9096 * e2fsck being done on the RAID array should execute fast) 9097 */ 9098 cond_resched(); 9099 9100 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9101 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9102 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9103 9104 if (currspeed > speed_min(mddev)) { 9105 if (currspeed > speed_max(mddev)) { 9106 msleep(500); 9107 goto repeat; 9108 } 9109 if (!is_mddev_idle(mddev, 0)) { 9110 /* 9111 * Give other IO more of a chance. 9112 * The faster the devices, the less we wait. 9113 */ 9114 wait_event(mddev->recovery_wait, 9115 !atomic_read(&mddev->recovery_active)); 9116 } 9117 } 9118 } 9119 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9120 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9121 ? 
"interrupted" : "done"); 9122 /* 9123 * this also signals 'finished resyncing' to md_stop 9124 */ 9125 blk_finish_plug(&plug); 9126 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9127 9128 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9129 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9130 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9131 mddev->curr_resync_completed = mddev->curr_resync; 9132 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9133 } 9134 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9135 9136 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9137 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9138 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9139 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9140 if (mddev->curr_resync >= mddev->recovery_cp) { 9141 pr_debug("md: checkpointing %s of %s.\n", 9142 desc, mdname(mddev)); 9143 if (test_bit(MD_RECOVERY_ERROR, 9144 &mddev->recovery)) 9145 mddev->recovery_cp = 9146 mddev->curr_resync_completed; 9147 else 9148 mddev->recovery_cp = 9149 mddev->curr_resync; 9150 } 9151 } else 9152 mddev->recovery_cp = MaxSector; 9153 } else { 9154 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9155 mddev->curr_resync = MaxSector; 9156 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9157 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9158 rcu_read_lock(); 9159 rdev_for_each_rcu(rdev, mddev) 9160 if (rdev->raid_disk >= 0 && 9161 mddev->delta_disks >= 0 && 9162 !test_bit(Journal, &rdev->flags) && 9163 !test_bit(Faulty, &rdev->flags) && 9164 !test_bit(In_sync, &rdev->flags) && 9165 rdev->recovery_offset < mddev->curr_resync) 9166 rdev->recovery_offset = mddev->curr_resync; 9167 rcu_read_unlock(); 9168 } 9169 } 9170 } 9171 skip: 9172 /* set CHANGE_PENDING here since maybe another update is needed, 9173 * so other nodes are informed. It should be harmless for normal 9174 * raid */ 9175 set_mask_bits(&mddev->sb_flags, 0, 9176 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9177 9178 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9179 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9180 mddev->delta_disks > 0 && 9181 mddev->pers->finish_reshape && 9182 mddev->pers->size && 9183 mddev->queue) { 9184 mddev_lock_nointr(mddev); 9185 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9186 mddev_unlock(mddev); 9187 if (!mddev_is_clustered(mddev)) 9188 set_capacity_and_notify(mddev->gendisk, 9189 mddev->array_sectors); 9190 } 9191 9192 spin_lock(&mddev->lock); 9193 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9194 /* We completed so min/max setting can be forgotten if used. 
*/ 9195 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9196 mddev->resync_min = 0; 9197 mddev->resync_max = MaxSector; 9198 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9199 mddev->resync_min = mddev->curr_resync_completed; 9200 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9201 mddev->curr_resync = MD_RESYNC_NONE; 9202 spin_unlock(&mddev->lock); 9203 9204 wake_up(&resync_wait); 9205 wake_up(&mddev->sb_wait); 9206 md_wakeup_thread(mddev->thread); 9207 return; 9208 } 9209 EXPORT_SYMBOL_GPL(md_do_sync); 9210 9211 static int remove_and_add_spares(struct mddev *mddev, 9212 struct md_rdev *this) 9213 { 9214 struct md_rdev *rdev; 9215 int spares = 0; 9216 int removed = 0; 9217 bool remove_some = false; 9218 9219 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9220 /* Mustn't remove devices when resync thread is running */ 9221 return 0; 9222 9223 rdev_for_each(rdev, mddev) { 9224 if ((this == NULL || rdev == this) && 9225 rdev->raid_disk >= 0 && 9226 !test_bit(Blocked, &rdev->flags) && 9227 test_bit(Faulty, &rdev->flags) && 9228 atomic_read(&rdev->nr_pending)==0) { 9229 /* Faulty non-Blocked devices with nr_pending == 0 9230 * never get nr_pending incremented, 9231 * never get Faulty cleared, and never get Blocked set. 9232 * So we can synchronize_rcu now rather than once per device 9233 */ 9234 remove_some = true; 9235 set_bit(RemoveSynchronized, &rdev->flags); 9236 } 9237 } 9238 9239 if (remove_some) 9240 synchronize_rcu(); 9241 rdev_for_each(rdev, mddev) { 9242 if ((this == NULL || rdev == this) && 9243 rdev->raid_disk >= 0 && 9244 !test_bit(Blocked, &rdev->flags) && 9245 ((test_bit(RemoveSynchronized, &rdev->flags) || 9246 (!test_bit(In_sync, &rdev->flags) && 9247 !test_bit(Journal, &rdev->flags))) && 9248 atomic_read(&rdev->nr_pending)==0)) { 9249 if (mddev->pers->hot_remove_disk( 9250 mddev, rdev) == 0) { 9251 sysfs_unlink_rdev(mddev, rdev); 9252 rdev->saved_raid_disk = rdev->raid_disk; 9253 rdev->raid_disk = -1; 9254 removed++; 9255 } 9256 } 9257 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 9258 clear_bit(RemoveSynchronized, &rdev->flags); 9259 } 9260 9261 if (removed && mddev->kobj.sd) 9262 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9263 9264 if (this && removed) 9265 goto no_add; 9266 9267 rdev_for_each(rdev, mddev) { 9268 if (this && this != rdev) 9269 continue; 9270 if (test_bit(Candidate, &rdev->flags)) 9271 continue; 9272 if (rdev->raid_disk >= 0 && 9273 !test_bit(In_sync, &rdev->flags) && 9274 !test_bit(Journal, &rdev->flags) && 9275 !test_bit(Faulty, &rdev->flags)) 9276 spares++; 9277 if (rdev->raid_disk >= 0) 9278 continue; 9279 if (test_bit(Faulty, &rdev->flags)) 9280 continue; 9281 if (!test_bit(Journal, &rdev->flags)) { 9282 if (!md_is_rdwr(mddev) && 9283 !(rdev->saved_raid_disk >= 0 && 9284 !test_bit(Bitmap_sync, &rdev->flags))) 9285 continue; 9286 9287 rdev->recovery_offset = 0; 9288 } 9289 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9290 /* failure here is OK */ 9291 sysfs_link_rdev(mddev, rdev); 9292 if (!test_bit(Journal, &rdev->flags)) 9293 spares++; 9294 md_new_event(); 9295 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9296 } 9297 } 9298 no_add: 9299 if (removed) 9300 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9301 return spares; 9302 } 9303 9304 static void md_start_sync(struct work_struct *ws) 9305 { 9306 struct mddev *mddev = container_of(ws, struct mddev, del_work); 9307 9308 rcu_assign_pointer(mddev->sync_thread, 9309 md_register_thread(md_do_sync, mddev, "resync")); 9310 if 
(!mddev->sync_thread) { 9311 pr_warn("%s: could not start resync thread...\n", 9312 mdname(mddev)); 9313 /* leave the spares where they are, it shouldn't hurt */ 9314 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9315 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9316 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9317 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9318 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9319 wake_up(&resync_wait); 9320 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 9321 &mddev->recovery)) 9322 if (mddev->sysfs_action) 9323 sysfs_notify_dirent_safe(mddev->sysfs_action); 9324 } else 9325 md_wakeup_thread(mddev->sync_thread); 9326 sysfs_notify_dirent_safe(mddev->sysfs_action); 9327 md_new_event(); 9328 } 9329 9330 /* 9331 * This routine is regularly called by all per-raid-array threads to 9332 * deal with generic issues like resync and super-block update. 9333 * Raid personalities that don't have a thread (linear/raid0) do not 9334 * need this as they never do any recovery or update the superblock. 9335 * 9336 * It does not do any resync itself, but rather "forks" off other threads 9337 * to do that as needed. 9338 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9339 * "->recovery" and create a thread at ->sync_thread. 9340 * When the thread finishes it sets MD_RECOVERY_DONE 9341 * and wakeups up this thread which will reap the thread and finish up. 9342 * This thread also removes any faulty devices (with nr_pending == 0). 9343 * 9344 * The overall approach is: 9345 * 1/ if the superblock needs updating, update it. 9346 * 2/ If a recovery thread is running, don't do anything else. 9347 * 3/ If recovery has finished, clean up, possibly marking spares active. 9348 * 4/ If there are any faulty devices, remove them. 9349 * 5/ If array is degraded, try to add spares devices 9350 * 6/ If array has spares or is not in-sync, start a resync thread. 9351 */ 9352 void md_check_recovery(struct mddev *mddev) 9353 { 9354 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { 9355 /* Write superblock - thread that called mddev_suspend() 9356 * holds reconfig_mutex for us. 9357 */ 9358 set_bit(MD_UPDATING_SB, &mddev->flags); 9359 smp_mb__after_atomic(); 9360 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) 9361 md_update_sb(mddev, 0); 9362 clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); 9363 wake_up(&mddev->sb_wait); 9364 } 9365 9366 if (is_md_suspended(mddev)) 9367 return; 9368 9369 if (mddev->bitmap) 9370 md_bitmap_daemon_work(mddev); 9371 9372 if (signal_pending(current)) { 9373 if (mddev->pers->sync_request && !mddev->external) { 9374 pr_debug("md: %s in immediate safe mode\n", 9375 mdname(mddev)); 9376 mddev->safemode = 2; 9377 } 9378 flush_signals(current); 9379 } 9380 9381 if (!md_is_rdwr(mddev) && 9382 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 9383 return; 9384 if ( ! 
( 9385 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9386 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9387 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9388 (mddev->external == 0 && mddev->safemode == 1) || 9389 (mddev->safemode == 2 9390 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9391 )) 9392 return; 9393 9394 if (mddev_trylock(mddev)) { 9395 int spares = 0; 9396 bool try_set_sync = mddev->safemode != 0; 9397 9398 if (!mddev->external && mddev->safemode == 1) 9399 mddev->safemode = 0; 9400 9401 if (!md_is_rdwr(mddev)) { 9402 struct md_rdev *rdev; 9403 if (!mddev->external && mddev->in_sync) 9404 /* 'Blocked' flag not needed as failed devices 9405 * will be recorded if array switched to read/write. 9406 * Leaving it set will prevent the device 9407 * from being removed. 9408 */ 9409 rdev_for_each(rdev, mddev) 9410 clear_bit(Blocked, &rdev->flags); 9411 /* On a read-only array we can: 9412 * - remove failed devices 9413 * - add already-in_sync devices if the array itself 9414 * is in-sync. 9415 * As we only add devices that are already in-sync, 9416 * we can activate the spares immediately. 9417 */ 9418 remove_and_add_spares(mddev, NULL); 9419 /* There is no thread, but we need to call 9420 * ->spare_active and clear saved_raid_disk 9421 */ 9422 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9423 md_reap_sync_thread(mddev); 9424 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9425 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9426 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9427 goto unlock; 9428 } 9429 9430 if (mddev_is_clustered(mddev)) { 9431 struct md_rdev *rdev, *tmp; 9432 /* kick the device if another node issued a 9433 * remove disk. 9434 */ 9435 rdev_for_each_safe(rdev, tmp, mddev) { 9436 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9437 rdev->raid_disk < 0) 9438 md_kick_rdev_from_array(rdev); 9439 } 9440 } 9441 9442 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9443 spin_lock(&mddev->lock); 9444 set_in_sync(mddev); 9445 spin_unlock(&mddev->lock); 9446 } 9447 9448 if (mddev->sb_flags) 9449 md_update_sb(mddev, 0); 9450 9451 /* 9452 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9453 * still set. 9454 */ 9455 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9456 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9457 /* resync/recovery still happening */ 9458 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9459 goto unlock; 9460 } 9461 9462 if (WARN_ON_ONCE(!mddev->sync_thread)) 9463 goto unlock; 9464 9465 md_reap_sync_thread(mddev); 9466 goto unlock; 9467 } 9468 9469 /* Set RUNNING before clearing NEEDED to avoid 9470 * any transients in the value of "sync_action". 9471 */ 9472 mddev->curr_resync_completed = 0; 9473 spin_lock(&mddev->lock); 9474 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9475 spin_unlock(&mddev->lock); 9476 /* Clear some bits that don't mean anything, but 9477 * might be left set 9478 */ 9479 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9480 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9481 9482 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9483 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 9484 goto not_running; 9485 /* no recovery is running. 9486 * remove any failed drives, then 9487 * add spares if possible. 9488 * Spares are also removed and re-added, to allow 9489 * the personality to fail the re-add. 
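 * remove_and_add_spares() below performs both the removal and the
 * re-add.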
9490 */ 9491 9492 if (mddev->reshape_position != MaxSector) { 9493 if (mddev->pers->check_reshape == NULL || 9494 mddev->pers->check_reshape(mddev) != 0) 9495 /* Cannot proceed */ 9496 goto not_running; 9497 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9498 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9499 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 9500 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9501 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9502 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9503 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9504 } else if (mddev->recovery_cp < MaxSector) { 9505 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9506 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9507 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9508 /* nothing to be done ... */ 9509 goto not_running; 9510 9511 if (mddev->pers->sync_request) { 9512 if (spares) { 9513 /* We are adding a device or devices to an array 9514 * which has the bitmap stored on all devices. 9515 * So make sure all bitmap pages get written 9516 */ 9517 md_bitmap_write_all(mddev->bitmap); 9518 } 9519 INIT_WORK(&mddev->del_work, md_start_sync); 9520 queue_work(md_misc_wq, &mddev->del_work); 9521 goto unlock; 9522 } 9523 not_running: 9524 if (!mddev->sync_thread) { 9525 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9526 wake_up(&resync_wait); 9527 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 9528 &mddev->recovery)) 9529 if (mddev->sysfs_action) 9530 sysfs_notify_dirent_safe(mddev->sysfs_action); 9531 } 9532 unlock: 9533 wake_up(&mddev->sb_wait); 9534 mddev_unlock(mddev); 9535 } 9536 } 9537 EXPORT_SYMBOL(md_check_recovery); 9538 9539 void md_reap_sync_thread(struct mddev *mddev) 9540 { 9541 struct md_rdev *rdev; 9542 sector_t old_dev_sectors = mddev->dev_sectors; 9543 bool is_reshaped = false; 9544 9545 /* resync has finished, collect result */ 9546 md_unregister_thread(mddev, &mddev->sync_thread); 9547 atomic_inc(&mddev->sync_seq); 9548 9549 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9550 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9551 mddev->degraded != mddev->raid_disks) { 9552 /* success...*/ 9553 /* activate any spares */ 9554 if (mddev->pers->spare_active(mddev)) { 9555 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9556 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9557 } 9558 } 9559 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9560 mddev->pers->finish_reshape) { 9561 mddev->pers->finish_reshape(mddev); 9562 if (mddev_is_clustered(mddev)) 9563 is_reshaped = true; 9564 } 9565 9566 /* If array is no-longer degraded, then any saved_raid_disk 9567 * information must be scrapped. 
9568 */ 9569 if (!mddev->degraded) 9570 rdev_for_each(rdev, mddev) 9571 rdev->saved_raid_disk = -1; 9572 9573 md_update_sb(mddev, 1); 9574 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9575 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9576 * clustered raid */ 9577 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9578 md_cluster_ops->resync_finish(mddev); 9579 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9580 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9581 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9582 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9583 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9584 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9585 /* 9586 * We call md_cluster_ops->update_size here because sync_size could 9587 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9588 * so it is time to update size across cluster. 9589 */ 9590 if (mddev_is_clustered(mddev) && is_reshaped 9591 && !test_bit(MD_CLOSING, &mddev->flags)) 9592 md_cluster_ops->update_size(mddev, old_dev_sectors); 9593 /* flag recovery needed just to double check */ 9594 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9595 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9596 sysfs_notify_dirent_safe(mddev->sysfs_action); 9597 md_new_event(); 9598 if (mddev->event_work.func) 9599 queue_work(md_misc_wq, &mddev->event_work); 9600 wake_up(&resync_wait); 9601 } 9602 EXPORT_SYMBOL(md_reap_sync_thread); 9603 9604 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9605 { 9606 sysfs_notify_dirent_safe(rdev->sysfs_state); 9607 wait_event_timeout(rdev->blocked_wait, 9608 !test_bit(Blocked, &rdev->flags) && 9609 !test_bit(BlockedBadBlocks, &rdev->flags), 9610 msecs_to_jiffies(5000)); 9611 rdev_dec_pending(rdev, mddev); 9612 } 9613 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9614 9615 void md_finish_reshape(struct mddev *mddev) 9616 { 9617 /* called be personality module when reshape completes. 
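 * (each rdev's data_offset is moved to its new_data_offset just below,
 * and rdev->sectors is adjusted to match)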
*/ 9618 struct md_rdev *rdev; 9619 9620 rdev_for_each(rdev, mddev) { 9621 if (rdev->data_offset > rdev->new_data_offset) 9622 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9623 else 9624 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9625 rdev->data_offset = rdev->new_data_offset; 9626 } 9627 } 9628 EXPORT_SYMBOL(md_finish_reshape); 9629 9630 /* Bad block management */ 9631 9632 /* Returns 1 on success, 0 on failure */ 9633 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9634 int is_new) 9635 { 9636 struct mddev *mddev = rdev->mddev; 9637 int rv; 9638 if (is_new) 9639 s += rdev->new_data_offset; 9640 else 9641 s += rdev->data_offset; 9642 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9643 if (rv == 0) { 9644 /* Make sure they get written out promptly */ 9645 if (test_bit(ExternalBbl, &rdev->flags)) 9646 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9647 sysfs_notify_dirent_safe(rdev->sysfs_state); 9648 set_mask_bits(&mddev->sb_flags, 0, 9649 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9650 md_wakeup_thread(rdev->mddev->thread); 9651 return 1; 9652 } else 9653 return 0; 9654 } 9655 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9656 9657 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9658 int is_new) 9659 { 9660 int rv; 9661 if (is_new) 9662 s += rdev->new_data_offset; 9663 else 9664 s += rdev->data_offset; 9665 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9666 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9667 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9668 return rv; 9669 } 9670 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9671 9672 static int md_notify_reboot(struct notifier_block *this, 9673 unsigned long code, void *x) 9674 { 9675 struct mddev *mddev, *n; 9676 int need_delay = 0; 9677 9678 spin_lock(&all_mddevs_lock); 9679 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9680 if (!mddev_get(mddev)) 9681 continue; 9682 spin_unlock(&all_mddevs_lock); 9683 if (mddev_trylock(mddev)) { 9684 if (mddev->pers) 9685 __md_stop_writes(mddev); 9686 if (mddev->persistent) 9687 mddev->safemode = 2; 9688 mddev_unlock(mddev); 9689 } 9690 need_delay = 1; 9691 mddev_put(mddev); 9692 spin_lock(&all_mddevs_lock); 9693 } 9694 spin_unlock(&all_mddevs_lock); 9695 9696 /* 9697 * certain more exotic SCSI devices are known to be 9698 * volatile wrt too early system reboots. While the 9699 * right place to handle this issue is the given 9700 * driver, we do want to have a safe RAID driver ... 
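 * so when any md device was found we simply pause for a second below
 * before letting the reboot continue.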
9701 */ 9702 if (need_delay) 9703 msleep(1000); 9704 9705 return NOTIFY_DONE; 9706 } 9707 9708 static struct notifier_block md_notifier = { 9709 .notifier_call = md_notify_reboot, 9710 .next = NULL, 9711 .priority = INT_MAX, /* before any real devices */ 9712 }; 9713 9714 static void md_geninit(void) 9715 { 9716 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9717 9718 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9719 } 9720 9721 static int __init md_init(void) 9722 { 9723 int ret = -ENOMEM; 9724 9725 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9726 if (!md_wq) 9727 goto err_wq; 9728 9729 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9730 if (!md_misc_wq) 9731 goto err_misc_wq; 9732 9733 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 9734 0); 9735 if (!md_bitmap_wq) 9736 goto err_bitmap_wq; 9737 9738 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9739 if (ret < 0) 9740 goto err_md; 9741 9742 ret = __register_blkdev(0, "mdp", md_probe); 9743 if (ret < 0) 9744 goto err_mdp; 9745 mdp_major = ret; 9746 9747 register_reboot_notifier(&md_notifier); 9748 raid_table_header = register_sysctl("dev/raid", raid_table); 9749 9750 md_geninit(); 9751 return 0; 9752 9753 err_mdp: 9754 unregister_blkdev(MD_MAJOR, "md"); 9755 err_md: 9756 destroy_workqueue(md_bitmap_wq); 9757 err_bitmap_wq: 9758 destroy_workqueue(md_misc_wq); 9759 err_misc_wq: 9760 destroy_workqueue(md_wq); 9761 err_wq: 9762 return ret; 9763 } 9764 9765 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9766 { 9767 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9768 struct md_rdev *rdev2, *tmp; 9769 int role, ret; 9770 9771 /* 9772 * If size is changed in another node then we need to 9773 * do resize as well. 9774 */ 9775 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9776 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9777 if (ret) 9778 pr_info("md-cluster: resize failed\n"); 9779 else 9780 md_bitmap_update_sb(mddev->bitmap); 9781 } 9782 9783 /* Check for change of roles in the active devices */ 9784 rdev_for_each_safe(rdev2, tmp, mddev) { 9785 if (test_bit(Faulty, &rdev2->flags)) 9786 continue; 9787 9788 /* Check if the roles changed */ 9789 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9790 9791 if (test_bit(Candidate, &rdev2->flags)) { 9792 if (role == MD_DISK_ROLE_FAULTY) { 9793 pr_info("md: Removing Candidate device %pg because add failed\n", 9794 rdev2->bdev); 9795 md_kick_rdev_from_array(rdev2); 9796 continue; 9797 } 9798 else 9799 clear_bit(Candidate, &rdev2->flags); 9800 } 9801 9802 if (role != rdev2->raid_disk) { 9803 /* 9804 * got activated except reshape is happening. 9805 */ 9806 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 9807 !(le32_to_cpu(sb->feature_map) & 9808 MD_FEATURE_RESHAPE_ACTIVE)) { 9809 rdev2->saved_raid_disk = role; 9810 ret = remove_and_add_spares(mddev, rdev2); 9811 pr_info("Activated spare: %pg\n", 9812 rdev2->bdev); 9813 /* wakeup mddev->thread here, so array could 9814 * perform resync with the new activated disk */ 9815 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9816 md_wakeup_thread(mddev->thread); 9817 } 9818 /* device faulty 9819 * We just want to do the minimum to mark the disk 9820 * as faulty. The recovery is performed by the 9821 * one who initiated the error. 
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node then we need to
	 * do the resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* wakeup mddev->thread here, so the array can
				 * perform resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* The device became faulty.
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks(), it is time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node, so we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}
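
/*
 * Note on the role values tested above: values below MD_DISK_ROLE_MAX are
 * real slot numbers, while the special values (MD_DISK_ROLE_SPARE,
 * MD_DISK_ROLE_FAULTY, MD_DISK_ROLE_JOURNAL) are sentinels taken from the
 * dev_roles[] table of the v1.x superblock; see
 * include/uapi/linux/raid/md_p.h for the exact encoding.
 */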
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Only pick up the new recovery_offset if the superblock actually
	 * carries one, i.e. MD_FEATURE_RECOVERY_OFFSET is set.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */
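
/*
 * How the autodetect path above is reached (illustrative summary): the MSDOS
 * partition parser calls md_autodetect_dev() for every partition of type
 * 0xfd ("Linux raid autodetect"), and the early boot code calls
 * md_autostart_arrays() once block devices have been probed, so the devices
 * queued on all_detected_devices are assembled before the root filesystem
 * is mounted.
 */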
static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the module while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
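
/*
 * Usage note (illustrative): this file builds into md_mod, so the parameters
 * registered above can be given at load time, e.g. "modprobe md_mod
 * start_ro=1", or adjusted later through /sys/module/md_mod/parameters/.
 * new_array is write-only and lets user space (typically mdadm) ask for an
 * array device to be pre-created by name.
 */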