// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
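 *
 * For example, with the default of 20, a device is only ejected once the
 * decayed error count exceeds 20; errors spaced an hour or more apart are
 * halved away and so never accumulate to the limit.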
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * An rdev needs serialization enabled if it meets both conditions:
 * 1. it is a multi-queue device flagged with WriteMostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resources for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device for which rdev_need_serial() returns true.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
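 *
 * The pool, sized for NR_SERIAL_INFOS entries, supplies the struct
 * serial_info records that are inserted into the per-rdev trees set up
 * by rdev_init_serial().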
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			      bool is_suspend)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!is_suspend)
		mddev_suspend(mddev);

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		goto abort;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						    sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}

abort:
	if (!is_suspend)
		mddev_resume(mddev);
}

/*
 * Free resources from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when the bitmap is destroyed while the policy is not enabled.
 * 3. when disabling the policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			       bool is_suspend)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		if (!is_suspend)
			mddev_suspend(mddev);
		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
		if (!is_suspend)
			mddev_resume(mddev);
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
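 * (A monitor such as mdadm typically selects on /proc/mdstat for an
 * exceptional condition and re-reads the file once it is signalled.)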
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing it to the underlying layer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
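 *
 * Calls nest: mddev->suspended is a counter, so each mddev_suspend() must
 * be paired with an mddev_resume(), and only the outermost pair actually
 * quiesces and restarts I/O.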
 */
void mddev_suspend(struct mddev *mddev)
{
	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
			lockdep_is_held(&mddev->reconfig_mutex));

	WARN_ON_ONCE(thread && current == thread->tsk);
	if (mddev->suspended++)
		return;
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	percpu_ref_kill(&mddev->active_io);

	if (mddev->pers && mddev->pers->prepare_suspend)
		mddev->pers->prepare_suspend(mddev);

	wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;

	/* entered the memalloc scope in mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pair is percpu_ref_get() from md_flush_request() */
		percpu_ref_put(&mddev->active_io);

		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			struct bio *bi;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_bioset(rdev->bdev, 0,
					      REQ_OP_WRITE | REQ_PREFLUSH,
					      GFP_NOIO, &mddev->bio_set);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	spin_lock_irq(&mddev->lock);
	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	spin_unlock_irq(&mddev->lock);
	wake_up(&mddev->sb_wait);
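
	/*
	 * The flush itself has completed; what remains is the data portion
	 * of the original bio, if it carried one.
	 */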
	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
 * being finished in another context. Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* flush requests wait until ongoing flush completes,
	 * hence coalescing all the pending requests.
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		/*
		 * Grab a reference to make sure mddev_suspend() will wait for
		 * this flush to be done.
		 *
		 * md_flush_request() is called under md_handle_request() and
		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
		 * won't pass, percpu_ref_tryget_live() can't be used because
		 * percpu_ref_kill() can be called by mddev_suspend()
		 * concurrently.
		 */
		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
		percpu_ref_get(&mddev->active_io);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		set_bit(MD_DELETED, &mddev->flags);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->sync_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = false;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;		/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);
	mddev_init(new);

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_free_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_free_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;
out_free_new:
	spin_unlock(&all_mddevs_lock);
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So keep sysfs_active set while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		const struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				if (mddev->sysfs_completed)
					sysfs_put(mddev->sysfs_completed);
				if (mddev->sysfs_degraded)
					sysfs_put(mddev->sysfs_degraded);
				mddev->sysfs_action = NULL;
				mddev->sysfs_completed = NULL;
				mddev->sysfs_degraded = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);

	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
		list_del_init(&rdev->same_set);
		kobject_del(&rdev->kobj);
		export_rdev(rdev, mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel) == 0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: %s gets error=%d\n", __func__,
		       blk_status_to_errno(bio->bi_status));
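		/*
		 * md_error() may decline to fail the last working device; if
		 * this was a FAILFAST write, flag the superblock for a
		 * rewrite without FAILFAST (see md_super_wait()).
		 */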
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
			       1,
			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
				   | REQ_PREFLUSH | REQ_FUA,
			       GFP_NOIO, &mddev->sync_set);

	atomic_inc(&rdev->nr_pending);

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		bio->bi_opf |= MD_FAILFAST;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes) == 0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct bio bio;
	struct bio_vec bvec;

	if (metadata_op && rdev->meta_bdev)
		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
	else
		bio_init(&bio, rdev->bdev, &bvec, 1, opf);

	if (metadata_op)
		bio.bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio.bi_iter.bi_sector = sector + rdev->data_offset;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %pg, could not read superblock.\n",
	       rdev->bdev);
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1), GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2), GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}
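
	/* Work on copies so the callers' superblocks are left untouched. */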
	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32 *)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *freshest,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
	    (rdev->desc_nr >= 0 &&
	     rdev->desc_nr < MD_SB_DISKS &&
	     sb->disks[rdev->desc_nr].state &
	     ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ...
		 */
		ret = -EINVAL;

abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 * note: we are not using "freshest" for 0.9 superblock
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12, &sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
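		 * "Too old" here means older than bitmap->events_cleared: the
		 * bitmap only records which regions changed after that point,
		 * so anything older cannot be brought back in sync from the
		 * bitmap alone.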
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12, 4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync) {
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i = 0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
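	 * ((2^33 - 2) sectors * 512 bytes = (2^32 - 1) KB, the largest value
	 * the 32-bit 'size' field (in KB) can hold.)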
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32 *)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16 *) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch (minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret)
		return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask =
		queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
	    (rdev->desc_nr >= 0 &&
	     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
	     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
	      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb
		= page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Similar to mdadm, we allow event counter difference of 1
		 * from the freshest device.
		 */
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 + 1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
			/*
			 * If we are assembling, and our event counter is smaller than the
			 * highest event counter, we cannot trust our superblock about the role.
			 * It could happen that our rdev was marked as Faulty, and all other
			 * superblocks were updated with +1 event counter.
			 * Then, before the next superblock update, which typically happens when
			 * remove_and_add_spares() removes the device from the array, there was
			 * a crash or reboot.
			 * If we allow current rdev without consulting the freshest superblock,
			 * we could cause data corruption.
			 * Note that in this case our event counter is smaller by 1 than the
			 * highest, otherwise, this rdev would not be allowed into array;
			 * both kernel and mdadm allow event counter difference of 1.
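			 * So take this slot's role from the freshest device's
			 * superblock instead of trusting our own copy.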
1917 */ 1918 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 1919 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 1920 1921 if (rdev->desc_nr >= freshest_max_dev) { 1922 /* this is unexpected, better not proceed */ 1923 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 1924 mdname(mddev), rdev->bdev, rdev->desc_nr, 1925 freshest->bdev, freshest_max_dev); 1926 return -EUCLEAN; 1927 } 1928 1929 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 1930 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 1931 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 1932 } else { 1933 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1934 } 1935 switch(role) { 1936 case MD_DISK_ROLE_SPARE: /* spare */ 1937 break; 1938 case MD_DISK_ROLE_FAULTY: /* faulty */ 1939 set_bit(Faulty, &rdev->flags); 1940 break; 1941 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1942 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1943 /* journal device without journal feature */ 1944 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1945 return -EINVAL; 1946 } 1947 set_bit(Journal, &rdev->flags); 1948 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1949 rdev->raid_disk = 0; 1950 break; 1951 default: 1952 rdev->saved_raid_disk = role; 1953 if ((le32_to_cpu(sb->feature_map) & 1954 MD_FEATURE_RECOVERY_OFFSET)) { 1955 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1956 if (!(le32_to_cpu(sb->feature_map) & 1957 MD_FEATURE_RECOVERY_BITMAP)) 1958 rdev->saved_raid_disk = -1; 1959 } else { 1960 /* 1961 * If the array is FROZEN, then the device can't 1962 * be in_sync with rest of array. 1963 */ 1964 if (!test_bit(MD_RECOVERY_FROZEN, 1965 &mddev->recovery)) 1966 set_bit(In_sync, &rdev->flags); 1967 } 1968 rdev->raid_disk = role; 1969 break; 1970 } 1971 if (sb->devflags & WriteMostly1) 1972 set_bit(WriteMostly, &rdev->flags); 1973 if (sb->devflags & FailFast1) 1974 set_bit(FailFast, &rdev->flags); 1975 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1976 set_bit(Replacement, &rdev->flags); 1977 } else /* MULTIPATH are always insync */ 1978 set_bit(In_sync, &rdev->flags); 1979 1980 return 0; 1981 } 1982 1983 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1984 { 1985 struct mdp_superblock_1 *sb; 1986 struct md_rdev *rdev2; 1987 int max_dev, i; 1988 /* make rdev->sb match mddev and rdev data. 
*/ 1989 1990 sb = page_address(rdev->sb_page); 1991 1992 sb->feature_map = 0; 1993 sb->pad0 = 0; 1994 sb->recovery_offset = cpu_to_le64(0); 1995 memset(sb->pad3, 0, sizeof(sb->pad3)); 1996 1997 sb->utime = cpu_to_le64((__u64)mddev->utime); 1998 sb->events = cpu_to_le64(mddev->events); 1999 if (mddev->in_sync) 2000 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2001 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2002 sb->resync_offset = cpu_to_le64(MaxSector); 2003 else 2004 sb->resync_offset = cpu_to_le64(0); 2005 2006 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2007 2008 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2009 sb->size = cpu_to_le64(mddev->dev_sectors); 2010 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2011 sb->level = cpu_to_le32(mddev->level); 2012 sb->layout = cpu_to_le32(mddev->layout); 2013 if (test_bit(FailFast, &rdev->flags)) 2014 sb->devflags |= FailFast1; 2015 else 2016 sb->devflags &= ~FailFast1; 2017 2018 if (test_bit(WriteMostly, &rdev->flags)) 2019 sb->devflags |= WriteMostly1; 2020 else 2021 sb->devflags &= ~WriteMostly1; 2022 sb->data_offset = cpu_to_le64(rdev->data_offset); 2023 sb->data_size = cpu_to_le64(rdev->sectors); 2024 2025 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2026 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2027 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2028 } 2029 2030 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2031 !test_bit(In_sync, &rdev->flags)) { 2032 sb->feature_map |= 2033 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2034 sb->recovery_offset = 2035 cpu_to_le64(rdev->recovery_offset); 2036 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2037 sb->feature_map |= 2038 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2039 } 2040 /* Note: recovery_offset and journal_tail share space */ 2041 if (test_bit(Journal, &rdev->flags)) 2042 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2043 if (test_bit(Replacement, &rdev->flags)) 2044 sb->feature_map |= 2045 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2046 2047 if (mddev->reshape_position != MaxSector) { 2048 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2049 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2050 sb->new_layout = cpu_to_le32(mddev->new_layout); 2051 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2052 sb->new_level = cpu_to_le32(mddev->new_level); 2053 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2054 if (mddev->delta_disks == 0 && 2055 mddev->reshape_backwards) 2056 sb->feature_map 2057 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2058 if (rdev->new_data_offset != rdev->data_offset) { 2059 sb->feature_map 2060 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2061 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2062 - rdev->data_offset)); 2063 } 2064 } 2065 2066 if (mddev_is_clustered(mddev)) 2067 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2068 2069 if (rdev->badblocks.count == 0) 2070 /* Nothing to do for bad blocks*/ ; 2071 else if (sb->bblog_offset == 0) 2072 /* Cannot record bad blocks on this device */ 2073 md_error(mddev, rdev); 2074 else { 2075 struct badblocks *bb = &rdev->badblocks; 2076 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2077 u64 *p = bb->page; 2078 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2079 if (bb->changed) { 2080 unsigned seq; 2081 2082 retry: 2083 seq = read_seqbegin(&bb->lock); 2084 2085 memset(bbp, 0xff, PAGE_SIZE); 2086 2087 for (i = 0 ; i < bb->count ; i++) { 2088 u64 
internal_bb = p[i]; 2089 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2090 | BB_LEN(internal_bb)); 2091 bbp[i] = cpu_to_le64(store_bb); 2092 } 2093 bb->changed = 0; 2094 if (read_seqretry(&bb->lock, seq)) 2095 goto retry; 2096 2097 bb->sector = (rdev->sb_start + 2098 (int)le32_to_cpu(sb->bblog_offset)); 2099 bb->size = le16_to_cpu(sb->bblog_size); 2100 } 2101 } 2102 2103 max_dev = 0; 2104 rdev_for_each(rdev2, mddev) 2105 if (rdev2->desc_nr+1 > max_dev) 2106 max_dev = rdev2->desc_nr+1; 2107 2108 if (max_dev > le32_to_cpu(sb->max_dev)) { 2109 int bmask; 2110 sb->max_dev = cpu_to_le32(max_dev); 2111 rdev->sb_size = max_dev * 2 + 256; 2112 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2113 if (rdev->sb_size & bmask) 2114 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2115 } else 2116 max_dev = le32_to_cpu(sb->max_dev); 2117 2118 for (i=0; i<max_dev;i++) 2119 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2120 2121 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2122 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2123 2124 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2125 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2126 sb->feature_map |= 2127 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2128 else 2129 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2130 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2131 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2132 } 2133 2134 rdev_for_each(rdev2, mddev) { 2135 i = rdev2->desc_nr; 2136 if (test_bit(Faulty, &rdev2->flags)) 2137 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2138 else if (test_bit(In_sync, &rdev2->flags)) 2139 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2140 else if (test_bit(Journal, &rdev2->flags)) 2141 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2142 else if (rdev2->raid_disk >= 0) 2143 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2144 else 2145 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2146 } 2147 2148 sb->sb_csum = calc_sb_1_csum(sb); 2149 } 2150 2151 static sector_t super_1_choose_bm_space(sector_t dev_size) 2152 { 2153 sector_t bm_space; 2154 2155 /* if the device is bigger than 8Gig, save 64k for bitmap 2156 * usage, if bigger than 200Gig, save 128k 2157 */ 2158 if (dev_size < 64*2) 2159 bm_space = 0; 2160 else if (dev_size - 64*2 >= 200*1024*1024*2) 2161 bm_space = 128*2; 2162 else if (dev_size - 4*2 > 8*1024*1024*2) 2163 bm_space = 64*2; 2164 else 2165 bm_space = 4*2; 2166 return bm_space; 2167 } 2168 2169 static unsigned long long 2170 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2171 { 2172 struct mdp_superblock_1 *sb; 2173 sector_t max_sectors; 2174 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2175 return 0; /* component must fit device */ 2176 if (rdev->data_offset != rdev->new_data_offset) 2177 return 0; /* too confusing */ 2178 if (rdev->sb_start < rdev->data_offset) { 2179 /* minor versions 1 and 2; superblock before data */ 2180 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2181 if (!num_sectors || num_sectors > max_sectors) 2182 num_sectors = max_sectors; 2183 } else if (rdev->mddev->bitmap_info.offset) { 2184 /* minor version 0 with bitmap we can't move */ 2185 return 0; 2186 } else { 2187 /* minor version 0; superblock after data */ 2188 sector_t sb_start, bm_space; 2189 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2190 2191 /* 8K is for superblock */ 2192 sb_start = dev_size - 8*2; 2193 sb_start &= ~(sector_t)(4*2 - 1); 2194 2195 bm_space = super_1_choose_bm_space(dev_size); 2196 2197 /* Space that 
can be used to store date needs to decrease 2198 * superblock bitmap space and bad block space(4K) 2199 */ 2200 max_sectors = sb_start - bm_space - 4*2; 2201 2202 if (!num_sectors || num_sectors > max_sectors) 2203 num_sectors = max_sectors; 2204 rdev->sb_start = sb_start; 2205 } 2206 sb = page_address(rdev->sb_page); 2207 sb->data_size = cpu_to_le64(num_sectors); 2208 sb->super_offset = cpu_to_le64(rdev->sb_start); 2209 sb->sb_csum = calc_sb_1_csum(sb); 2210 do { 2211 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2212 rdev->sb_page); 2213 } while (md_super_wait(rdev->mddev) < 0); 2214 return num_sectors; 2215 2216 } 2217 2218 static int 2219 super_1_allow_new_offset(struct md_rdev *rdev, 2220 unsigned long long new_offset) 2221 { 2222 /* All necessary checks on new >= old have been done */ 2223 struct bitmap *bitmap; 2224 if (new_offset >= rdev->data_offset) 2225 return 1; 2226 2227 /* with 1.0 metadata, there is no metadata to tread on 2228 * so we can always move back */ 2229 if (rdev->mddev->minor_version == 0) 2230 return 1; 2231 2232 /* otherwise we must be sure not to step on 2233 * any metadata, so stay: 2234 * 36K beyond start of superblock 2235 * beyond end of badblocks 2236 * beyond write-intent bitmap 2237 */ 2238 if (rdev->sb_start + (32+4)*2 > new_offset) 2239 return 0; 2240 bitmap = rdev->mddev->bitmap; 2241 if (bitmap && !rdev->mddev->bitmap_info.file && 2242 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2243 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2244 return 0; 2245 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2246 return 0; 2247 2248 return 1; 2249 } 2250 2251 static struct super_type super_types[] = { 2252 [0] = { 2253 .name = "0.90.0", 2254 .owner = THIS_MODULE, 2255 .load_super = super_90_load, 2256 .validate_super = super_90_validate, 2257 .sync_super = super_90_sync, 2258 .rdev_size_change = super_90_rdev_size_change, 2259 .allow_new_offset = super_90_allow_new_offset, 2260 }, 2261 [1] = { 2262 .name = "md-1", 2263 .owner = THIS_MODULE, 2264 .load_super = super_1_load, 2265 .validate_super = super_1_validate, 2266 .sync_super = super_1_sync, 2267 .rdev_size_change = super_1_rdev_size_change, 2268 .allow_new_offset = super_1_allow_new_offset, 2269 }, 2270 }; 2271 2272 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2273 { 2274 if (mddev->sync_super) { 2275 mddev->sync_super(mddev, rdev); 2276 return; 2277 } 2278 2279 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2280 2281 super_types[mddev->major_version].sync_super(mddev, rdev); 2282 } 2283 2284 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2285 { 2286 struct md_rdev *rdev, *rdev2; 2287 2288 rcu_read_lock(); 2289 rdev_for_each_rcu(rdev, mddev1) { 2290 if (test_bit(Faulty, &rdev->flags) || 2291 test_bit(Journal, &rdev->flags) || 2292 rdev->raid_disk == -1) 2293 continue; 2294 rdev_for_each_rcu(rdev2, mddev2) { 2295 if (test_bit(Faulty, &rdev2->flags) || 2296 test_bit(Journal, &rdev2->flags) || 2297 rdev2->raid_disk == -1) 2298 continue; 2299 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2300 rcu_read_unlock(); 2301 return 1; 2302 } 2303 } 2304 } 2305 rcu_read_unlock(); 2306 return 0; 2307 } 2308 2309 static LIST_HEAD(pending_raid_disks); 2310 2311 /* 2312 * Try to register data integrity profile for an mddev 2313 * 2314 * This is called when an array is started and after a disk has been kicked 2315 * from the array. 
It only succeeds if all working and active component devices 2316 * are integrity capable with matching profiles. 2317 */ 2318 int md_integrity_register(struct mddev *mddev) 2319 { 2320 struct md_rdev *rdev, *reference = NULL; 2321 2322 if (list_empty(&mddev->disks)) 2323 return 0; /* nothing to do */ 2324 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2325 return 0; /* shouldn't register, or already is */ 2326 rdev_for_each(rdev, mddev) { 2327 /* skip spares and non-functional disks */ 2328 if (test_bit(Faulty, &rdev->flags)) 2329 continue; 2330 if (rdev->raid_disk < 0) 2331 continue; 2332 if (!reference) { 2333 /* Use the first rdev as the reference */ 2334 reference = rdev; 2335 continue; 2336 } 2337 /* does this rdev's profile match the reference profile? */ 2338 if (blk_integrity_compare(reference->bdev->bd_disk, 2339 rdev->bdev->bd_disk) < 0) 2340 return -EINVAL; 2341 } 2342 if (!reference || !bdev_get_integrity(reference->bdev)) 2343 return 0; 2344 /* 2345 * All component devices are integrity capable and have matching 2346 * profiles, register the common profile for the md device. 2347 */ 2348 blk_integrity_register(mddev->gendisk, 2349 bdev_get_integrity(reference->bdev)); 2350 2351 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2352 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2353 (mddev->level != 1 && mddev->level != 10 && 2354 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2355 /* 2356 * No need to handle the failure of bioset_integrity_create, 2357 * because the function is called by md_run() -> pers->run(), 2358 * md_run calls bioset_exit -> bioset_integrity_free in case 2359 * of failure case. 2360 */ 2361 pr_err("md: failed to create integrity pool for %s\n", 2362 mdname(mddev)); 2363 return -EINVAL; 2364 } 2365 return 0; 2366 } 2367 EXPORT_SYMBOL(md_integrity_register); 2368 2369 /* 2370 * Attempt to add an rdev, but only if it is consistent with the current 2371 * integrity profile 2372 */ 2373 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2374 { 2375 struct blk_integrity *bi_mddev; 2376 2377 if (!mddev->gendisk) 2378 return 0; 2379 2380 bi_mddev = blk_get_integrity(mddev->gendisk); 2381 2382 if (!bi_mddev) /* nothing to do */ 2383 return 0; 2384 2385 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2386 pr_err("%s: incompatible integrity profile for %pg\n", 2387 mdname(mddev), rdev->bdev); 2388 return -ENXIO; 2389 } 2390 2391 return 0; 2392 } 2393 EXPORT_SYMBOL(md_integrity_add_rdev); 2394 2395 static bool rdev_read_only(struct md_rdev *rdev) 2396 { 2397 return bdev_read_only(rdev->bdev) || 2398 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2399 } 2400 2401 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2402 { 2403 char b[BDEVNAME_SIZE]; 2404 int err; 2405 2406 /* prevent duplicates */ 2407 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2408 return -EEXIST; 2409 2410 if (rdev_read_only(rdev) && mddev->pers) 2411 return -EROFS; 2412 2413 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2414 if (!test_bit(Journal, &rdev->flags) && 2415 rdev->sectors && 2416 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2417 if (mddev->pers) { 2418 /* Cannot change size, so fail 2419 * If mddev->level <= 0, then we don't care 2420 * about aligning sizes (e.g. linear) 2421 */ 2422 if (mddev->level > 0) 2423 return -ENOSPC; 2424 } else 2425 mddev->dev_sectors = rdev->sectors; 2426 } 2427 2428 /* Verify rdev->desc_nr is unique. 
2429 * If it is -1, assign a free number, else 2430 * check number is not in use 2431 */ 2432 rcu_read_lock(); 2433 if (rdev->desc_nr < 0) { 2434 int choice = 0; 2435 if (mddev->pers) 2436 choice = mddev->raid_disks; 2437 while (md_find_rdev_nr_rcu(mddev, choice)) 2438 choice++; 2439 rdev->desc_nr = choice; 2440 } else { 2441 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2442 rcu_read_unlock(); 2443 return -EBUSY; 2444 } 2445 } 2446 rcu_read_unlock(); 2447 if (!test_bit(Journal, &rdev->flags) && 2448 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2449 pr_warn("md: %s: array is limited to %d devices\n", 2450 mdname(mddev), mddev->max_disks); 2451 return -EBUSY; 2452 } 2453 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2454 strreplace(b, '/', '!'); 2455 2456 rdev->mddev = mddev; 2457 pr_debug("md: bind<%s>\n", b); 2458 2459 if (mddev->raid_disks) 2460 mddev_create_serial_pool(mddev, rdev, false); 2461 2462 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2463 goto fail; 2464 2465 /* failure here is OK */ 2466 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2467 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2468 rdev->sysfs_unack_badblocks = 2469 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2470 rdev->sysfs_badblocks = 2471 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2472 2473 list_add_rcu(&rdev->same_set, &mddev->disks); 2474 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2475 2476 /* May as well allow recovery to be retried once */ 2477 mddev->recovery_disabled++; 2478 2479 return 0; 2480 2481 fail: 2482 pr_warn("md: failed to register dev-%s for %s\n", 2483 b, mdname(mddev)); 2484 return err; 2485 } 2486 2487 void md_autodetect_dev(dev_t dev); 2488 2489 /* just for claiming the bdev */ 2490 static struct md_rdev claim_rdev; 2491 2492 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2493 { 2494 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2495 md_rdev_clear(rdev); 2496 #ifndef MODULE 2497 if (test_bit(AutoDetected, &rdev->flags)) 2498 md_autodetect_dev(rdev->bdev->bd_dev); 2499 #endif 2500 blkdev_put(rdev->bdev, 2501 test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); 2502 rdev->bdev = NULL; 2503 kobject_put(&rdev->kobj); 2504 } 2505 2506 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2507 { 2508 struct mddev *mddev = rdev->mddev; 2509 2510 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2511 list_del_rcu(&rdev->same_set); 2512 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2513 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2514 rdev->mddev = NULL; 2515 sysfs_remove_link(&rdev->kobj, "block"); 2516 sysfs_put(rdev->sysfs_state); 2517 sysfs_put(rdev->sysfs_unack_badblocks); 2518 sysfs_put(rdev->sysfs_badblocks); 2519 rdev->sysfs_state = NULL; 2520 rdev->sysfs_unack_badblocks = NULL; 2521 rdev->sysfs_badblocks = NULL; 2522 rdev->badblocks.count = 0; 2523 2524 synchronize_rcu(); 2525 2526 /* 2527 * kobject_del() will wait for all in progress writers to be done, where 2528 * reconfig_mutex is held, hence it can't be called under 2529 * reconfig_mutex and it's delayed to mddev_unlock(). 
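 * The rdev is therefore queued on mddev->deleting below and is only
 * released once the reconfig_mutex has been dropped.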
2530 */ 2531 list_add(&rdev->same_set, &mddev->deleting); 2532 } 2533 2534 static void export_array(struct mddev *mddev) 2535 { 2536 struct md_rdev *rdev; 2537 2538 while (!list_empty(&mddev->disks)) { 2539 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2540 same_set); 2541 md_kick_rdev_from_array(rdev); 2542 } 2543 mddev->raid_disks = 0; 2544 mddev->major_version = 0; 2545 } 2546 2547 static bool set_in_sync(struct mddev *mddev) 2548 { 2549 lockdep_assert_held(&mddev->lock); 2550 if (!mddev->in_sync) { 2551 mddev->sync_checkers++; 2552 spin_unlock(&mddev->lock); 2553 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2554 spin_lock(&mddev->lock); 2555 if (!mddev->in_sync && 2556 percpu_ref_is_zero(&mddev->writes_pending)) { 2557 mddev->in_sync = 1; 2558 /* 2559 * Ensure ->in_sync is visible before we clear 2560 * ->sync_checkers. 2561 */ 2562 smp_mb(); 2563 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2564 sysfs_notify_dirent_safe(mddev->sysfs_state); 2565 } 2566 if (--mddev->sync_checkers == 0) 2567 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2568 } 2569 if (mddev->safemode == 1) 2570 mddev->safemode = 0; 2571 return mddev->in_sync; 2572 } 2573 2574 static void sync_sbs(struct mddev *mddev, int nospares) 2575 { 2576 /* Update each superblock (in-memory image), but 2577 * if we are allowed to, skip spares which already 2578 * have the right event counter, or have one earlier 2579 * (which would mean they aren't being marked as dirty 2580 * with the rest of the array) 2581 */ 2582 struct md_rdev *rdev; 2583 rdev_for_each(rdev, mddev) { 2584 if (rdev->sb_events == mddev->events || 2585 (nospares && 2586 rdev->raid_disk < 0 && 2587 rdev->sb_events+1 == mddev->events)) { 2588 /* Don't update this superblock */ 2589 rdev->sb_loaded = 2; 2590 } else { 2591 sync_super(mddev, rdev); 2592 rdev->sb_loaded = 1; 2593 } 2594 } 2595 } 2596 2597 static bool does_sb_need_changing(struct mddev *mddev) 2598 { 2599 struct md_rdev *rdev = NULL, *iter; 2600 struct mdp_superblock_1 *sb; 2601 int role; 2602 2603 /* Find a good rdev */ 2604 rdev_for_each(iter, mddev) 2605 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2606 rdev = iter; 2607 break; 2608 } 2609 2610 /* No good device found. */ 2611 if (!rdev) 2612 return false; 2613 2614 sb = page_address(rdev->sb_page); 2615 /* Check if a device has become faulty or a spare become active */ 2616 rdev_for_each(rdev, mddev) { 2617 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2618 /* Device activated? */ 2619 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2620 !test_bit(Faulty, &rdev->flags)) 2621 return true; 2622 /* Device turned faulty? 
*/ 2623 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2624 return true; 2625 } 2626 2627 /* Check if any mddev parameters have changed */ 2628 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2629 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2630 (mddev->layout != le32_to_cpu(sb->layout)) || 2631 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2632 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2633 return true; 2634 2635 return false; 2636 } 2637 2638 void md_update_sb(struct mddev *mddev, int force_change) 2639 { 2640 struct md_rdev *rdev; 2641 int sync_req; 2642 int nospares = 0; 2643 int any_badblocks_changed = 0; 2644 int ret = -1; 2645 2646 if (!md_is_rdwr(mddev)) { 2647 if (force_change) 2648 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2649 return; 2650 } 2651 2652 repeat: 2653 if (mddev_is_clustered(mddev)) { 2654 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2655 force_change = 1; 2656 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2657 nospares = 1; 2658 ret = md_cluster_ops->metadata_update_start(mddev); 2659 /* Has someone else updated the sb? */ 2660 if (!does_sb_need_changing(mddev)) { 2661 if (ret == 0) 2662 md_cluster_ops->metadata_update_cancel(mddev); 2663 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2664 BIT(MD_SB_CHANGE_DEVS) | 2665 BIT(MD_SB_CHANGE_CLEAN)); 2666 return; 2667 } 2668 } 2669 2670 /* 2671 * First make sure individual recovery_offsets are correct. 2672 * curr_resync_completed can only be used during recovery. 2673 * During reshape/resync it might use array-addresses rather 2674 * than device addresses. 2675 */ 2676 rdev_for_each(rdev, mddev) { 2677 if (rdev->raid_disk >= 0 && 2678 mddev->delta_disks >= 0 && 2679 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2680 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2681 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2682 !test_bit(Journal, &rdev->flags) && 2683 !test_bit(In_sync, &rdev->flags) && 2684 mddev->curr_resync_completed > rdev->recovery_offset) 2685 rdev->recovery_offset = mddev->curr_resync_completed; 2686 2687 } 2688 if (!mddev->persistent) { 2689 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2690 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2691 if (!mddev->external) { 2692 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2693 rdev_for_each(rdev, mddev) { 2694 if (rdev->badblocks.changed) { 2695 rdev->badblocks.changed = 0; 2696 ack_all_badblocks(&rdev->badblocks); 2697 md_error(mddev, rdev); 2698 } 2699 clear_bit(Blocked, &rdev->flags); 2700 clear_bit(BlockedBadBlocks, &rdev->flags); 2701 wake_up(&rdev->blocked_wait); 2702 } 2703 } 2704 wake_up(&mddev->sb_wait); 2705 return; 2706 } 2707 2708 spin_lock(&mddev->lock); 2709 2710 mddev->utime = ktime_get_real_seconds(); 2711 2712 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2713 force_change = 1; 2714 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2715 /* just a clean <-> dirty transition, possibly leave spares alone, 2716 * though if events isn't the right even/odd, we will have to do 2717 * spares after all 2718 */ 2719 nospares = 1; 2720 if (force_change) 2721 nospares = 0; 2722 if (mddev->degraded) 2723 /* If the array is degraded, then skipping spares is both 2724 * dangerous and fairly pointless. 2725 * Dangerous because a device that was removed from the array 2726 * might have an event_count that still looks up-to-date, 2727 * so it can be re-added without a resync.
2728 * Pointless because if there are any spares to skip, 2729 * then a recovery will happen and soon that array won't 2730 * be degraded any more and the spare can go back to sleep then. 2731 */ 2732 nospares = 0; 2733 2734 sync_req = mddev->in_sync; 2735 2736 /* If this is just a dirty<->clean transition, and the array is clean 2737 * and 'events' is odd, we can roll back to the previous clean state */ 2738 if (nospares 2739 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2740 && mddev->can_decrease_events 2741 && mddev->events != 1) { 2742 mddev->events--; 2743 mddev->can_decrease_events = 0; 2744 } else { 2745 /* otherwise we have to go forward and ... */ 2746 mddev->events ++; 2747 mddev->can_decrease_events = nospares; 2748 } 2749 2750 /* 2751 * This 64-bit counter should never wrap. 2752 * Either we are in around ~1 trillion A.C., assuming 2753 * 1 reboot per second, or we have a bug... 2754 */ 2755 WARN_ON(mddev->events == 0); 2756 2757 rdev_for_each(rdev, mddev) { 2758 if (rdev->badblocks.changed) 2759 any_badblocks_changed++; 2760 if (test_bit(Faulty, &rdev->flags)) 2761 set_bit(FaultRecorded, &rdev->flags); 2762 } 2763 2764 sync_sbs(mddev, nospares); 2765 spin_unlock(&mddev->lock); 2766 2767 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2768 mdname(mddev), mddev->in_sync); 2769 2770 if (mddev->queue) 2771 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2772 rewrite: 2773 md_bitmap_update_sb(mddev->bitmap); 2774 rdev_for_each(rdev, mddev) { 2775 if (rdev->sb_loaded != 1) 2776 continue; /* no noise on spare devices */ 2777 2778 if (!test_bit(Faulty, &rdev->flags)) { 2779 md_super_write(mddev,rdev, 2780 rdev->sb_start, rdev->sb_size, 2781 rdev->sb_page); 2782 pr_debug("md: (write) %pg's sb offset: %llu\n", 2783 rdev->bdev, 2784 (unsigned long long)rdev->sb_start); 2785 rdev->sb_events = mddev->events; 2786 if (rdev->badblocks.size) { 2787 md_super_write(mddev, rdev, 2788 rdev->badblocks.sector, 2789 rdev->badblocks.size << 9, 2790 rdev->bb_page); 2791 rdev->badblocks.size = 0; 2792 } 2793 2794 } else 2795 pr_debug("md: %pg (skipping faulty)\n", 2796 rdev->bdev); 2797 2798 if (mddev->level == LEVEL_MULTIPATH) 2799 /* only need to write one superblock... 
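 (with MULTIPATH the rdevs are, presumably, just different paths to the same underlying device, so a single copy is enough)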
*/ 2800 break; 2801 } 2802 if (md_super_wait(mddev) < 0) 2803 goto rewrite; 2804 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2805 2806 if (mddev_is_clustered(mddev) && ret == 0) 2807 md_cluster_ops->metadata_update_finish(mddev); 2808 2809 if (mddev->in_sync != sync_req || 2810 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2811 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2812 /* have to write it out again */ 2813 goto repeat; 2814 wake_up(&mddev->sb_wait); 2815 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2816 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2817 2818 rdev_for_each(rdev, mddev) { 2819 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2820 clear_bit(Blocked, &rdev->flags); 2821 2822 if (any_badblocks_changed) 2823 ack_all_badblocks(&rdev->badblocks); 2824 clear_bit(BlockedBadBlocks, &rdev->flags); 2825 wake_up(&rdev->blocked_wait); 2826 } 2827 } 2828 EXPORT_SYMBOL(md_update_sb); 2829 2830 static int add_bound_rdev(struct md_rdev *rdev) 2831 { 2832 struct mddev *mddev = rdev->mddev; 2833 int err = 0; 2834 bool add_journal = test_bit(Journal, &rdev->flags); 2835 2836 if (!mddev->pers->hot_remove_disk || add_journal) { 2837 /* If there is hot_add_disk but no hot_remove_disk 2838 * then added disks are for geometry changes 2839 * and should be added immediately. 2840 */ 2841 super_types[mddev->major_version]. 2842 validate_super(mddev, NULL/*freshest*/, rdev); 2843 if (add_journal) 2844 mddev_suspend(mddev); 2845 err = mddev->pers->hot_add_disk(mddev, rdev); 2846 if (add_journal) 2847 mddev_resume(mddev); 2848 if (err) { 2849 md_kick_rdev_from_array(rdev); 2850 return err; 2851 } 2852 } 2853 sysfs_notify_dirent_safe(rdev->sysfs_state); 2854 2855 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2856 if (mddev->degraded) 2857 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2858 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2859 md_new_event(); 2860 md_wakeup_thread(mddev->thread); 2861 return 0; 2862 } 2863 2864 /* words written to sysfs files may, or may not, be \n terminated. 2865 * We want to accept either case. For this we use cmd_match. 2866 */ 2867 static int cmd_match(const char *cmd, const char *str) 2868 { 2869 /* See if cmd, written into a sysfs file, matches 2870 * str.
They must either be the same, or cmd can 2871 * have a trailing newline 2872 */ 2873 while (*cmd && *str && *cmd == *str) { 2874 cmd++; 2875 str++; 2876 } 2877 if (*cmd == '\n') 2878 cmd++; 2879 if (*str || *cmd) 2880 return 0; 2881 return 1; 2882 } 2883 2884 struct rdev_sysfs_entry { 2885 struct attribute attr; 2886 ssize_t (*show)(struct md_rdev *, char *); 2887 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2888 }; 2889 2890 static ssize_t 2891 state_show(struct md_rdev *rdev, char *page) 2892 { 2893 char *sep = ","; 2894 size_t len = 0; 2895 unsigned long flags = READ_ONCE(rdev->flags); 2896 2897 if (test_bit(Faulty, &flags) || 2898 (!test_bit(ExternalBbl, &flags) && 2899 rdev->badblocks.unacked_exist)) 2900 len += sprintf(page+len, "faulty%s", sep); 2901 if (test_bit(In_sync, &flags)) 2902 len += sprintf(page+len, "in_sync%s", sep); 2903 if (test_bit(Journal, &flags)) 2904 len += sprintf(page+len, "journal%s", sep); 2905 if (test_bit(WriteMostly, &flags)) 2906 len += sprintf(page+len, "write_mostly%s", sep); 2907 if (test_bit(Blocked, &flags) || 2908 (rdev->badblocks.unacked_exist 2909 && !test_bit(Faulty, &flags))) 2910 len += sprintf(page+len, "blocked%s", sep); 2911 if (!test_bit(Faulty, &flags) && 2912 !test_bit(Journal, &flags) && 2913 !test_bit(In_sync, &flags)) 2914 len += sprintf(page+len, "spare%s", sep); 2915 if (test_bit(WriteErrorSeen, &flags)) 2916 len += sprintf(page+len, "write_error%s", sep); 2917 if (test_bit(WantReplacement, &flags)) 2918 len += sprintf(page+len, "want_replacement%s", sep); 2919 if (test_bit(Replacement, &flags)) 2920 len += sprintf(page+len, "replacement%s", sep); 2921 if (test_bit(ExternalBbl, &flags)) 2922 len += sprintf(page+len, "external_bbl%s", sep); 2923 if (test_bit(FailFast, &flags)) 2924 len += sprintf(page+len, "failfast%s", sep); 2925 2926 if (len) 2927 len -= strlen(sep); 2928 2929 return len+sprintf(page+len, "\n"); 2930 } 2931 2932 static ssize_t 2933 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2934 { 2935 /* can write 2936 * faulty - simulates an error 2937 * remove - disconnects the device 2938 * writemostly - sets write_mostly 2939 * -writemostly - clears write_mostly 2940 * blocked - sets the Blocked flags 2941 * -blocked - clears the Blocked and possibly simulates an error 2942 * insync - sets Insync providing device isn't active 2943 * -insync - clear Insync for a device with a slot assigned, 2944 * so that it gets rebuilt based on bitmap 2945 * write_error - sets WriteErrorSeen 2946 * -write_error - clears WriteErrorSeen 2947 * {,-}failfast - set/clear FailFast 2948 */ 2949 2950 struct mddev *mddev = rdev->mddev; 2951 int err = -EINVAL; 2952 bool need_update_sb = false; 2953 2954 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2955 md_error(rdev->mddev, rdev); 2956 2957 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2958 err = -EBUSY; 2959 else 2960 err = 0; 2961 } else if (cmd_match(buf, "remove")) { 2962 if (rdev->mddev->pers) { 2963 clear_bit(Blocked, &rdev->flags); 2964 remove_and_add_spares(rdev->mddev, rdev); 2965 } 2966 if (rdev->raid_disk >= 0) 2967 err = -EBUSY; 2968 else { 2969 err = 0; 2970 if (mddev_is_clustered(mddev)) 2971 err = md_cluster_ops->remove_disk(mddev, rdev); 2972 2973 if (err == 0) { 2974 md_kick_rdev_from_array(rdev); 2975 if (mddev->pers) { 2976 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2977 md_wakeup_thread(mddev->thread); 2978 } 2979 md_new_event(); 2980 } 2981 } 2982 } else if (cmd_match(buf, "writemostly")) { 2983 set_bit(WriteMostly, &rdev->flags); 2984 
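/* a WriteMostly rdev may also need the write-behind serialization pool */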
mddev_create_serial_pool(rdev->mddev, rdev, false); 2985 need_update_sb = true; 2986 err = 0; 2987 } else if (cmd_match(buf, "-writemostly")) { 2988 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2989 clear_bit(WriteMostly, &rdev->flags); 2990 need_update_sb = true; 2991 err = 0; 2992 } else if (cmd_match(buf, "blocked")) { 2993 set_bit(Blocked, &rdev->flags); 2994 err = 0; 2995 } else if (cmd_match(buf, "-blocked")) { 2996 if (!test_bit(Faulty, &rdev->flags) && 2997 !test_bit(ExternalBbl, &rdev->flags) && 2998 rdev->badblocks.unacked_exist) { 2999 /* metadata handler doesn't understand badblocks, 3000 * so we need to fail the device 3001 */ 3002 md_error(rdev->mddev, rdev); 3003 } 3004 clear_bit(Blocked, &rdev->flags); 3005 clear_bit(BlockedBadBlocks, &rdev->flags); 3006 wake_up(&rdev->blocked_wait); 3007 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3008 md_wakeup_thread(rdev->mddev->thread); 3009 3010 err = 0; 3011 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3012 set_bit(In_sync, &rdev->flags); 3013 err = 0; 3014 } else if (cmd_match(buf, "failfast")) { 3015 set_bit(FailFast, &rdev->flags); 3016 need_update_sb = true; 3017 err = 0; 3018 } else if (cmd_match(buf, "-failfast")) { 3019 clear_bit(FailFast, &rdev->flags); 3020 need_update_sb = true; 3021 err = 0; 3022 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3023 !test_bit(Journal, &rdev->flags)) { 3024 if (rdev->mddev->pers == NULL) { 3025 clear_bit(In_sync, &rdev->flags); 3026 rdev->saved_raid_disk = rdev->raid_disk; 3027 rdev->raid_disk = -1; 3028 err = 0; 3029 } 3030 } else if (cmd_match(buf, "write_error")) { 3031 set_bit(WriteErrorSeen, &rdev->flags); 3032 err = 0; 3033 } else if (cmd_match(buf, "-write_error")) { 3034 clear_bit(WriteErrorSeen, &rdev->flags); 3035 err = 0; 3036 } else if (cmd_match(buf, "want_replacement")) { 3037 /* Any non-spare device that is not a replacement can 3038 * become want_replacement at any time, but we then need to 3039 * check if recovery is needed. 3040 */ 3041 if (rdev->raid_disk >= 0 && 3042 !test_bit(Journal, &rdev->flags) && 3043 !test_bit(Replacement, &rdev->flags)) 3044 set_bit(WantReplacement, &rdev->flags); 3045 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3046 md_wakeup_thread(rdev->mddev->thread); 3047 err = 0; 3048 } else if (cmd_match(buf, "-want_replacement")) { 3049 /* Clearing 'want_replacement' is always allowed. 3050 * Once replacements starts it is too late though. 3051 */ 3052 err = 0; 3053 clear_bit(WantReplacement, &rdev->flags); 3054 } else if (cmd_match(buf, "replacement")) { 3055 /* Can only set a device as a replacement when array has not 3056 * yet been started. Once running, replacement is automatic 3057 * from spares, or by assigning 'slot'. 3058 */ 3059 if (rdev->mddev->pers) 3060 err = -EBUSY; 3061 else { 3062 set_bit(Replacement, &rdev->flags); 3063 err = 0; 3064 } 3065 } else if (cmd_match(buf, "-replacement")) { 3066 /* Similarly, can only clear Replacement before start */ 3067 if (rdev->mddev->pers) 3068 err = -EBUSY; 3069 else { 3070 clear_bit(Replacement, &rdev->flags); 3071 err = 0; 3072 } 3073 } else if (cmd_match(buf, "re-add")) { 3074 if (!rdev->mddev->pers) 3075 err = -EINVAL; 3076 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3077 rdev->saved_raid_disk >= 0) { 3078 /* clear_bit is performed _after_ all the devices 3079 * have their local Faulty bit cleared. 
If any writes 3080 * happen in the meantime in the local node, they 3081 * will land in the local bitmap, which will be synced 3082 * by this node eventually 3083 */ 3084 if (!mddev_is_clustered(rdev->mddev) || 3085 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3086 clear_bit(Faulty, &rdev->flags); 3087 err = add_bound_rdev(rdev); 3088 } 3089 } else 3090 err = -EBUSY; 3091 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3092 set_bit(ExternalBbl, &rdev->flags); 3093 rdev->badblocks.shift = 0; 3094 err = 0; 3095 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3096 clear_bit(ExternalBbl, &rdev->flags); 3097 err = 0; 3098 } 3099 if (need_update_sb) 3100 md_update_sb(mddev, 1); 3101 if (!err) 3102 sysfs_notify_dirent_safe(rdev->sysfs_state); 3103 return err ? err : len; 3104 } 3105 static struct rdev_sysfs_entry rdev_state = 3106 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3107 3108 static ssize_t 3109 errors_show(struct md_rdev *rdev, char *page) 3110 { 3111 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3112 } 3113 3114 static ssize_t 3115 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3116 { 3117 unsigned int n; 3118 int rv; 3119 3120 rv = kstrtouint(buf, 10, &n); 3121 if (rv < 0) 3122 return rv; 3123 atomic_set(&rdev->corrected_errors, n); 3124 return len; 3125 } 3126 static struct rdev_sysfs_entry rdev_errors = 3127 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3128 3129 static ssize_t 3130 slot_show(struct md_rdev *rdev, char *page) 3131 { 3132 if (test_bit(Journal, &rdev->flags)) 3133 return sprintf(page, "journal\n"); 3134 else if (rdev->raid_disk < 0) 3135 return sprintf(page, "none\n"); 3136 else 3137 return sprintf(page, "%d\n", rdev->raid_disk); 3138 } 3139 3140 static ssize_t 3141 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3142 { 3143 int slot; 3144 int err; 3145 3146 if (test_bit(Journal, &rdev->flags)) 3147 return -EBUSY; 3148 if (strncmp(buf, "none", 4)==0) 3149 slot = -1; 3150 else { 3151 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3152 if (err < 0) 3153 return err; 3154 if (slot < 0) 3155 /* overflow */ 3156 return -ENOSPC; 3157 } 3158 if (rdev->mddev->pers && slot == -1) { 3159 /* Setting 'slot' on an active array requires also 3160 * updating the 'rd%d' link, and communicating 3161 * with the personality with ->hot_*_disk. 3162 * For now we only support removing 3163 * failed/spare devices. This normally happens automatically, 3164 * but not when the metadata is externally managed. 3165 */ 3166 if (rdev->raid_disk == -1) 3167 return -EEXIST; 3168 /* personality does all needed checks */ 3169 if (rdev->mddev->pers->hot_remove_disk == NULL) 3170 return -EINVAL; 3171 clear_bit(Blocked, &rdev->flags); 3172 remove_and_add_spares(rdev->mddev, rdev); 3173 if (rdev->raid_disk >= 0) 3174 return -EBUSY; 3175 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3176 md_wakeup_thread(rdev->mddev->thread); 3177 } else if (rdev->mddev->pers) { 3178 /* Activating a spare .. or possibly reactivating 3179 * if we ever get bitmaps working here. 
3180 */ 3181 int err; 3182 3183 if (rdev->raid_disk != -1) 3184 return -EBUSY; 3185 3186 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3187 return -EBUSY; 3188 3189 if (rdev->mddev->pers->hot_add_disk == NULL) 3190 return -EINVAL; 3191 3192 if (slot >= rdev->mddev->raid_disks && 3193 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3194 return -ENOSPC; 3195 3196 rdev->raid_disk = slot; 3197 if (test_bit(In_sync, &rdev->flags)) 3198 rdev->saved_raid_disk = slot; 3199 else 3200 rdev->saved_raid_disk = -1; 3201 clear_bit(In_sync, &rdev->flags); 3202 clear_bit(Bitmap_sync, &rdev->flags); 3203 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3204 if (err) { 3205 rdev->raid_disk = -1; 3206 return err; 3207 } else 3208 sysfs_notify_dirent_safe(rdev->sysfs_state); 3209 /* failure here is OK */; 3210 sysfs_link_rdev(rdev->mddev, rdev); 3211 /* don't wakeup anyone, leave that to userspace. */ 3212 } else { 3213 if (slot >= rdev->mddev->raid_disks && 3214 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3215 return -ENOSPC; 3216 rdev->raid_disk = slot; 3217 /* assume it is working */ 3218 clear_bit(Faulty, &rdev->flags); 3219 clear_bit(WriteMostly, &rdev->flags); 3220 set_bit(In_sync, &rdev->flags); 3221 sysfs_notify_dirent_safe(rdev->sysfs_state); 3222 } 3223 return len; 3224 } 3225 3226 static struct rdev_sysfs_entry rdev_slot = 3227 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3228 3229 static ssize_t 3230 offset_show(struct md_rdev *rdev, char *page) 3231 { 3232 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3233 } 3234 3235 static ssize_t 3236 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3237 { 3238 unsigned long long offset; 3239 if (kstrtoull(buf, 10, &offset) < 0) 3240 return -EINVAL; 3241 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3242 return -EBUSY; 3243 if (rdev->sectors && rdev->mddev->external) 3244 /* Must set offset before size, so overlap checks 3245 * can be sane */ 3246 return -EBUSY; 3247 rdev->data_offset = offset; 3248 rdev->new_data_offset = offset; 3249 return len; 3250 } 3251 3252 static struct rdev_sysfs_entry rdev_offset = 3253 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3254 3255 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3256 { 3257 return sprintf(page, "%llu\n", 3258 (unsigned long long)rdev->new_data_offset); 3259 } 3260 3261 static ssize_t new_offset_store(struct md_rdev *rdev, 3262 const char *buf, size_t len) 3263 { 3264 unsigned long long new_offset; 3265 struct mddev *mddev = rdev->mddev; 3266 3267 if (kstrtoull(buf, 10, &new_offset) < 0) 3268 return -EINVAL; 3269 3270 if (mddev->sync_thread || 3271 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3272 return -EBUSY; 3273 if (new_offset == rdev->data_offset) 3274 /* reset is always permitted */ 3275 ; 3276 else if (new_offset > rdev->data_offset) { 3277 /* must not push array size beyond rdev_sectors */ 3278 if (new_offset - rdev->data_offset 3279 + mddev->dev_sectors > rdev->sectors) 3280 return -E2BIG; 3281 } 3282 /* Metadata worries about other space details. */ 3283 3284 /* decreasing the offset is inconsistent with a backwards 3285 * reshape. 3286 */ 3287 if (new_offset < rdev->data_offset && 3288 mddev->reshape_backwards) 3289 return -EINVAL; 3290 /* Increasing offset is inconsistent with forwards 3291 * reshape. reshape_direction should be set to 3292 * 'backwards' first. 
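 * (normally by writing 'backwards' to the array's md/reshape_direction
 * sysfs attribute).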
3293 */ 3294 if (new_offset > rdev->data_offset && 3295 !mddev->reshape_backwards) 3296 return -EINVAL; 3297 3298 if (mddev->pers && mddev->persistent && 3299 !super_types[mddev->major_version] 3300 .allow_new_offset(rdev, new_offset)) 3301 return -E2BIG; 3302 rdev->new_data_offset = new_offset; 3303 if (new_offset > rdev->data_offset) 3304 mddev->reshape_backwards = 1; 3305 else if (new_offset < rdev->data_offset) 3306 mddev->reshape_backwards = 0; 3307 3308 return len; 3309 } 3310 static struct rdev_sysfs_entry rdev_new_offset = 3311 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3312 3313 static ssize_t 3314 rdev_size_show(struct md_rdev *rdev, char *page) 3315 { 3316 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3317 } 3318 3319 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3320 { 3321 /* check if two start/length pairs overlap */ 3322 if (a->data_offset + a->sectors <= b->data_offset) 3323 return false; 3324 if (b->data_offset + b->sectors <= a->data_offset) 3325 return false; 3326 return true; 3327 } 3328 3329 static bool md_rdev_overlaps(struct md_rdev *rdev) 3330 { 3331 struct mddev *mddev; 3332 struct md_rdev *rdev2; 3333 3334 spin_lock(&all_mddevs_lock); 3335 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3336 if (test_bit(MD_DELETED, &mddev->flags)) 3337 continue; 3338 rdev_for_each(rdev2, mddev) { 3339 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3340 md_rdevs_overlap(rdev, rdev2)) { 3341 spin_unlock(&all_mddevs_lock); 3342 return true; 3343 } 3344 } 3345 } 3346 spin_unlock(&all_mddevs_lock); 3347 return false; 3348 } 3349 3350 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3351 { 3352 unsigned long long blocks; 3353 sector_t new; 3354 3355 if (kstrtoull(buf, 10, &blocks) < 0) 3356 return -EINVAL; 3357 3358 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3359 return -EINVAL; /* sector conversion overflow */ 3360 3361 new = blocks * 2; 3362 if (new != blocks * 2) 3363 return -EINVAL; /* unsigned long long to sector_t overflow */ 3364 3365 *sectors = new; 3366 return 0; 3367 } 3368 3369 static ssize_t 3370 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3371 { 3372 struct mddev *my_mddev = rdev->mddev; 3373 sector_t oldsectors = rdev->sectors; 3374 sector_t sectors; 3375 3376 if (test_bit(Journal, &rdev->flags)) 3377 return -EBUSY; 3378 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3379 return -EINVAL; 3380 if (rdev->data_offset != rdev->new_data_offset) 3381 return -EINVAL; /* too confusing */ 3382 if (my_mddev->pers && rdev->raid_disk >= 0) { 3383 if (my_mddev->persistent) { 3384 sectors = super_types[my_mddev->major_version]. 3385 rdev_size_change(rdev, sectors); 3386 if (!sectors) 3387 return -EBUSY; 3388 } else if (!sectors) 3389 sectors = bdev_nr_sectors(rdev->bdev) - 3390 rdev->data_offset; 3391 if (!my_mddev->pers->resize) 3392 /* Cannot change size for RAID0 or Linear etc */ 3393 return -EINVAL; 3394 } 3395 if (sectors < my_mddev->dev_sectors) 3396 return -EINVAL; /* component must fit device */ 3397 3398 rdev->sectors = sectors; 3399 3400 /* 3401 * Check that all other rdevs with the same bdev do not overlap. This 3402 * check does not provide a hard guarantee, it just helps avoid 3403 * dangerous mistakes. 3404 */ 3405 if (sectors > oldsectors && my_mddev->external && 3406 md_rdev_overlaps(rdev)) { 3407 /* 3408 * Someone else could have slipped in a size change here, but 3409 * doing so is just silly.
We put oldsectors back because we 3410 * know it is safe, and trust userspace not to race with itself. 3411 */ 3412 rdev->sectors = oldsectors; 3413 return -EBUSY; 3414 } 3415 return len; 3416 } 3417 3418 static struct rdev_sysfs_entry rdev_size = 3419 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3420 3421 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3422 { 3423 unsigned long long recovery_start = rdev->recovery_offset; 3424 3425 if (test_bit(In_sync, &rdev->flags) || 3426 recovery_start == MaxSector) 3427 return sprintf(page, "none\n"); 3428 3429 return sprintf(page, "%llu\n", recovery_start); 3430 } 3431 3432 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3433 { 3434 unsigned long long recovery_start; 3435 3436 if (cmd_match(buf, "none")) 3437 recovery_start = MaxSector; 3438 else if (kstrtoull(buf, 10, &recovery_start)) 3439 return -EINVAL; 3440 3441 if (rdev->mddev->pers && 3442 rdev->raid_disk >= 0) 3443 return -EBUSY; 3444 3445 rdev->recovery_offset = recovery_start; 3446 if (recovery_start == MaxSector) 3447 set_bit(In_sync, &rdev->flags); 3448 else 3449 clear_bit(In_sync, &rdev->flags); 3450 return len; 3451 } 3452 3453 static struct rdev_sysfs_entry rdev_recovery_start = 3454 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3455 3456 /* sysfs access to bad-blocks list. 3457 * We present two files. 3458 * 'bad-blocks' lists sector numbers and lengths of ranges that 3459 * are recorded as bad. The list is truncated to fit within 3460 * the one-page limit of sysfs. 3461 * Writing "sector length" to this file adds an acknowledged 3462 * bad block list. 3463 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3464 * been acknowledged. Writing to this file adds bad blocks 3465 * without acknowledging them. This is largely for testing. 
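 * For example, writing "1050 8" to 'bad_blocks' records an acknowledged
 * bad range of 8 sectors starting at sector 1050.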
3466 */ 3467 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3468 { 3469 return badblocks_show(&rdev->badblocks, page, 0); 3470 } 3471 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3472 { 3473 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3474 /* Maybe that ack was all we needed */ 3475 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3476 wake_up(&rdev->blocked_wait); 3477 return rv; 3478 } 3479 static struct rdev_sysfs_entry rdev_bad_blocks = 3480 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3481 3482 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3483 { 3484 return badblocks_show(&rdev->badblocks, page, 1); 3485 } 3486 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3487 { 3488 return badblocks_store(&rdev->badblocks, page, len, 1); 3489 } 3490 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3491 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3492 3493 static ssize_t 3494 ppl_sector_show(struct md_rdev *rdev, char *page) 3495 { 3496 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3497 } 3498 3499 static ssize_t 3500 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3501 { 3502 unsigned long long sector; 3503 3504 if (kstrtoull(buf, 10, &sector) < 0) 3505 return -EINVAL; 3506 if (sector != (sector_t)sector) 3507 return -EINVAL; 3508 3509 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3510 rdev->raid_disk >= 0) 3511 return -EBUSY; 3512 3513 if (rdev->mddev->persistent) { 3514 if (rdev->mddev->major_version == 0) 3515 return -EINVAL; 3516 if ((sector > rdev->sb_start && 3517 sector - rdev->sb_start > S16_MAX) || 3518 (sector < rdev->sb_start && 3519 rdev->sb_start - sector > -S16_MIN)) 3520 return -EINVAL; 3521 rdev->ppl.offset = sector - rdev->sb_start; 3522 } else if (!rdev->mddev->external) { 3523 return -EBUSY; 3524 } 3525 rdev->ppl.sector = sector; 3526 return len; 3527 } 3528 3529 static struct rdev_sysfs_entry rdev_ppl_sector = 3530 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3531 3532 static ssize_t 3533 ppl_size_show(struct md_rdev *rdev, char *page) 3534 { 3535 return sprintf(page, "%u\n", rdev->ppl.size); 3536 } 3537 3538 static ssize_t 3539 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3540 { 3541 unsigned int size; 3542 3543 if (kstrtouint(buf, 10, &size) < 0) 3544 return -EINVAL; 3545 3546 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3547 rdev->raid_disk >= 0) 3548 return -EBUSY; 3549 3550 if (rdev->mddev->persistent) { 3551 if (rdev->mddev->major_version == 0) 3552 return -EINVAL; 3553 if (size > U16_MAX) 3554 return -EINVAL; 3555 } else if (!rdev->mddev->external) { 3556 return -EBUSY; 3557 } 3558 rdev->ppl.size = size; 3559 return len; 3560 } 3561 3562 static struct rdev_sysfs_entry rdev_ppl_size = 3563 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3564 3565 static struct attribute *rdev_default_attrs[] = { 3566 &rdev_state.attr, 3567 &rdev_errors.attr, 3568 &rdev_slot.attr, 3569 &rdev_offset.attr, 3570 &rdev_new_offset.attr, 3571 &rdev_size.attr, 3572 &rdev_recovery_start.attr, 3573 &rdev_bad_blocks.attr, 3574 &rdev_unack_bad_blocks.attr, 3575 &rdev_ppl_sector.attr, 3576 &rdev_ppl_size.attr, 3577 NULL, 3578 }; 3579 ATTRIBUTE_GROUPS(rdev_default); 3580 static ssize_t 3581 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3582 { 3583 struct rdev_sysfs_entry
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3584 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3585 3586 if (!entry->show) 3587 return -EIO; 3588 if (!rdev->mddev) 3589 return -ENODEV; 3590 return entry->show(rdev, page); 3591 } 3592 3593 static ssize_t 3594 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3595 const char *page, size_t length) 3596 { 3597 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3598 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3599 struct kernfs_node *kn = NULL; 3600 ssize_t rv; 3601 struct mddev *mddev = rdev->mddev; 3602 3603 if (!entry->store) 3604 return -EIO; 3605 if (!capable(CAP_SYS_ADMIN)) 3606 return -EACCES; 3607 3608 if (entry->store == state_store && cmd_match(page, "remove")) 3609 kn = sysfs_break_active_protection(kobj, attr); 3610 3611 rv = mddev ? mddev_lock(mddev) : -ENODEV; 3612 if (!rv) { 3613 if (rdev->mddev == NULL) 3614 rv = -ENODEV; 3615 else 3616 rv = entry->store(rdev, page, length); 3617 mddev_unlock(mddev); 3618 } 3619 3620 if (kn) 3621 sysfs_unbreak_active_protection(kn); 3622 3623 return rv; 3624 } 3625 3626 static void rdev_free(struct kobject *ko) 3627 { 3628 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3629 kfree(rdev); 3630 } 3631 static const struct sysfs_ops rdev_sysfs_ops = { 3632 .show = rdev_attr_show, 3633 .store = rdev_attr_store, 3634 }; 3635 static const struct kobj_type rdev_ktype = { 3636 .release = rdev_free, 3637 .sysfs_ops = &rdev_sysfs_ops, 3638 .default_groups = rdev_default_groups, 3639 }; 3640 3641 int md_rdev_init(struct md_rdev *rdev) 3642 { 3643 rdev->desc_nr = -1; 3644 rdev->saved_raid_disk = -1; 3645 rdev->raid_disk = -1; 3646 rdev->flags = 0; 3647 rdev->data_offset = 0; 3648 rdev->new_data_offset = 0; 3649 rdev->sb_events = 0; 3650 rdev->last_read_error = 0; 3651 rdev->sb_loaded = 0; 3652 rdev->bb_page = NULL; 3653 atomic_set(&rdev->nr_pending, 0); 3654 atomic_set(&rdev->read_errors, 0); 3655 atomic_set(&rdev->corrected_errors, 0); 3656 3657 INIT_LIST_HEAD(&rdev->same_set); 3658 init_waitqueue_head(&rdev->blocked_wait); 3659 3660 /* Add space to store bad block list. 3661 * This reserves the space even on arrays where it cannot 3662 * be used - I wonder if that matters 3663 */ 3664 return badblocks_init(&rdev->badblocks, 0); 3665 } 3666 EXPORT_SYMBOL_GPL(md_rdev_init); 3667 3668 /* 3669 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3670 * 3671 * mark the device faulty if: 3672 * 3673 * - the device is nonexistent (zero size) 3674 * - the device has no valid superblock 3675 * 3676 * a faulty rdev _never_ has rdev->sb set. 
3677 */ 3678 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3679 { 3680 struct md_rdev *rdev; 3681 struct md_rdev *holder; 3682 sector_t size; 3683 int err; 3684 3685 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3686 if (!rdev) 3687 return ERR_PTR(-ENOMEM); 3688 3689 err = md_rdev_init(rdev); 3690 if (err) 3691 goto out_free_rdev; 3692 err = alloc_disk_sb(rdev); 3693 if (err) 3694 goto out_clear_rdev; 3695 3696 if (super_format == -2) { 3697 holder = &claim_rdev; 3698 } else { 3699 holder = rdev; 3700 set_bit(Holder, &rdev->flags); 3701 } 3702 3703 rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, 3704 holder, NULL); 3705 if (IS_ERR(rdev->bdev)) { 3706 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3707 MAJOR(newdev), MINOR(newdev)); 3708 err = PTR_ERR(rdev->bdev); 3709 goto out_clear_rdev; 3710 } 3711 3712 kobject_init(&rdev->kobj, &rdev_ktype); 3713 3714 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3715 if (!size) { 3716 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3717 rdev->bdev); 3718 err = -EINVAL; 3719 goto out_blkdev_put; 3720 } 3721 3722 if (super_format >= 0) { 3723 err = super_types[super_format]. 3724 load_super(rdev, NULL, super_minor); 3725 if (err == -EINVAL) { 3726 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3727 rdev->bdev, 3728 super_format, super_minor); 3729 goto out_blkdev_put; 3730 } 3731 if (err < 0) { 3732 pr_warn("md: could not read %pg's sb, not importing!\n", 3733 rdev->bdev); 3734 goto out_blkdev_put; 3735 } 3736 } 3737 3738 return rdev; 3739 3740 out_blkdev_put: 3741 blkdev_put(rdev->bdev, holder); 3742 out_clear_rdev: 3743 md_rdev_clear(rdev); 3744 out_free_rdev: 3745 kfree(rdev); 3746 return ERR_PTR(err); 3747 } 3748 3749 /* 3750 * Check a full RAID array for plausibility 3751 */ 3752 3753 static int analyze_sbs(struct mddev *mddev) 3754 { 3755 int i; 3756 struct md_rdev *rdev, *freshest, *tmp; 3757 3758 freshest = NULL; 3759 rdev_for_each_safe(rdev, tmp, mddev) 3760 switch (super_types[mddev->major_version]. 3761 load_super(rdev, freshest, mddev->minor_version)) { 3762 case 1: 3763 freshest = rdev; 3764 break; 3765 case 0: 3766 break; 3767 default: 3768 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3769 rdev->bdev); 3770 md_kick_rdev_from_array(rdev); 3771 } 3772 3773 /* Cannot find a valid fresh disk */ 3774 if (!freshest) { 3775 pr_warn("md: cannot find a valid disk\n"); 3776 return -EINVAL; 3777 } 3778 3779 super_types[mddev->major_version]. 3780 validate_super(mddev, NULL/*freshest*/, freshest); 3781 3782 i = 0; 3783 rdev_for_each_safe(rdev, tmp, mddev) { 3784 if (mddev->max_disks && 3785 (rdev->desc_nr >= mddev->max_disks || 3786 i > mddev->max_disks)) { 3787 pr_warn("md: %s: %pg: only %d devices permitted\n", 3788 mdname(mddev), rdev->bdev, 3789 mddev->max_disks); 3790 md_kick_rdev_from_array(rdev); 3791 continue; 3792 } 3793 if (rdev != freshest) { 3794 if (super_types[mddev->major_version]. 
3795 validate_super(mddev, freshest, rdev)) { 3796 pr_warn("md: kicking non-fresh %pg from array!\n", 3797 rdev->bdev); 3798 md_kick_rdev_from_array(rdev); 3799 continue; 3800 } 3801 } 3802 if (mddev->level == LEVEL_MULTIPATH) { 3803 rdev->desc_nr = i++; 3804 rdev->raid_disk = rdev->desc_nr; 3805 set_bit(In_sync, &rdev->flags); 3806 } else if (rdev->raid_disk >= 3807 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3808 !test_bit(Journal, &rdev->flags)) { 3809 rdev->raid_disk = -1; 3810 clear_bit(In_sync, &rdev->flags); 3811 } 3812 } 3813 3814 return 0; 3815 } 3816 3817 /* Read a fixed-point number. 3818 * Numbers in sysfs attributes should be in "standard" units where 3819 * possible, so time should be in seconds. 3820 * However we internally use a a much smaller unit such as 3821 * milliseconds or jiffies. 3822 * This function takes a decimal number with a possible fractional 3823 * component, and produces an integer which is the result of 3824 * multiplying that number by 10^'scale'. 3825 * all without any floating-point arithmetic. 3826 */ 3827 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3828 { 3829 unsigned long result = 0; 3830 long decimals = -1; 3831 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3832 if (*cp == '.') 3833 decimals = 0; 3834 else if (decimals < scale) { 3835 unsigned int value; 3836 value = *cp - '0'; 3837 result = result * 10 + value; 3838 if (decimals >= 0) 3839 decimals++; 3840 } 3841 cp++; 3842 } 3843 if (*cp == '\n') 3844 cp++; 3845 if (*cp) 3846 return -EINVAL; 3847 if (decimals < 0) 3848 decimals = 0; 3849 *res = result * int_pow(10, scale - decimals); 3850 return 0; 3851 } 3852 3853 static ssize_t 3854 safe_delay_show(struct mddev *mddev, char *page) 3855 { 3856 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3857 3858 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3859 } 3860 static ssize_t 3861 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3862 { 3863 unsigned long msec; 3864 3865 if (mddev_is_clustered(mddev)) { 3866 pr_warn("md: Safemode is disabled for clustered mode\n"); 3867 return -EINVAL; 3868 } 3869 3870 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3871 return -EINVAL; 3872 if (msec == 0) 3873 mddev->safemode_delay = 0; 3874 else { 3875 unsigned long old_delay = mddev->safemode_delay; 3876 unsigned long new_delay = (msec*HZ)/1000; 3877 3878 if (new_delay == 0) 3879 new_delay = 1; 3880 mddev->safemode_delay = new_delay; 3881 if (new_delay < old_delay || old_delay == 0) 3882 mod_timer(&mddev->safemode_timer, jiffies+1); 3883 } 3884 return len; 3885 } 3886 static struct md_sysfs_entry md_safe_delay = 3887 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3888 3889 static ssize_t 3890 level_show(struct mddev *mddev, char *page) 3891 { 3892 struct md_personality *p; 3893 int ret; 3894 spin_lock(&mddev->lock); 3895 p = mddev->pers; 3896 if (p) 3897 ret = sprintf(page, "%s\n", p->name); 3898 else if (mddev->clevel[0]) 3899 ret = sprintf(page, "%s\n", mddev->clevel); 3900 else if (mddev->level != LEVEL_NONE) 3901 ret = sprintf(page, "%d\n", mddev->level); 3902 else 3903 ret = 0; 3904 spin_unlock(&mddev->lock); 3905 return ret; 3906 } 3907 3908 static ssize_t 3909 level_store(struct mddev *mddev, const char *buf, size_t len) 3910 { 3911 char clevel[16]; 3912 ssize_t rv; 3913 size_t slen = len; 3914 struct md_personality *pers, *oldpers; 3915 long level; 3916 void *priv, *oldpriv; 3917 struct md_rdev *rdev; 3918 3919 
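	/*
	 * Written via /sys/block/mdX/md/level.  If the array is not running
	 * yet we only record the requested level; otherwise this is an
	 * online "takeover", which requires the old personality to support
	 * ->quiesce() and the new one to provide ->takeover().
	 */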
if (slen == 0 || slen >= sizeof(clevel)) 3920 return -EINVAL; 3921 3922 rv = mddev_lock(mddev); 3923 if (rv) 3924 return rv; 3925 3926 if (mddev->pers == NULL) { 3927 strncpy(mddev->clevel, buf, slen); 3928 if (mddev->clevel[slen-1] == '\n') 3929 slen--; 3930 mddev->clevel[slen] = 0; 3931 mddev->level = LEVEL_NONE; 3932 rv = len; 3933 goto out_unlock; 3934 } 3935 rv = -EROFS; 3936 if (!md_is_rdwr(mddev)) 3937 goto out_unlock; 3938 3939 /* request to change the personality. Need to ensure: 3940 * - array is not engaged in resync/recovery/reshape 3941 * - old personality can be suspended 3942 * - new personality will access other array. 3943 */ 3944 3945 rv = -EBUSY; 3946 if (mddev->sync_thread || 3947 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3948 mddev->reshape_position != MaxSector || 3949 mddev->sysfs_active) 3950 goto out_unlock; 3951 3952 rv = -EINVAL; 3953 if (!mddev->pers->quiesce) { 3954 pr_warn("md: %s: %s does not support online personality change\n", 3955 mdname(mddev), mddev->pers->name); 3956 goto out_unlock; 3957 } 3958 3959 /* Now find the new personality */ 3960 strncpy(clevel, buf, slen); 3961 if (clevel[slen-1] == '\n') 3962 slen--; 3963 clevel[slen] = 0; 3964 if (kstrtol(clevel, 10, &level)) 3965 level = LEVEL_NONE; 3966 3967 if (request_module("md-%s", clevel) != 0) 3968 request_module("md-level-%s", clevel); 3969 spin_lock(&pers_lock); 3970 pers = find_pers(level, clevel); 3971 if (!pers || !try_module_get(pers->owner)) { 3972 spin_unlock(&pers_lock); 3973 pr_warn("md: personality %s not loaded\n", clevel); 3974 rv = -EINVAL; 3975 goto out_unlock; 3976 } 3977 spin_unlock(&pers_lock); 3978 3979 if (pers == mddev->pers) { 3980 /* Nothing to do! */ 3981 module_put(pers->owner); 3982 rv = len; 3983 goto out_unlock; 3984 } 3985 if (!pers->takeover) { 3986 module_put(pers->owner); 3987 pr_warn("md: %s: %s does not support personality takeover\n", 3988 mdname(mddev), clevel); 3989 rv = -EINVAL; 3990 goto out_unlock; 3991 } 3992 3993 rdev_for_each(rdev, mddev) 3994 rdev->new_raid_disk = rdev->raid_disk; 3995 3996 /* ->takeover must set new_* and/or delta_disks 3997 * if it succeeds, and may set them when it fails. 3998 */ 3999 priv = pers->takeover(mddev); 4000 if (IS_ERR(priv)) { 4001 mddev->new_level = mddev->level; 4002 mddev->new_layout = mddev->layout; 4003 mddev->new_chunk_sectors = mddev->chunk_sectors; 4004 mddev->raid_disks -= mddev->delta_disks; 4005 mddev->delta_disks = 0; 4006 mddev->reshape_backwards = 0; 4007 module_put(pers->owner); 4008 pr_warn("md: %s: %s would not accept array\n", 4009 mdname(mddev), clevel); 4010 rv = PTR_ERR(priv); 4011 goto out_unlock; 4012 } 4013 4014 /* Looks like we have a winner */ 4015 mddev_suspend(mddev); 4016 mddev_detach(mddev); 4017 4018 spin_lock(&mddev->lock); 4019 oldpers = mddev->pers; 4020 oldpriv = mddev->private; 4021 mddev->pers = pers; 4022 mddev->private = priv; 4023 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4024 mddev->level = mddev->new_level; 4025 mddev->layout = mddev->new_layout; 4026 mddev->chunk_sectors = mddev->new_chunk_sectors; 4027 mddev->delta_disks = 0; 4028 mddev->reshape_backwards = 0; 4029 mddev->degraded = 0; 4030 spin_unlock(&mddev->lock); 4031 4032 if (oldpers->sync_request == NULL && 4033 mddev->external) { 4034 /* We are converting from a no-redundancy array 4035 * to a redundancy array and metadata is managed 4036 * externally so we need to be sure that writes 4037 * won't block due to a need to transition 4038 * clean->dirty 4039 * until external management is started. 
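		 * Clearing in_sync, safemode and safemode_delay below
		 * guarantees exactly that.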
4040 */ 4041 mddev->in_sync = 0; 4042 mddev->safemode_delay = 0; 4043 mddev->safemode = 0; 4044 } 4045 4046 oldpers->free(mddev, oldpriv); 4047 4048 if (oldpers->sync_request == NULL && 4049 pers->sync_request != NULL) { 4050 /* need to add the md_redundancy_group */ 4051 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4052 pr_warn("md: cannot register extra attributes for %s\n", 4053 mdname(mddev)); 4054 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4055 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4056 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4057 } 4058 if (oldpers->sync_request != NULL && 4059 pers->sync_request == NULL) { 4060 /* need to remove the md_redundancy_group */ 4061 if (mddev->to_remove == NULL) 4062 mddev->to_remove = &md_redundancy_group; 4063 } 4064 4065 module_put(oldpers->owner); 4066 4067 rdev_for_each(rdev, mddev) { 4068 if (rdev->raid_disk < 0) 4069 continue; 4070 if (rdev->new_raid_disk >= mddev->raid_disks) 4071 rdev->new_raid_disk = -1; 4072 if (rdev->new_raid_disk == rdev->raid_disk) 4073 continue; 4074 sysfs_unlink_rdev(mddev, rdev); 4075 } 4076 rdev_for_each(rdev, mddev) { 4077 if (rdev->raid_disk < 0) 4078 continue; 4079 if (rdev->new_raid_disk == rdev->raid_disk) 4080 continue; 4081 rdev->raid_disk = rdev->new_raid_disk; 4082 if (rdev->raid_disk < 0) 4083 clear_bit(In_sync, &rdev->flags); 4084 else { 4085 if (sysfs_link_rdev(mddev, rdev)) 4086 pr_warn("md: cannot register rd%d for %s after level change\n", 4087 rdev->raid_disk, mdname(mddev)); 4088 } 4089 } 4090 4091 if (pers->sync_request == NULL) { 4092 /* this is now an array without redundancy, so 4093 * it must always be in_sync 4094 */ 4095 mddev->in_sync = 1; 4096 del_timer_sync(&mddev->safemode_timer); 4097 } 4098 blk_set_stacking_limits(&mddev->queue->limits); 4099 pers->run(mddev); 4100 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4101 mddev_resume(mddev); 4102 if (!mddev->thread) 4103 md_update_sb(mddev, 1); 4104 sysfs_notify_dirent_safe(mddev->sysfs_level); 4105 md_new_event(); 4106 rv = len; 4107 out_unlock: 4108 mddev_unlock(mddev); 4109 return rv; 4110 } 4111 4112 static struct md_sysfs_entry md_level = 4113 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4114 4115 static ssize_t 4116 layout_show(struct mddev *mddev, char *page) 4117 { 4118 /* just a number, not meaningful for all levels */ 4119 if (mddev->reshape_position != MaxSector && 4120 mddev->layout != mddev->new_layout) 4121 return sprintf(page, "%d (%d)\n", 4122 mddev->new_layout, mddev->layout); 4123 return sprintf(page, "%d\n", mddev->layout); 4124 } 4125 4126 static ssize_t 4127 layout_store(struct mddev *mddev, const char *buf, size_t len) 4128 { 4129 unsigned int n; 4130 int err; 4131 4132 err = kstrtouint(buf, 10, &n); 4133 if (err < 0) 4134 return err; 4135 err = mddev_lock(mddev); 4136 if (err) 4137 return err; 4138 4139 if (mddev->pers) { 4140 if (mddev->pers->check_reshape == NULL) 4141 err = -EBUSY; 4142 else if (!md_is_rdwr(mddev)) 4143 err = -EROFS; 4144 else { 4145 mddev->new_layout = n; 4146 err = mddev->pers->check_reshape(mddev); 4147 if (err) 4148 mddev->new_layout = mddev->layout; 4149 } 4150 } else { 4151 mddev->new_layout = n; 4152 if (mddev->reshape_position == MaxSector) 4153 mddev->layout = n; 4154 } 4155 mddev_unlock(mddev); 4156 return err ?: len; 4157 } 4158 static struct md_sysfs_entry md_layout = 4159 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4160 4161 static ssize_t 4162 
raid_disks_show(struct mddev *mddev, char *page) 4163 { 4164 if (mddev->raid_disks == 0) 4165 return 0; 4166 if (mddev->reshape_position != MaxSector && 4167 mddev->delta_disks != 0) 4168 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4169 mddev->raid_disks - mddev->delta_disks); 4170 return sprintf(page, "%d\n", mddev->raid_disks); 4171 } 4172 4173 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4174 4175 static ssize_t 4176 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4177 { 4178 unsigned int n; 4179 int err; 4180 4181 err = kstrtouint(buf, 10, &n); 4182 if (err < 0) 4183 return err; 4184 4185 err = mddev_lock(mddev); 4186 if (err) 4187 return err; 4188 if (mddev->pers) 4189 err = update_raid_disks(mddev, n); 4190 else if (mddev->reshape_position != MaxSector) { 4191 struct md_rdev *rdev; 4192 int olddisks = mddev->raid_disks - mddev->delta_disks; 4193 4194 err = -EINVAL; 4195 rdev_for_each(rdev, mddev) { 4196 if (olddisks < n && 4197 rdev->data_offset < rdev->new_data_offset) 4198 goto out_unlock; 4199 if (olddisks > n && 4200 rdev->data_offset > rdev->new_data_offset) 4201 goto out_unlock; 4202 } 4203 err = 0; 4204 mddev->delta_disks = n - olddisks; 4205 mddev->raid_disks = n; 4206 mddev->reshape_backwards = (mddev->delta_disks < 0); 4207 } else 4208 mddev->raid_disks = n; 4209 out_unlock: 4210 mddev_unlock(mddev); 4211 return err ? err : len; 4212 } 4213 static struct md_sysfs_entry md_raid_disks = 4214 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4215 4216 static ssize_t 4217 uuid_show(struct mddev *mddev, char *page) 4218 { 4219 return sprintf(page, "%pU\n", mddev->uuid); 4220 } 4221 static struct md_sysfs_entry md_uuid = 4222 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4223 4224 static ssize_t 4225 chunk_size_show(struct mddev *mddev, char *page) 4226 { 4227 if (mddev->reshape_position != MaxSector && 4228 mddev->chunk_sectors != mddev->new_chunk_sectors) 4229 return sprintf(page, "%d (%d)\n", 4230 mddev->new_chunk_sectors << 9, 4231 mddev->chunk_sectors << 9); 4232 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4233 } 4234 4235 static ssize_t 4236 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4237 { 4238 unsigned long n; 4239 int err; 4240 4241 err = kstrtoul(buf, 10, &n); 4242 if (err < 0) 4243 return err; 4244 4245 err = mddev_lock(mddev); 4246 if (err) 4247 return err; 4248 if (mddev->pers) { 4249 if (mddev->pers->check_reshape == NULL) 4250 err = -EBUSY; 4251 else if (!md_is_rdwr(mddev)) 4252 err = -EROFS; 4253 else { 4254 mddev->new_chunk_sectors = n >> 9; 4255 err = mddev->pers->check_reshape(mddev); 4256 if (err) 4257 mddev->new_chunk_sectors = mddev->chunk_sectors; 4258 } 4259 } else { 4260 mddev->new_chunk_sectors = n >> 9; 4261 if (mddev->reshape_position == MaxSector) 4262 mddev->chunk_sectors = n >> 9; 4263 } 4264 mddev_unlock(mddev); 4265 return err ?: len; 4266 } 4267 static struct md_sysfs_entry md_chunk_size = 4268 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4269 4270 static ssize_t 4271 resync_start_show(struct mddev *mddev, char *page) 4272 { 4273 if (mddev->recovery_cp == MaxSector) 4274 return sprintf(page, "none\n"); 4275 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4276 } 4277 4278 static ssize_t 4279 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4280 { 4281 unsigned long long n; 4282 int err; 4283 4284 if (cmd_match(buf, "none")) 4285 n = MaxSector; 4286 else { 4287 err = 
kstrtoull(buf, 10, &n); 4288 if (err < 0) 4289 return err; 4290 if (n != (sector_t)n) 4291 return -EINVAL; 4292 } 4293 4294 err = mddev_lock(mddev); 4295 if (err) 4296 return err; 4297 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4298 err = -EBUSY; 4299 4300 if (!err) { 4301 mddev->recovery_cp = n; 4302 if (mddev->pers) 4303 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4304 } 4305 mddev_unlock(mddev); 4306 return err ?: len; 4307 } 4308 static struct md_sysfs_entry md_resync_start = 4309 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4310 resync_start_show, resync_start_store); 4311 4312 /* 4313 * The array state can be: 4314 * 4315 * clear 4316 * No devices, no size, no level 4317 * Equivalent to STOP_ARRAY ioctl 4318 * inactive 4319 * May have some settings, but array is not active 4320 * all IO results in error 4321 * When written, doesn't tear down array, but just stops it 4322 * suspended (not supported yet) 4323 * All IO requests will block. The array can be reconfigured. 4324 * Writing this, if accepted, will block until array is quiescent 4325 * readonly 4326 * no resync can happen. no superblocks get written. 4327 * write requests fail 4328 * read-auto 4329 * like readonly, but behaves like 'clean' on a write request. 4330 * 4331 * clean - no pending writes, but otherwise active. 4332 * When written to inactive array, starts without resync 4333 * If a write request arrives then 4334 * if metadata is known, mark 'dirty' and switch to 'active'. 4335 * if not known, block and switch to write-pending 4336 * If written to an active array that has pending writes, then fails. 4337 * active 4338 * fully active: IO and resync can be happening. 4339 * When written to inactive array, starts with resync 4340 * 4341 * write-pending 4342 * clean, but writes are blocked waiting for 'active' to be written. 4343 * 4344 * active-idle 4345 * like active, but no writes have been seen for a while (100msec). 4346 * 4347 * broken 4348 * Array is failed. It's useful because mounted-arrays aren't stopped 4349 * when array is failed, so this state will at least alert the user that 4350 * something is wrong. 
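 *
 * The state is exposed through /sys/block/mdX/md/array_state; writing one
 * of the names above (where permitted) requests that transition, e.g.
 * writing "readonly" makes write requests fail and "clear" tears the
 * array down.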
4351 */ 4352 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4353 write_pending, active_idle, broken, bad_word}; 4354 static char *array_states[] = { 4355 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4356 "write-pending", "active-idle", "broken", NULL }; 4357 4358 static int match_word(const char *word, char **list) 4359 { 4360 int n; 4361 for (n=0; list[n]; n++) 4362 if (cmd_match(word, list[n])) 4363 break; 4364 return n; 4365 } 4366 4367 static ssize_t 4368 array_state_show(struct mddev *mddev, char *page) 4369 { 4370 enum array_state st = inactive; 4371 4372 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4373 switch(mddev->ro) { 4374 case MD_RDONLY: 4375 st = readonly; 4376 break; 4377 case MD_AUTO_READ: 4378 st = read_auto; 4379 break; 4380 case MD_RDWR: 4381 spin_lock(&mddev->lock); 4382 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4383 st = write_pending; 4384 else if (mddev->in_sync) 4385 st = clean; 4386 else if (mddev->safemode) 4387 st = active_idle; 4388 else 4389 st = active; 4390 spin_unlock(&mddev->lock); 4391 } 4392 4393 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4394 st = broken; 4395 } else { 4396 if (list_empty(&mddev->disks) && 4397 mddev->raid_disks == 0 && 4398 mddev->dev_sectors == 0) 4399 st = clear; 4400 else 4401 st = inactive; 4402 } 4403 return sprintf(page, "%s\n", array_states[st]); 4404 } 4405 4406 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4407 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4408 static int restart_array(struct mddev *mddev); 4409 4410 static ssize_t 4411 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4412 { 4413 int err = 0; 4414 enum array_state st = match_word(buf, array_states); 4415 4416 if (mddev->pers && (st == active || st == clean) && 4417 mddev->ro != MD_RDONLY) { 4418 /* don't take reconfig_mutex when toggling between 4419 * clean and active 4420 */ 4421 spin_lock(&mddev->lock); 4422 if (st == active) { 4423 restart_array(mddev); 4424 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4425 md_wakeup_thread(mddev->thread); 4426 wake_up(&mddev->sb_wait); 4427 } else /* st == clean */ { 4428 restart_array(mddev); 4429 if (!set_in_sync(mddev)) 4430 err = -EBUSY; 4431 } 4432 if (!err) 4433 sysfs_notify_dirent_safe(mddev->sysfs_state); 4434 spin_unlock(&mddev->lock); 4435 return err ?: len; 4436 } 4437 err = mddev_lock(mddev); 4438 if (err) 4439 return err; 4440 err = -EINVAL; 4441 switch(st) { 4442 case bad_word: 4443 break; 4444 case clear: 4445 /* stopping an active array */ 4446 err = do_md_stop(mddev, 0, NULL); 4447 break; 4448 case inactive: 4449 /* stopping an active array */ 4450 if (mddev->pers) 4451 err = do_md_stop(mddev, 2, NULL); 4452 else 4453 err = 0; /* already inactive */ 4454 break; 4455 case suspended: 4456 break; /* not supported yet */ 4457 case readonly: 4458 if (mddev->pers) 4459 err = md_set_readonly(mddev, NULL); 4460 else { 4461 mddev->ro = MD_RDONLY; 4462 set_disk_ro(mddev->gendisk, 1); 4463 err = do_md_run(mddev); 4464 } 4465 break; 4466 case read_auto: 4467 if (mddev->pers) { 4468 if (md_is_rdwr(mddev)) 4469 err = md_set_readonly(mddev, NULL); 4470 else if (mddev->ro == MD_RDONLY) 4471 err = restart_array(mddev); 4472 if (err == 0) { 4473 mddev->ro = MD_AUTO_READ; 4474 set_disk_ro(mddev->gendisk, 0); 4475 } 4476 } else { 4477 mddev->ro = MD_AUTO_READ; 4478 err = do_md_run(mddev); 4479 } 4480 break; 4481 case clean: 4482 if 
(mddev->pers) { 4483 err = restart_array(mddev); 4484 if (err) 4485 break; 4486 spin_lock(&mddev->lock); 4487 if (!set_in_sync(mddev)) 4488 err = -EBUSY; 4489 spin_unlock(&mddev->lock); 4490 } else 4491 err = -EINVAL; 4492 break; 4493 case active: 4494 if (mddev->pers) { 4495 err = restart_array(mddev); 4496 if (err) 4497 break; 4498 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4499 wake_up(&mddev->sb_wait); 4500 err = 0; 4501 } else { 4502 mddev->ro = MD_RDWR; 4503 set_disk_ro(mddev->gendisk, 0); 4504 err = do_md_run(mddev); 4505 } 4506 break; 4507 case write_pending: 4508 case active_idle: 4509 case broken: 4510 /* these cannot be set */ 4511 break; 4512 } 4513 4514 if (!err) { 4515 if (mddev->hold_active == UNTIL_IOCTL) 4516 mddev->hold_active = 0; 4517 sysfs_notify_dirent_safe(mddev->sysfs_state); 4518 } 4519 mddev_unlock(mddev); 4520 return err ?: len; 4521 } 4522 static struct md_sysfs_entry md_array_state = 4523 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4524 4525 static ssize_t 4526 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4527 return sprintf(page, "%d\n", 4528 atomic_read(&mddev->max_corr_read_errors)); 4529 } 4530 4531 static ssize_t 4532 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4533 { 4534 unsigned int n; 4535 int rv; 4536 4537 rv = kstrtouint(buf, 10, &n); 4538 if (rv < 0) 4539 return rv; 4540 if (n > INT_MAX) 4541 return -EINVAL; 4542 atomic_set(&mddev->max_corr_read_errors, n); 4543 return len; 4544 } 4545 4546 static struct md_sysfs_entry max_corr_read_errors = 4547 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4548 max_corrected_read_errors_store); 4549 4550 static ssize_t 4551 null_show(struct mddev *mddev, char *page) 4552 { 4553 return -EINVAL; 4554 } 4555 4556 static ssize_t 4557 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4558 { 4559 /* buf must be %d:%d\n? giving major and minor numbers */ 4560 /* The new device is added to the array. 4561 * If the array has a persistent superblock, we read the 4562 * superblock to initialise info and check validity. 4563 * Otherwise, only checking done is that in bind_rdev_to_array, 4564 * which mainly checks size. 
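	 * For example, "echo 8:16 > /sys/block/mdX/md/new_dev" adds the
	 * device with major 8, minor 16 (typically /dev/sdb).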
4565 */ 4566 char *e; 4567 int major = simple_strtoul(buf, &e, 10); 4568 int minor; 4569 dev_t dev; 4570 struct md_rdev *rdev; 4571 int err; 4572 4573 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4574 return -EINVAL; 4575 minor = simple_strtoul(e+1, &e, 10); 4576 if (*e && *e != '\n') 4577 return -EINVAL; 4578 dev = MKDEV(major, minor); 4579 if (major != MAJOR(dev) || 4580 minor != MINOR(dev)) 4581 return -EOVERFLOW; 4582 4583 err = mddev_lock(mddev); 4584 if (err) 4585 return err; 4586 if (mddev->persistent) { 4587 rdev = md_import_device(dev, mddev->major_version, 4588 mddev->minor_version); 4589 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4590 struct md_rdev *rdev0 4591 = list_entry(mddev->disks.next, 4592 struct md_rdev, same_set); 4593 err = super_types[mddev->major_version] 4594 .load_super(rdev, rdev0, mddev->minor_version); 4595 if (err < 0) 4596 goto out; 4597 } 4598 } else if (mddev->external) 4599 rdev = md_import_device(dev, -2, -1); 4600 else 4601 rdev = md_import_device(dev, -1, -1); 4602 4603 if (IS_ERR(rdev)) { 4604 mddev_unlock(mddev); 4605 return PTR_ERR(rdev); 4606 } 4607 err = bind_rdev_to_array(rdev, mddev); 4608 out: 4609 if (err) 4610 export_rdev(rdev, mddev); 4611 mddev_unlock(mddev); 4612 if (!err) 4613 md_new_event(); 4614 return err ? err : len; 4615 } 4616 4617 static struct md_sysfs_entry md_new_device = 4618 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4619 4620 static ssize_t 4621 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4622 { 4623 char *end; 4624 unsigned long chunk, end_chunk; 4625 int err; 4626 4627 err = mddev_lock(mddev); 4628 if (err) 4629 return err; 4630 if (!mddev->bitmap) 4631 goto out; 4632 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4633 while (*buf) { 4634 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4635 if (buf == end) break; 4636 if (*end == '-') { /* range */ 4637 buf = end + 1; 4638 end_chunk = simple_strtoul(buf, &end, 0); 4639 if (buf == end) break; 4640 } 4641 if (*end && !isspace(*end)) break; 4642 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4643 buf = skip_spaces(end); 4644 } 4645 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4646 out: 4647 mddev_unlock(mddev); 4648 return len; 4649 } 4650 4651 static struct md_sysfs_entry md_bitmap = 4652 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4653 4654 static ssize_t 4655 size_show(struct mddev *mddev, char *page) 4656 { 4657 return sprintf(page, "%llu\n", 4658 (unsigned long long)mddev->dev_sectors / 2); 4659 } 4660 4661 static int update_size(struct mddev *mddev, sector_t num_sectors); 4662 4663 static ssize_t 4664 size_store(struct mddev *mddev, const char *buf, size_t len) 4665 { 4666 /* If array is inactive, we can reduce the component size, but 4667 * not increase it (except from 0). 4668 * If array is active, we can try an on-line resize 4669 */ 4670 sector_t sectors; 4671 int err = strict_blocks_to_sectors(buf, §ors); 4672 4673 if (err < 0) 4674 return err; 4675 err = mddev_lock(mddev); 4676 if (err) 4677 return err; 4678 if (mddev->pers) { 4679 err = update_size(mddev, sectors); 4680 if (err == 0) 4681 md_update_sb(mddev, 1); 4682 } else { 4683 if (mddev->dev_sectors == 0 || 4684 mddev->dev_sectors > sectors) 4685 mddev->dev_sectors = sectors; 4686 else 4687 err = -ENOSPC; 4688 } 4689 mddev_unlock(mddev); 4690 return err ? 
err : len; 4691 } 4692 4693 static struct md_sysfs_entry md_size = 4694 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4695 4696 /* Metadata version. 4697 * This is one of 4698 * 'none' for arrays with no metadata (good luck...) 4699 * 'external' for arrays with externally managed metadata, 4700 * or N.M for internally known formats 4701 */ 4702 static ssize_t 4703 metadata_show(struct mddev *mddev, char *page) 4704 { 4705 if (mddev->persistent) 4706 return sprintf(page, "%d.%d\n", 4707 mddev->major_version, mddev->minor_version); 4708 else if (mddev->external) 4709 return sprintf(page, "external:%s\n", mddev->metadata_type); 4710 else 4711 return sprintf(page, "none\n"); 4712 } 4713 4714 static ssize_t 4715 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4716 { 4717 int major, minor; 4718 char *e; 4719 int err; 4720 /* Changing the details of 'external' metadata is 4721 * always permitted. Otherwise there must be 4722 * no devices attached to the array. 4723 */ 4724 4725 err = mddev_lock(mddev); 4726 if (err) 4727 return err; 4728 err = -EBUSY; 4729 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4730 ; 4731 else if (!list_empty(&mddev->disks)) 4732 goto out_unlock; 4733 4734 err = 0; 4735 if (cmd_match(buf, "none")) { 4736 mddev->persistent = 0; 4737 mddev->external = 0; 4738 mddev->major_version = 0; 4739 mddev->minor_version = 90; 4740 goto out_unlock; 4741 } 4742 if (strncmp(buf, "external:", 9) == 0) { 4743 size_t namelen = len-9; 4744 if (namelen >= sizeof(mddev->metadata_type)) 4745 namelen = sizeof(mddev->metadata_type)-1; 4746 strncpy(mddev->metadata_type, buf+9, namelen); 4747 mddev->metadata_type[namelen] = 0; 4748 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4749 mddev->metadata_type[--namelen] = 0; 4750 mddev->persistent = 0; 4751 mddev->external = 1; 4752 mddev->major_version = 0; 4753 mddev->minor_version = 90; 4754 goto out_unlock; 4755 } 4756 major = simple_strtoul(buf, &e, 10); 4757 err = -EINVAL; 4758 if (e==buf || *e != '.') 4759 goto out_unlock; 4760 buf = e+1; 4761 minor = simple_strtoul(buf, &e, 10); 4762 if (e==buf || (*e && *e != '\n') ) 4763 goto out_unlock; 4764 err = -ENOENT; 4765 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4766 goto out_unlock; 4767 mddev->major_version = major; 4768 mddev->minor_version = minor; 4769 mddev->persistent = 1; 4770 mddev->external = 0; 4771 err = 0; 4772 out_unlock: 4773 mddev_unlock(mddev); 4774 return err ?: len; 4775 } 4776 4777 static struct md_sysfs_entry md_metadata = 4778 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4779 4780 static ssize_t 4781 action_show(struct mddev *mddev, char *page) 4782 { 4783 char *type = "idle"; 4784 unsigned long recovery = mddev->recovery; 4785 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4786 type = "frozen"; 4787 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4788 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4789 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4790 type = "reshape"; 4791 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4792 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4793 type = "resync"; 4794 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4795 type = "check"; 4796 else 4797 type = "repair"; 4798 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4799 type = "recover"; 4800 else if (mddev->reshape_position != MaxSector) 4801 type = "reshape"; 4802 } 4803 return sprintf(page, "%s\n", type); 4804 } 4805 4806 static void 
stop_sync_thread(struct mddev *mddev) 4807 { 4808 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4809 return; 4810 4811 if (mddev_lock(mddev)) 4812 return; 4813 4814 /* 4815 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4816 * held. 4817 */ 4818 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4819 mddev_unlock(mddev); 4820 return; 4821 } 4822 4823 if (work_pending(&mddev->del_work)) 4824 flush_workqueue(md_misc_wq); 4825 4826 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4827 /* 4828 * Thread might be blocked waiting for metadata update which will now 4829 * never happen 4830 */ 4831 md_wakeup_thread_directly(mddev->sync_thread); 4832 4833 mddev_unlock(mddev); 4834 } 4835 4836 static void idle_sync_thread(struct mddev *mddev) 4837 { 4838 int sync_seq = atomic_read(&mddev->sync_seq); 4839 4840 mutex_lock(&mddev->sync_mutex); 4841 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4842 stop_sync_thread(mddev); 4843 4844 wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4845 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4846 4847 mutex_unlock(&mddev->sync_mutex); 4848 } 4849 4850 static void frozen_sync_thread(struct mddev *mddev) 4851 { 4852 mutex_lock(&mddev->sync_mutex); 4853 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4854 stop_sync_thread(mddev); 4855 4856 wait_event(resync_wait, mddev->sync_thread == NULL && 4857 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4858 4859 mutex_unlock(&mddev->sync_mutex); 4860 } 4861 4862 static ssize_t 4863 action_store(struct mddev *mddev, const char *page, size_t len) 4864 { 4865 if (!mddev->pers || !mddev->pers->sync_request) 4866 return -EINVAL; 4867 4868 4869 if (cmd_match(page, "idle")) 4870 idle_sync_thread(mddev); 4871 else if (cmd_match(page, "frozen")) 4872 frozen_sync_thread(mddev); 4873 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4874 return -EBUSY; 4875 else if (cmd_match(page, "resync")) 4876 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4877 else if (cmd_match(page, "recover")) { 4878 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4879 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4880 } else if (cmd_match(page, "reshape")) { 4881 int err; 4882 if (mddev->pers->start_reshape == NULL) 4883 return -EINVAL; 4884 err = mddev_lock(mddev); 4885 if (!err) { 4886 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4887 err = -EBUSY; 4888 } else if (mddev->reshape_position == MaxSector || 4889 mddev->pers->check_reshape == NULL || 4890 mddev->pers->check_reshape(mddev)) { 4891 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4892 err = mddev->pers->start_reshape(mddev); 4893 } else { 4894 /* 4895 * If reshape is still in progress, and 4896 * md_check_recovery() can continue to reshape, 4897 * don't restart reshape because data can be 4898 * corrupted for raid456. 
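				 * Just clear MD_RECOVERY_FROZEN and let
				 * md_check_recovery() carry on with the
				 * reshape that is already running.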
4899 */ 4900 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4901 } 4902 mddev_unlock(mddev); 4903 } 4904 if (err) 4905 return err; 4906 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4907 } else { 4908 if (cmd_match(page, "check")) 4909 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4910 else if (!cmd_match(page, "repair")) 4911 return -EINVAL; 4912 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4913 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4914 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4915 } 4916 if (mddev->ro == MD_AUTO_READ) { 4917 /* A write to sync_action is enough to justify 4918 * canceling read-auto mode 4919 */ 4920 mddev->ro = MD_RDWR; 4921 md_wakeup_thread(mddev->sync_thread); 4922 } 4923 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4924 md_wakeup_thread(mddev->thread); 4925 sysfs_notify_dirent_safe(mddev->sysfs_action); 4926 return len; 4927 } 4928 4929 static struct md_sysfs_entry md_scan_mode = 4930 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4931 4932 static ssize_t 4933 last_sync_action_show(struct mddev *mddev, char *page) 4934 { 4935 return sprintf(page, "%s\n", mddev->last_sync_action); 4936 } 4937 4938 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4939 4940 static ssize_t 4941 mismatch_cnt_show(struct mddev *mddev, char *page) 4942 { 4943 return sprintf(page, "%llu\n", 4944 (unsigned long long) 4945 atomic64_read(&mddev->resync_mismatches)); 4946 } 4947 4948 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4949 4950 static ssize_t 4951 sync_min_show(struct mddev *mddev, char *page) 4952 { 4953 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4954 mddev->sync_speed_min ? "local": "system"); 4955 } 4956 4957 static ssize_t 4958 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4959 { 4960 unsigned int min; 4961 int rv; 4962 4963 if (strncmp(buf, "system", 6)==0) { 4964 min = 0; 4965 } else { 4966 rv = kstrtouint(buf, 10, &min); 4967 if (rv < 0) 4968 return rv; 4969 if (min == 0) 4970 return -EINVAL; 4971 } 4972 mddev->sync_speed_min = min; 4973 return len; 4974 } 4975 4976 static struct md_sysfs_entry md_sync_min = 4977 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4978 4979 static ssize_t 4980 sync_max_show(struct mddev *mddev, char *page) 4981 { 4982 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4983 mddev->sync_speed_max ? 
"local": "system"); 4984 } 4985 4986 static ssize_t 4987 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4988 { 4989 unsigned int max; 4990 int rv; 4991 4992 if (strncmp(buf, "system", 6)==0) { 4993 max = 0; 4994 } else { 4995 rv = kstrtouint(buf, 10, &max); 4996 if (rv < 0) 4997 return rv; 4998 if (max == 0) 4999 return -EINVAL; 5000 } 5001 mddev->sync_speed_max = max; 5002 return len; 5003 } 5004 5005 static struct md_sysfs_entry md_sync_max = 5006 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5007 5008 static ssize_t 5009 degraded_show(struct mddev *mddev, char *page) 5010 { 5011 return sprintf(page, "%d\n", mddev->degraded); 5012 } 5013 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5014 5015 static ssize_t 5016 sync_force_parallel_show(struct mddev *mddev, char *page) 5017 { 5018 return sprintf(page, "%d\n", mddev->parallel_resync); 5019 } 5020 5021 static ssize_t 5022 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5023 { 5024 long n; 5025 5026 if (kstrtol(buf, 10, &n)) 5027 return -EINVAL; 5028 5029 if (n != 0 && n != 1) 5030 return -EINVAL; 5031 5032 mddev->parallel_resync = n; 5033 5034 if (mddev->sync_thread) 5035 wake_up(&resync_wait); 5036 5037 return len; 5038 } 5039 5040 /* force parallel resync, even with shared block devices */ 5041 static struct md_sysfs_entry md_sync_force_parallel = 5042 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5043 sync_force_parallel_show, sync_force_parallel_store); 5044 5045 static ssize_t 5046 sync_speed_show(struct mddev *mddev, char *page) 5047 { 5048 unsigned long resync, dt, db; 5049 if (mddev->curr_resync == MD_RESYNC_NONE) 5050 return sprintf(page, "none\n"); 5051 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5052 dt = (jiffies - mddev->resync_mark) / HZ; 5053 if (!dt) dt++; 5054 db = resync - mddev->resync_mark_cnt; 5055 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5056 } 5057 5058 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5059 5060 static ssize_t 5061 sync_completed_show(struct mddev *mddev, char *page) 5062 { 5063 unsigned long long max_sectors, resync; 5064 5065 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5066 return sprintf(page, "none\n"); 5067 5068 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5069 mddev->curr_resync == MD_RESYNC_DELAYED) 5070 return sprintf(page, "delayed\n"); 5071 5072 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5073 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5074 max_sectors = mddev->resync_max_sectors; 5075 else 5076 max_sectors = mddev->dev_sectors; 5077 5078 resync = mddev->curr_resync_completed; 5079 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5080 } 5081 5082 static struct md_sysfs_entry md_sync_completed = 5083 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5084 5085 static ssize_t 5086 min_sync_show(struct mddev *mddev, char *page) 5087 { 5088 return sprintf(page, "%llu\n", 5089 (unsigned long long)mddev->resync_min); 5090 } 5091 static ssize_t 5092 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5093 { 5094 unsigned long long min; 5095 int err; 5096 5097 if (kstrtoull(buf, 10, &min)) 5098 return -EINVAL; 5099 5100 spin_lock(&mddev->lock); 5101 err = -EINVAL; 5102 if (min > mddev->resync_max) 5103 goto out_unlock; 5104 5105 err = -EBUSY; 5106 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5107 goto out_unlock; 5108 5109 /* Round down to multiple of 4K for safety */ 5110 
mddev->resync_min = round_down(min, 8); 5111 err = 0; 5112 5113 out_unlock: 5114 spin_unlock(&mddev->lock); 5115 return err ?: len; 5116 } 5117 5118 static struct md_sysfs_entry md_min_sync = 5119 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5120 5121 static ssize_t 5122 max_sync_show(struct mddev *mddev, char *page) 5123 { 5124 if (mddev->resync_max == MaxSector) 5125 return sprintf(page, "max\n"); 5126 else 5127 return sprintf(page, "%llu\n", 5128 (unsigned long long)mddev->resync_max); 5129 } 5130 static ssize_t 5131 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5132 { 5133 int err; 5134 spin_lock(&mddev->lock); 5135 if (strncmp(buf, "max", 3) == 0) 5136 mddev->resync_max = MaxSector; 5137 else { 5138 unsigned long long max; 5139 int chunk; 5140 5141 err = -EINVAL; 5142 if (kstrtoull(buf, 10, &max)) 5143 goto out_unlock; 5144 if (max < mddev->resync_min) 5145 goto out_unlock; 5146 5147 err = -EBUSY; 5148 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5149 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5150 goto out_unlock; 5151 5152 /* Must be a multiple of chunk_size */ 5153 chunk = mddev->chunk_sectors; 5154 if (chunk) { 5155 sector_t temp = max; 5156 5157 err = -EINVAL; 5158 if (sector_div(temp, chunk)) 5159 goto out_unlock; 5160 } 5161 mddev->resync_max = max; 5162 } 5163 wake_up(&mddev->recovery_wait); 5164 err = 0; 5165 out_unlock: 5166 spin_unlock(&mddev->lock); 5167 return err ?: len; 5168 } 5169 5170 static struct md_sysfs_entry md_max_sync = 5171 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5172 5173 static ssize_t 5174 suspend_lo_show(struct mddev *mddev, char *page) 5175 { 5176 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 5177 } 5178 5179 static ssize_t 5180 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5181 { 5182 unsigned long long new; 5183 int err; 5184 5185 err = kstrtoull(buf, 10, &new); 5186 if (err < 0) 5187 return err; 5188 if (new != (sector_t)new) 5189 return -EINVAL; 5190 5191 err = mddev_lock(mddev); 5192 if (err) 5193 return err; 5194 err = -EINVAL; 5195 if (mddev->pers == NULL || 5196 mddev->pers->quiesce == NULL) 5197 goto unlock; 5198 mddev_suspend(mddev); 5199 mddev->suspend_lo = new; 5200 mddev_resume(mddev); 5201 5202 err = 0; 5203 unlock: 5204 mddev_unlock(mddev); 5205 return err ?: len; 5206 } 5207 static struct md_sysfs_entry md_suspend_lo = 5208 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5209 5210 static ssize_t 5211 suspend_hi_show(struct mddev *mddev, char *page) 5212 { 5213 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 5214 } 5215 5216 static ssize_t 5217 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5218 { 5219 unsigned long long new; 5220 int err; 5221 5222 err = kstrtoull(buf, 10, &new); 5223 if (err < 0) 5224 return err; 5225 if (new != (sector_t)new) 5226 return -EINVAL; 5227 5228 err = mddev_lock(mddev); 5229 if (err) 5230 return err; 5231 err = -EINVAL; 5232 if (mddev->pers == NULL) 5233 goto unlock; 5234 5235 mddev_suspend(mddev); 5236 mddev->suspend_hi = new; 5237 mddev_resume(mddev); 5238 5239 err = 0; 5240 unlock: 5241 mddev_unlock(mddev); 5242 return err ?: len; 5243 } 5244 static struct md_sysfs_entry md_suspend_hi = 5245 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5246 5247 static ssize_t 5248 reshape_position_show(struct mddev *mddev, char *page) 5249 { 5250 if (mddev->reshape_position != MaxSector) 5251 return 
sprintf(page, "%llu\n", 5252 (unsigned long long)mddev->reshape_position); 5253 strcpy(page, "none\n"); 5254 return 5; 5255 } 5256 5257 static ssize_t 5258 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5259 { 5260 struct md_rdev *rdev; 5261 unsigned long long new; 5262 int err; 5263 5264 err = kstrtoull(buf, 10, &new); 5265 if (err < 0) 5266 return err; 5267 if (new != (sector_t)new) 5268 return -EINVAL; 5269 err = mddev_lock(mddev); 5270 if (err) 5271 return err; 5272 err = -EBUSY; 5273 if (mddev->pers) 5274 goto unlock; 5275 mddev->reshape_position = new; 5276 mddev->delta_disks = 0; 5277 mddev->reshape_backwards = 0; 5278 mddev->new_level = mddev->level; 5279 mddev->new_layout = mddev->layout; 5280 mddev->new_chunk_sectors = mddev->chunk_sectors; 5281 rdev_for_each(rdev, mddev) 5282 rdev->new_data_offset = rdev->data_offset; 5283 err = 0; 5284 unlock: 5285 mddev_unlock(mddev); 5286 return err ?: len; 5287 } 5288 5289 static struct md_sysfs_entry md_reshape_position = 5290 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5291 reshape_position_store); 5292 5293 static ssize_t 5294 reshape_direction_show(struct mddev *mddev, char *page) 5295 { 5296 return sprintf(page, "%s\n", 5297 mddev->reshape_backwards ? "backwards" : "forwards"); 5298 } 5299 5300 static ssize_t 5301 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5302 { 5303 int backwards = 0; 5304 int err; 5305 5306 if (cmd_match(buf, "forwards")) 5307 backwards = 0; 5308 else if (cmd_match(buf, "backwards")) 5309 backwards = 1; 5310 else 5311 return -EINVAL; 5312 if (mddev->reshape_backwards == backwards) 5313 return len; 5314 5315 err = mddev_lock(mddev); 5316 if (err) 5317 return err; 5318 /* check if we are allowed to change */ 5319 if (mddev->delta_disks) 5320 err = -EBUSY; 5321 else if (mddev->persistent && 5322 mddev->major_version == 0) 5323 err = -EINVAL; 5324 else 5325 mddev->reshape_backwards = backwards; 5326 mddev_unlock(mddev); 5327 return err ?: len; 5328 } 5329 5330 static struct md_sysfs_entry md_reshape_direction = 5331 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5332 reshape_direction_store); 5333 5334 static ssize_t 5335 array_size_show(struct mddev *mddev, char *page) 5336 { 5337 if (mddev->external_size) 5338 return sprintf(page, "%llu\n", 5339 (unsigned long long)mddev->array_sectors/2); 5340 else 5341 return sprintf(page, "default\n"); 5342 } 5343 5344 static ssize_t 5345 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5346 { 5347 sector_t sectors; 5348 int err; 5349 5350 err = mddev_lock(mddev); 5351 if (err) 5352 return err; 5353 5354 /* cluster raid doesn't support change array_sectors */ 5355 if (mddev_is_clustered(mddev)) { 5356 mddev_unlock(mddev); 5357 return -EINVAL; 5358 } 5359 5360 if (strncmp(buf, "default", 7) == 0) { 5361 if (mddev->pers) 5362 sectors = mddev->pers->size(mddev, 0, 0); 5363 else 5364 sectors = mddev->array_sectors; 5365 5366 mddev->external_size = 0; 5367 } else { 5368 if (strict_blocks_to_sectors(buf, §ors) < 0) 5369 err = -EINVAL; 5370 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5371 err = -E2BIG; 5372 else 5373 mddev->external_size = 1; 5374 } 5375 5376 if (!err) { 5377 mddev->array_sectors = sectors; 5378 if (mddev->pers) 5379 set_capacity_and_notify(mddev->gendisk, 5380 mddev->array_sectors); 5381 } 5382 mddev_unlock(mddev); 5383 return err ?: len; 5384 } 5385 5386 static struct md_sysfs_entry md_array_size = 5387 __ATTR(array_size, 
S_IRUGO|S_IWUSR, array_size_show, 5388 array_size_store); 5389 5390 static ssize_t 5391 consistency_policy_show(struct mddev *mddev, char *page) 5392 { 5393 int ret; 5394 5395 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5396 ret = sprintf(page, "journal\n"); 5397 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5398 ret = sprintf(page, "ppl\n"); 5399 } else if (mddev->bitmap) { 5400 ret = sprintf(page, "bitmap\n"); 5401 } else if (mddev->pers) { 5402 if (mddev->pers->sync_request) 5403 ret = sprintf(page, "resync\n"); 5404 else 5405 ret = sprintf(page, "none\n"); 5406 } else { 5407 ret = sprintf(page, "unknown\n"); 5408 } 5409 5410 return ret; 5411 } 5412 5413 static ssize_t 5414 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5415 { 5416 int err = 0; 5417 5418 if (mddev->pers) { 5419 if (mddev->pers->change_consistency_policy) 5420 err = mddev->pers->change_consistency_policy(mddev, buf); 5421 else 5422 err = -EBUSY; 5423 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5424 set_bit(MD_HAS_PPL, &mddev->flags); 5425 } else { 5426 err = -EINVAL; 5427 } 5428 5429 return err ? err : len; 5430 } 5431 5432 static struct md_sysfs_entry md_consistency_policy = 5433 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5434 consistency_policy_store); 5435 5436 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5437 { 5438 return sprintf(page, "%d\n", mddev->fail_last_dev); 5439 } 5440 5441 /* 5442 * Setting fail_last_dev to true to allow last device to be forcibly removed 5443 * from RAID1/RAID10. 5444 */ 5445 static ssize_t 5446 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5447 { 5448 int ret; 5449 bool value; 5450 5451 ret = kstrtobool(buf, &value); 5452 if (ret) 5453 return ret; 5454 5455 if (value != mddev->fail_last_dev) 5456 mddev->fail_last_dev = value; 5457 5458 return len; 5459 } 5460 static struct md_sysfs_entry md_fail_last_dev = 5461 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5462 fail_last_dev_store); 5463 5464 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5465 { 5466 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5467 return sprintf(page, "n/a\n"); 5468 else 5469 return sprintf(page, "%d\n", mddev->serialize_policy); 5470 } 5471 5472 /* 5473 * Setting serialize_policy to true to enforce write IO is not reordered 5474 * for raid1. 
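 * Toggling it briefly suspends the array while the serial info pool is
 * created or torn down.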
5475 */ 5476 static ssize_t 5477 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5478 { 5479 int err; 5480 bool value; 5481 5482 err = kstrtobool(buf, &value); 5483 if (err) 5484 return err; 5485 5486 if (value == mddev->serialize_policy) 5487 return len; 5488 5489 err = mddev_lock(mddev); 5490 if (err) 5491 return err; 5492 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5493 pr_err("md: serialize_policy is only effective for raid1\n"); 5494 err = -EINVAL; 5495 goto unlock; 5496 } 5497 5498 mddev_suspend(mddev); 5499 if (value) 5500 mddev_create_serial_pool(mddev, NULL, true); 5501 else 5502 mddev_destroy_serial_pool(mddev, NULL, true); 5503 mddev->serialize_policy = value; 5504 mddev_resume(mddev); 5505 unlock: 5506 mddev_unlock(mddev); 5507 return err ?: len; 5508 } 5509 5510 static struct md_sysfs_entry md_serialize_policy = 5511 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5512 serialize_policy_store); 5513 5514 5515 static struct attribute *md_default_attrs[] = { 5516 &md_level.attr, 5517 &md_layout.attr, 5518 &md_raid_disks.attr, 5519 &md_uuid.attr, 5520 &md_chunk_size.attr, 5521 &md_size.attr, 5522 &md_resync_start.attr, 5523 &md_metadata.attr, 5524 &md_new_device.attr, 5525 &md_safe_delay.attr, 5526 &md_array_state.attr, 5527 &md_reshape_position.attr, 5528 &md_reshape_direction.attr, 5529 &md_array_size.attr, 5530 &max_corr_read_errors.attr, 5531 &md_consistency_policy.attr, 5532 &md_fail_last_dev.attr, 5533 &md_serialize_policy.attr, 5534 NULL, 5535 }; 5536 5537 static const struct attribute_group md_default_group = { 5538 .attrs = md_default_attrs, 5539 }; 5540 5541 static struct attribute *md_redundancy_attrs[] = { 5542 &md_scan_mode.attr, 5543 &md_last_scan_mode.attr, 5544 &md_mismatches.attr, 5545 &md_sync_min.attr, 5546 &md_sync_max.attr, 5547 &md_sync_speed.attr, 5548 &md_sync_force_parallel.attr, 5549 &md_sync_completed.attr, 5550 &md_min_sync.attr, 5551 &md_max_sync.attr, 5552 &md_suspend_lo.attr, 5553 &md_suspend_hi.attr, 5554 &md_bitmap.attr, 5555 &md_degraded.attr, 5556 NULL, 5557 }; 5558 static const struct attribute_group md_redundancy_group = { 5559 .name = NULL, 5560 .attrs = md_redundancy_attrs, 5561 }; 5562 5563 static const struct attribute_group *md_attr_groups[] = { 5564 &md_default_group, 5565 &md_bitmap_group, 5566 NULL, 5567 }; 5568 5569 static ssize_t 5570 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5571 { 5572 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5573 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5574 ssize_t rv; 5575 5576 if (!entry->show) 5577 return -EIO; 5578 spin_lock(&all_mddevs_lock); 5579 if (!mddev_get(mddev)) { 5580 spin_unlock(&all_mddevs_lock); 5581 return -EBUSY; 5582 } 5583 spin_unlock(&all_mddevs_lock); 5584 5585 rv = entry->show(mddev, page); 5586 mddev_put(mddev); 5587 return rv; 5588 } 5589 5590 static ssize_t 5591 md_attr_store(struct kobject *kobj, struct attribute *attr, 5592 const char *page, size_t length) 5593 { 5594 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5595 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5596 ssize_t rv; 5597 5598 if (!entry->store) 5599 return -EIO; 5600 if (!capable(CAP_SYS_ADMIN)) 5601 return -EACCES; 5602 spin_lock(&all_mddevs_lock); 5603 if (!mddev_get(mddev)) { 5604 spin_unlock(&all_mddevs_lock); 5605 return -EBUSY; 5606 } 5607 spin_unlock(&all_mddevs_lock); 5608 rv = entry->store(mddev, page, length); 5609 
mddev_put(mddev); 5610 return rv; 5611 } 5612 5613 static void md_kobj_release(struct kobject *ko) 5614 { 5615 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5616 5617 if (mddev->sysfs_state) 5618 sysfs_put(mddev->sysfs_state); 5619 if (mddev->sysfs_level) 5620 sysfs_put(mddev->sysfs_level); 5621 5622 del_gendisk(mddev->gendisk); 5623 put_disk(mddev->gendisk); 5624 } 5625 5626 static const struct sysfs_ops md_sysfs_ops = { 5627 .show = md_attr_show, 5628 .store = md_attr_store, 5629 }; 5630 static const struct kobj_type md_ktype = { 5631 .release = md_kobj_release, 5632 .sysfs_ops = &md_sysfs_ops, 5633 .default_groups = md_attr_groups, 5634 }; 5635 5636 int mdp_major = 0; 5637 5638 static void mddev_delayed_delete(struct work_struct *ws) 5639 { 5640 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5641 5642 kobject_put(&mddev->kobj); 5643 } 5644 5645 static void no_op(struct percpu_ref *r) {} 5646 5647 int mddev_init_writes_pending(struct mddev *mddev) 5648 { 5649 if (mddev->writes_pending.percpu_count_ptr) 5650 return 0; 5651 if (percpu_ref_init(&mddev->writes_pending, no_op, 5652 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) 5653 return -ENOMEM; 5654 /* We want to start with the refcount at zero */ 5655 percpu_ref_put(&mddev->writes_pending); 5656 return 0; 5657 } 5658 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5659 5660 struct mddev *md_alloc(dev_t dev, char *name) 5661 { 5662 /* 5663 * If dev is zero, name is the name of a device to allocate with 5664 * an arbitrary minor number. It will be "md_???" 5665 * If dev is non-zero it must be a device number with a MAJOR of 5666 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5667 * the device is being created by opening a node in /dev. 5668 * If "name" is not NULL, the device is being created by 5669 * writing to /sys/module/md_mod/parameters/new_array. 5670 */ 5671 static DEFINE_MUTEX(disks_mutex); 5672 struct mddev *mddev; 5673 struct gendisk *disk; 5674 int partitioned; 5675 int shift; 5676 int unit; 5677 int error ; 5678 5679 /* 5680 * Wait for any previous instance of this device to be completely 5681 * removed (mddev_delayed_delete). 5682 */ 5683 flush_workqueue(md_misc_wq); 5684 5685 mutex_lock(&disks_mutex); 5686 mddev = mddev_alloc(dev); 5687 if (IS_ERR(mddev)) { 5688 error = PTR_ERR(mddev); 5689 goto out_unlock; 5690 } 5691 5692 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5693 shift = partitioned ? MdpMinorShift : 0; 5694 unit = MINOR(mddev->unit) >> shift; 5695 5696 if (name && !dev) { 5697 /* Need to ensure that 'name' is not a duplicate. 5698 */ 5699 struct mddev *mddev2; 5700 spin_lock(&all_mddevs_lock); 5701 5702 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5703 if (mddev2->gendisk && 5704 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5705 spin_unlock(&all_mddevs_lock); 5706 error = -EEXIST; 5707 goto out_free_mddev; 5708 } 5709 spin_unlock(&all_mddevs_lock); 5710 } 5711 if (name && dev) 5712 /* 5713 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 
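	 * UNTIL_STOP means the device is not deleted on last close but
	 * stays around until it is explicitly stopped.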
5714 */ 5715 mddev->hold_active = UNTIL_STOP; 5716 5717 error = -ENOMEM; 5718 disk = blk_alloc_disk(NUMA_NO_NODE); 5719 if (!disk) 5720 goto out_free_mddev; 5721 5722 disk->major = MAJOR(mddev->unit); 5723 disk->first_minor = unit << shift; 5724 disk->minors = 1 << shift; 5725 if (name) 5726 strcpy(disk->disk_name, name); 5727 else if (partitioned) 5728 sprintf(disk->disk_name, "md_d%d", unit); 5729 else 5730 sprintf(disk->disk_name, "md%d", unit); 5731 disk->fops = &md_fops; 5732 disk->private_data = mddev; 5733 5734 mddev->queue = disk->queue; 5735 blk_set_stacking_limits(&mddev->queue->limits); 5736 blk_queue_write_cache(mddev->queue, true, true); 5737 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5738 mddev->gendisk = disk; 5739 error = add_disk(disk); 5740 if (error) 5741 goto out_put_disk; 5742 5743 kobject_init(&mddev->kobj, &md_ktype); 5744 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5745 if (error) { 5746 /* 5747 * The disk is already live at this point. Clear the hold flag 5748 * and let mddev_put take care of the deletion, as it isn't any 5749 * different from a normal close on last release now. 5750 */ 5751 mddev->hold_active = 0; 5752 mutex_unlock(&disks_mutex); 5753 mddev_put(mddev); 5754 return ERR_PTR(error); 5755 } 5756 5757 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5758 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5759 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5760 mutex_unlock(&disks_mutex); 5761 return mddev; 5762 5763 out_put_disk: 5764 put_disk(disk); 5765 out_free_mddev: 5766 mddev_free(mddev); 5767 out_unlock: 5768 mutex_unlock(&disks_mutex); 5769 return ERR_PTR(error); 5770 } 5771 5772 static int md_alloc_and_put(dev_t dev, char *name) 5773 { 5774 struct mddev *mddev = md_alloc(dev, name); 5775 5776 if (IS_ERR(mddev)) 5777 return PTR_ERR(mddev); 5778 mddev_put(mddev); 5779 return 0; 5780 } 5781 5782 static void md_probe(dev_t dev) 5783 { 5784 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5785 return; 5786 if (create_on_open) 5787 md_alloc_and_put(dev, NULL); 5788 } 5789 5790 static int add_named_array(const char *val, const struct kernel_param *kp) 5791 { 5792 /* 5793 * val must be "md_*" or "mdNNN". 5794 * For "md_*" we allocate an array with a large free minor number, and 5795 * set the name to val. val must not already be an active name. 5796 * For "mdNNN" we allocate an array with the minor number NNN 5797 * which must not already be in use. 
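 *
 * e.g. "echo md_home > /sys/module/md_mod/parameters/new_array" creates
 * an array named md_home.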
5798 */ 5799 int len = strlen(val); 5800 char buf[DISK_NAME_LEN]; 5801 unsigned long devnum; 5802 5803 while (len && val[len-1] == '\n') 5804 len--; 5805 if (len >= DISK_NAME_LEN) 5806 return -E2BIG; 5807 strscpy(buf, val, len+1); 5808 if (strncmp(buf, "md_", 3) == 0) 5809 return md_alloc_and_put(0, buf); 5810 if (strncmp(buf, "md", 2) == 0 && 5811 isdigit(buf[2]) && 5812 kstrtoul(buf+2, 10, &devnum) == 0 && 5813 devnum <= MINORMASK) 5814 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5815 5816 return -EINVAL; 5817 } 5818 5819 static void md_safemode_timeout(struct timer_list *t) 5820 { 5821 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5822 5823 mddev->safemode = 1; 5824 if (mddev->external) 5825 sysfs_notify_dirent_safe(mddev->sysfs_state); 5826 5827 md_wakeup_thread(mddev->thread); 5828 } 5829 5830 static int start_dirty_degraded; 5831 static void active_io_release(struct percpu_ref *ref) 5832 { 5833 struct mddev *mddev = container_of(ref, struct mddev, active_io); 5834 5835 wake_up(&mddev->sb_wait); 5836 } 5837 5838 int md_run(struct mddev *mddev) 5839 { 5840 int err; 5841 struct md_rdev *rdev; 5842 struct md_personality *pers; 5843 bool nowait = true; 5844 5845 if (list_empty(&mddev->disks)) 5846 /* cannot run an array with no devices.. */ 5847 return -EINVAL; 5848 5849 if (mddev->pers) 5850 return -EBUSY; 5851 /* Cannot run until previous stop completes properly */ 5852 if (mddev->sysfs_active) 5853 return -EBUSY; 5854 5855 /* 5856 * Analyze all RAID superblock(s) 5857 */ 5858 if (!mddev->raid_disks) { 5859 if (!mddev->persistent) 5860 return -EINVAL; 5861 err = analyze_sbs(mddev); 5862 if (err) 5863 return -EINVAL; 5864 } 5865 5866 if (mddev->level != LEVEL_NONE) 5867 request_module("md-level-%d", mddev->level); 5868 else if (mddev->clevel[0]) 5869 request_module("md-%s", mddev->clevel); 5870 5871 /* 5872 * Drop all container device buffers, from now on 5873 * the only valid external interface is through the md 5874 * device. 5875 */ 5876 mddev->has_superblocks = false; 5877 rdev_for_each(rdev, mddev) { 5878 if (test_bit(Faulty, &rdev->flags)) 5879 continue; 5880 sync_blockdev(rdev->bdev); 5881 invalidate_bdev(rdev->bdev); 5882 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5883 mddev->ro = MD_RDONLY; 5884 if (mddev->gendisk) 5885 set_disk_ro(mddev->gendisk, 1); 5886 } 5887 5888 if (rdev->sb_page) 5889 mddev->has_superblocks = true; 5890 5891 /* perform some consistency tests on the device. 5892 * We don't want the data to overlap the metadata, 5893 * Internal Bitmap issues have been handled elsewhere. 
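		 * Depending on whether the superblock sits before or after
		 * the data area, check that the two regions do not overlap.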
5894 */ 5895 if (rdev->meta_bdev) { 5896 /* Nothing to check */; 5897 } else if (rdev->data_offset < rdev->sb_start) { 5898 if (mddev->dev_sectors && 5899 rdev->data_offset + mddev->dev_sectors 5900 > rdev->sb_start) { 5901 pr_warn("md: %s: data overlaps metadata\n", 5902 mdname(mddev)); 5903 return -EINVAL; 5904 } 5905 } else { 5906 if (rdev->sb_start + rdev->sb_size/512 5907 > rdev->data_offset) { 5908 pr_warn("md: %s: metadata overlaps data\n", 5909 mdname(mddev)); 5910 return -EINVAL; 5911 } 5912 } 5913 sysfs_notify_dirent_safe(rdev->sysfs_state); 5914 nowait = nowait && bdev_nowait(rdev->bdev); 5915 } 5916 5917 err = percpu_ref_init(&mddev->active_io, active_io_release, 5918 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); 5919 if (err) 5920 return err; 5921 5922 if (!bioset_initialized(&mddev->bio_set)) { 5923 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5924 if (err) 5925 goto exit_active_io; 5926 } 5927 if (!bioset_initialized(&mddev->sync_set)) { 5928 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5929 if (err) 5930 goto exit_bio_set; 5931 } 5932 5933 if (!bioset_initialized(&mddev->io_clone_set)) { 5934 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5935 offsetof(struct md_io_clone, bio_clone), 0); 5936 if (err) 5937 goto exit_sync_set; 5938 } 5939 5940 spin_lock(&pers_lock); 5941 pers = find_pers(mddev->level, mddev->clevel); 5942 if (!pers || !try_module_get(pers->owner)) { 5943 spin_unlock(&pers_lock); 5944 if (mddev->level != LEVEL_NONE) 5945 pr_warn("md: personality for level %d is not loaded!\n", 5946 mddev->level); 5947 else 5948 pr_warn("md: personality for level %s is not loaded!\n", 5949 mddev->clevel); 5950 err = -EINVAL; 5951 goto abort; 5952 } 5953 spin_unlock(&pers_lock); 5954 if (mddev->level != pers->level) { 5955 mddev->level = pers->level; 5956 mddev->new_level = pers->level; 5957 } 5958 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5959 5960 if (mddev->reshape_position != MaxSector && 5961 pers->start_reshape == NULL) { 5962 /* This personality cannot handle reshaping... */ 5963 module_put(pers->owner); 5964 err = -EINVAL; 5965 goto abort; 5966 } 5967 5968 if (pers->sync_request) { 5969 /* Warn if this is a potentially silly 5970 * configuration. 
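		 * (e.g. two array members that are partitions of the same
		 * physical disk, which defeats the redundancy).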
5971 */ 5972 struct md_rdev *rdev2; 5973 int warned = 0; 5974 5975 rdev_for_each(rdev, mddev) 5976 rdev_for_each(rdev2, mddev) { 5977 if (rdev < rdev2 && 5978 rdev->bdev->bd_disk == 5979 rdev2->bdev->bd_disk) { 5980 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5981 mdname(mddev), 5982 rdev->bdev, 5983 rdev2->bdev); 5984 warned = 1; 5985 } 5986 } 5987 5988 if (warned) 5989 pr_warn("True protection against single-disk failure might be compromised.\n"); 5990 } 5991 5992 mddev->recovery = 0; 5993 /* may be over-ridden by personality */ 5994 mddev->resync_max_sectors = mddev->dev_sectors; 5995 5996 mddev->ok_start_degraded = start_dirty_degraded; 5997 5998 if (start_readonly && md_is_rdwr(mddev)) 5999 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6000 6001 err = pers->run(mddev); 6002 if (err) 6003 pr_warn("md: pers->run() failed ...\n"); 6004 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6005 WARN_ONCE(!mddev->external_size, 6006 "%s: default size too small, but 'external_size' not in effect?\n", 6007 __func__); 6008 pr_warn("md: invalid array_size %llu > default size %llu\n", 6009 (unsigned long long)mddev->array_sectors / 2, 6010 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6011 err = -EINVAL; 6012 } 6013 if (err == 0 && pers->sync_request && 6014 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6015 struct bitmap *bitmap; 6016 6017 bitmap = md_bitmap_create(mddev, -1); 6018 if (IS_ERR(bitmap)) { 6019 err = PTR_ERR(bitmap); 6020 pr_warn("%s: failed to create bitmap (%d)\n", 6021 mdname(mddev), err); 6022 } else 6023 mddev->bitmap = bitmap; 6024 6025 } 6026 if (err) 6027 goto bitmap_abort; 6028 6029 if (mddev->bitmap_info.max_write_behind > 0) { 6030 bool create_pool = false; 6031 6032 rdev_for_each(rdev, mddev) { 6033 if (test_bit(WriteMostly, &rdev->flags) && 6034 rdev_init_serial(rdev)) 6035 create_pool = true; 6036 } 6037 if (create_pool && mddev->serial_info_pool == NULL) { 6038 mddev->serial_info_pool = 6039 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6040 sizeof(struct serial_info)); 6041 if (!mddev->serial_info_pool) { 6042 err = -ENOMEM; 6043 goto bitmap_abort; 6044 } 6045 } 6046 } 6047 6048 if (mddev->queue) { 6049 bool nonrot = true; 6050 6051 rdev_for_each(rdev, mddev) { 6052 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6053 nonrot = false; 6054 break; 6055 } 6056 } 6057 if (mddev->degraded) 6058 nonrot = false; 6059 if (nonrot) 6060 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6061 else 6062 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6063 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6064 6065 /* Set the NOWAIT flags if all underlying devices support it */ 6066 if (nowait) 6067 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6068 } 6069 if (pers->sync_request) { 6070 if (mddev->kobj.sd && 6071 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6072 pr_warn("md: cannot register extra attributes for %s\n", 6073 mdname(mddev)); 6074 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6075 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6076 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6077 } else if (mddev->ro == MD_AUTO_READ) 6078 mddev->ro = MD_RDWR; 6079 6080 atomic_set(&mddev->max_corr_read_errors, 6081 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6082 mddev->safemode = 0; 6083 if (mddev_is_clustered(mddev)) 6084 mddev->safemode_delay = 0; 6085 else 6086 
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6087 mddev->in_sync = 1; 6088 smp_wmb(); 6089 spin_lock(&mddev->lock); 6090 mddev->pers = pers; 6091 spin_unlock(&mddev->lock); 6092 rdev_for_each(rdev, mddev) 6093 if (rdev->raid_disk >= 0) 6094 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6095 6096 if (mddev->degraded && md_is_rdwr(mddev)) 6097 /* This ensures that recovering status is reported immediately 6098 * via sysfs - until a lack of spares is confirmed. 6099 */ 6100 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6101 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6102 6103 if (mddev->sb_flags) 6104 md_update_sb(mddev, 0); 6105 6106 md_new_event(); 6107 return 0; 6108 6109 bitmap_abort: 6110 mddev_detach(mddev); 6111 if (mddev->private) 6112 pers->free(mddev, mddev->private); 6113 mddev->private = NULL; 6114 module_put(pers->owner); 6115 md_bitmap_destroy(mddev); 6116 abort: 6117 bioset_exit(&mddev->io_clone_set); 6118 exit_sync_set: 6119 bioset_exit(&mddev->sync_set); 6120 exit_bio_set: 6121 bioset_exit(&mddev->bio_set); 6122 exit_active_io: 6123 percpu_ref_exit(&mddev->active_io); 6124 return err; 6125 } 6126 EXPORT_SYMBOL_GPL(md_run); 6127 6128 int do_md_run(struct mddev *mddev) 6129 { 6130 int err; 6131 6132 set_bit(MD_NOT_READY, &mddev->flags); 6133 err = md_run(mddev); 6134 if (err) 6135 goto out; 6136 err = md_bitmap_load(mddev); 6137 if (err) { 6138 md_bitmap_destroy(mddev); 6139 goto out; 6140 } 6141 6142 if (mddev_is_clustered(mddev)) 6143 md_allow_write(mddev); 6144 6145 /* run start up tasks that require md_thread */ 6146 md_start(mddev); 6147 6148 md_wakeup_thread(mddev->thread); 6149 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6150 6151 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6152 clear_bit(MD_NOT_READY, &mddev->flags); 6153 mddev->changed = 1; 6154 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6155 sysfs_notify_dirent_safe(mddev->sysfs_state); 6156 sysfs_notify_dirent_safe(mddev->sysfs_action); 6157 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6158 out: 6159 clear_bit(MD_NOT_READY, &mddev->flags); 6160 return err; 6161 } 6162 6163 int md_start(struct mddev *mddev) 6164 { 6165 int ret = 0; 6166 6167 if (mddev->pers->start) { 6168 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6169 md_wakeup_thread(mddev->thread); 6170 ret = mddev->pers->start(mddev); 6171 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6172 md_wakeup_thread(mddev->sync_thread); 6173 } 6174 return ret; 6175 } 6176 EXPORT_SYMBOL_GPL(md_start); 6177 6178 static int restart_array(struct mddev *mddev) 6179 { 6180 struct gendisk *disk = mddev->gendisk; 6181 struct md_rdev *rdev; 6182 bool has_journal = false; 6183 bool has_readonly = false; 6184 6185 /* Complain if it has no devices */ 6186 if (list_empty(&mddev->disks)) 6187 return -ENXIO; 6188 if (!mddev->pers) 6189 return -EINVAL; 6190 if (md_is_rdwr(mddev)) 6191 return -EBUSY; 6192 6193 rcu_read_lock(); 6194 rdev_for_each_rcu(rdev, mddev) { 6195 if (test_bit(Journal, &rdev->flags) && 6196 !test_bit(Faulty, &rdev->flags)) 6197 has_journal = true; 6198 if (rdev_read_only(rdev)) 6199 has_readonly = true; 6200 } 6201 rcu_read_unlock(); 6202 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6203 /* Don't restart rw with journal missing/faulty */ 6204 return -EINVAL; 6205 if (has_readonly) 6206 return -EROFS; 6207 6208 mddev->safemode = 0; 6209 mddev->ro = MD_RDWR; 6210 set_disk_ro(disk, 0); 6211 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6212 /* 
Kick recovery or resync if necessary */ 6213 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6214 md_wakeup_thread(mddev->thread); 6215 md_wakeup_thread(mddev->sync_thread); 6216 sysfs_notify_dirent_safe(mddev->sysfs_state); 6217 return 0; 6218 } 6219 6220 static void md_clean(struct mddev *mddev) 6221 { 6222 mddev->array_sectors = 0; 6223 mddev->external_size = 0; 6224 mddev->dev_sectors = 0; 6225 mddev->raid_disks = 0; 6226 mddev->recovery_cp = 0; 6227 mddev->resync_min = 0; 6228 mddev->resync_max = MaxSector; 6229 mddev->reshape_position = MaxSector; 6230 /* we still need mddev->external in export_rdev, do not clear it yet */ 6231 mddev->persistent = 0; 6232 mddev->level = LEVEL_NONE; 6233 mddev->clevel[0] = 0; 6234 mddev->flags = 0; 6235 mddev->sb_flags = 0; 6236 mddev->ro = MD_RDWR; 6237 mddev->metadata_type[0] = 0; 6238 mddev->chunk_sectors = 0; 6239 mddev->ctime = mddev->utime = 0; 6240 mddev->layout = 0; 6241 mddev->max_disks = 0; 6242 mddev->events = 0; 6243 mddev->can_decrease_events = 0; 6244 mddev->delta_disks = 0; 6245 mddev->reshape_backwards = 0; 6246 mddev->new_level = LEVEL_NONE; 6247 mddev->new_layout = 0; 6248 mddev->new_chunk_sectors = 0; 6249 mddev->curr_resync = MD_RESYNC_NONE; 6250 atomic64_set(&mddev->resync_mismatches, 0); 6251 mddev->suspend_lo = mddev->suspend_hi = 0; 6252 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6253 mddev->recovery = 0; 6254 mddev->in_sync = 0; 6255 mddev->changed = 0; 6256 mddev->degraded = 0; 6257 mddev->safemode = 0; 6258 mddev->private = NULL; 6259 mddev->cluster_info = NULL; 6260 mddev->bitmap_info.offset = 0; 6261 mddev->bitmap_info.default_offset = 0; 6262 mddev->bitmap_info.default_space = 0; 6263 mddev->bitmap_info.chunksize = 0; 6264 mddev->bitmap_info.daemon_sleep = 0; 6265 mddev->bitmap_info.max_write_behind = 0; 6266 mddev->bitmap_info.nodes = 0; 6267 } 6268 6269 static void __md_stop_writes(struct mddev *mddev) 6270 { 6271 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6272 if (work_pending(&mddev->del_work)) 6273 flush_workqueue(md_misc_wq); 6274 if (mddev->sync_thread) { 6275 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6276 md_reap_sync_thread(mddev); 6277 } 6278 6279 del_timer_sync(&mddev->safemode_timer); 6280 6281 if (mddev->pers && mddev->pers->quiesce) { 6282 mddev->pers->quiesce(mddev, 1); 6283 mddev->pers->quiesce(mddev, 0); 6284 } 6285 md_bitmap_flush(mddev); 6286 6287 if (md_is_rdwr(mddev) && 6288 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6289 mddev->sb_flags)) { 6290 /* mark array as shutdown cleanly */ 6291 if (!mddev_is_clustered(mddev)) 6292 mddev->in_sync = 1; 6293 md_update_sb(mddev, 1); 6294 } 6295 /* disable policy to guarantee rdevs free resources for serialization */ 6296 mddev->serialize_policy = 0; 6297 mddev_destroy_serial_pool(mddev, NULL, true); 6298 } 6299 6300 void md_stop_writes(struct mddev *mddev) 6301 { 6302 mddev_lock_nointr(mddev); 6303 __md_stop_writes(mddev); 6304 mddev_unlock(mddev); 6305 } 6306 EXPORT_SYMBOL_GPL(md_stop_writes); 6307 6308 static void mddev_detach(struct mddev *mddev) 6309 { 6310 md_bitmap_wait_behind_writes(mddev); 6311 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6312 mddev->pers->quiesce(mddev, 1); 6313 mddev->pers->quiesce(mddev, 0); 6314 } 6315 md_unregister_thread(mddev, &mddev->thread); 6316 if (mddev->queue) 6317 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6318 } 6319 6320 static void __md_stop(struct mddev *mddev) 6321 { 6322 struct md_personality *pers = mddev->pers; 6323 md_bitmap_destroy(mddev); 
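	/* Stop the array's helper thread and quiesce in-flight IO before
	 * tearing down the remaining per-array resources below. */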
	mddev_detach(mddev);
	/* Ensure ->event_work is done */
	if (mddev->event_work.func)
		flush_workqueue(md_misc_wq);
	spin_lock(&mddev->lock);
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	percpu_ref_exit(&mddev->active_io);
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	bioset_exit(&mddev->io_clone_set);
}

void md_stop(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	/* stop the array and free any attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop_writes(mddev);
	__md_stop(mddev);
	percpu_ref_exit(&mddev->writes_pending);
}

EXPORT_SYMBOL_GPL(md_stop);

static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{
	int err = 0;
	int did_freeze = 0;

	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
		return -EBUSY;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	/*
	 * Thread might be blocked waiting for metadata update which will now
	 * never happen
	 */
	md_wakeup_thread_directly(mddev->sync_thread);

	mddev_unlock(mddev);
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n", mdname(mddev));
		err = -EBUSY;
		goto out;
	}

	if (mddev->pers) {
		__md_stop_writes(mddev);

		if (mddev->ro == MD_RDONLY) {
			err = -ENXIO;
			goto out;
		}

		mddev->ro = MD_RDONLY;
		set_disk_ro(mddev->gendisk, 1);
	}

out:
	if ((mddev->pers && !err) || did_freeze) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}

	mutex_unlock(&mddev->open_mutex);
	return err;
}

/* mode:
 * 0 - completely stop and disassemble array
 * 2 - stop but do not disassemble array
 */
static int do_md_stop(struct mddev *mddev, int mode,
		      struct block_device *bdev)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	/*
	 * Thread might be blocked waiting for metadata update which will now
	 * never happen
	 */
md_wakeup_thread_directly(mddev->sync_thread); 6445 6446 mddev_unlock(mddev); 6447 wait_event(resync_wait, (mddev->sync_thread == NULL && 6448 !test_bit(MD_RECOVERY_RUNNING, 6449 &mddev->recovery))); 6450 mddev_lock_nointr(mddev); 6451 6452 mutex_lock(&mddev->open_mutex); 6453 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6454 mddev->sysfs_active || 6455 mddev->sync_thread || 6456 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6457 pr_warn("md: %s still in use.\n",mdname(mddev)); 6458 mutex_unlock(&mddev->open_mutex); 6459 if (did_freeze) { 6460 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6462 md_wakeup_thread(mddev->thread); 6463 } 6464 return -EBUSY; 6465 } 6466 if (mddev->pers) { 6467 if (!md_is_rdwr(mddev)) 6468 set_disk_ro(disk, 0); 6469 6470 __md_stop_writes(mddev); 6471 __md_stop(mddev); 6472 6473 /* tell userspace to handle 'inactive' */ 6474 sysfs_notify_dirent_safe(mddev->sysfs_state); 6475 6476 rdev_for_each(rdev, mddev) 6477 if (rdev->raid_disk >= 0) 6478 sysfs_unlink_rdev(mddev, rdev); 6479 6480 set_capacity_and_notify(disk, 0); 6481 mutex_unlock(&mddev->open_mutex); 6482 mddev->changed = 1; 6483 6484 if (!md_is_rdwr(mddev)) 6485 mddev->ro = MD_RDWR; 6486 } else 6487 mutex_unlock(&mddev->open_mutex); 6488 /* 6489 * Free resources if final stop 6490 */ 6491 if (mode == 0) { 6492 pr_info("md: %s stopped.\n", mdname(mddev)); 6493 6494 if (mddev->bitmap_info.file) { 6495 struct file *f = mddev->bitmap_info.file; 6496 spin_lock(&mddev->lock); 6497 mddev->bitmap_info.file = NULL; 6498 spin_unlock(&mddev->lock); 6499 fput(f); 6500 } 6501 mddev->bitmap_info.offset = 0; 6502 6503 export_array(mddev); 6504 6505 md_clean(mddev); 6506 if (mddev->hold_active == UNTIL_STOP) 6507 mddev->hold_active = 0; 6508 } 6509 md_new_event(); 6510 sysfs_notify_dirent_safe(mddev->sysfs_state); 6511 return 0; 6512 } 6513 6514 #ifndef MODULE 6515 static void autorun_array(struct mddev *mddev) 6516 { 6517 struct md_rdev *rdev; 6518 int err; 6519 6520 if (list_empty(&mddev->disks)) 6521 return; 6522 6523 pr_info("md: running: "); 6524 6525 rdev_for_each(rdev, mddev) { 6526 pr_cont("<%pg>", rdev->bdev); 6527 } 6528 pr_cont("\n"); 6529 6530 err = do_md_run(mddev); 6531 if (err) { 6532 pr_warn("md: do_md_run() returned %d\n", err); 6533 do_md_stop(mddev, 0, NULL); 6534 } 6535 } 6536 6537 /* 6538 * lets try to run arrays based on all disks that have arrived 6539 * until now. (those are in pending_raid_disks) 6540 * 6541 * the method: pick the first pending disk, collect all disks with 6542 * the same UUID, remove all from the pending list and put them into 6543 * the 'same_array' list. Then order this list based on superblock 6544 * update time (freshest comes first), kick out 'old' disks and 6545 * compare superblocks. If everything's fine then run it. 
6546 * 6547 * If "unit" is allocated, then bump its reference count 6548 */ 6549 static void autorun_devices(int part) 6550 { 6551 struct md_rdev *rdev0, *rdev, *tmp; 6552 struct mddev *mddev; 6553 6554 pr_info("md: autorun ...\n"); 6555 while (!list_empty(&pending_raid_disks)) { 6556 int unit; 6557 dev_t dev; 6558 LIST_HEAD(candidates); 6559 rdev0 = list_entry(pending_raid_disks.next, 6560 struct md_rdev, same_set); 6561 6562 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6563 INIT_LIST_HEAD(&candidates); 6564 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6565 if (super_90_load(rdev, rdev0, 0) >= 0) { 6566 pr_debug("md: adding %pg ...\n", 6567 rdev->bdev); 6568 list_move(&rdev->same_set, &candidates); 6569 } 6570 /* 6571 * now we have a set of devices, with all of them having 6572 * mostly sane superblocks. It's time to allocate the 6573 * mddev. 6574 */ 6575 if (part) { 6576 dev = MKDEV(mdp_major, 6577 rdev0->preferred_minor << MdpMinorShift); 6578 unit = MINOR(dev) >> MdpMinorShift; 6579 } else { 6580 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6581 unit = MINOR(dev); 6582 } 6583 if (rdev0->preferred_minor != unit) { 6584 pr_warn("md: unit number in %pg is bad: %d\n", 6585 rdev0->bdev, rdev0->preferred_minor); 6586 break; 6587 } 6588 6589 mddev = md_alloc(dev, NULL); 6590 if (IS_ERR(mddev)) 6591 break; 6592 6593 if (mddev_lock(mddev)) 6594 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6595 else if (mddev->raid_disks || mddev->major_version 6596 || !list_empty(&mddev->disks)) { 6597 pr_warn("md: %s already running, cannot run %pg\n", 6598 mdname(mddev), rdev0->bdev); 6599 mddev_unlock(mddev); 6600 } else { 6601 pr_debug("md: created %s\n", mdname(mddev)); 6602 mddev->persistent = 1; 6603 rdev_for_each_list(rdev, tmp, &candidates) { 6604 list_del_init(&rdev->same_set); 6605 if (bind_rdev_to_array(rdev, mddev)) 6606 export_rdev(rdev, mddev); 6607 } 6608 autorun_array(mddev); 6609 mddev_unlock(mddev); 6610 } 6611 /* on success, candidates will be empty, on error 6612 * it won't... 6613 */ 6614 rdev_for_each_list(rdev, tmp, &candidates) { 6615 list_del_init(&rdev->same_set); 6616 export_rdev(rdev, mddev); 6617 } 6618 mddev_put(mddev); 6619 } 6620 pr_info("md: ... 
autorun DONE.\n"); 6621 } 6622 #endif /* !MODULE */ 6623 6624 static int get_version(void __user *arg) 6625 { 6626 mdu_version_t ver; 6627 6628 ver.major = MD_MAJOR_VERSION; 6629 ver.minor = MD_MINOR_VERSION; 6630 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6631 6632 if (copy_to_user(arg, &ver, sizeof(ver))) 6633 return -EFAULT; 6634 6635 return 0; 6636 } 6637 6638 static int get_array_info(struct mddev *mddev, void __user *arg) 6639 { 6640 mdu_array_info_t info; 6641 int nr,working,insync,failed,spare; 6642 struct md_rdev *rdev; 6643 6644 nr = working = insync = failed = spare = 0; 6645 rcu_read_lock(); 6646 rdev_for_each_rcu(rdev, mddev) { 6647 nr++; 6648 if (test_bit(Faulty, &rdev->flags)) 6649 failed++; 6650 else { 6651 working++; 6652 if (test_bit(In_sync, &rdev->flags)) 6653 insync++; 6654 else if (test_bit(Journal, &rdev->flags)) 6655 /* TODO: add journal count to md_u.h */ 6656 ; 6657 else 6658 spare++; 6659 } 6660 } 6661 rcu_read_unlock(); 6662 6663 info.major_version = mddev->major_version; 6664 info.minor_version = mddev->minor_version; 6665 info.patch_version = MD_PATCHLEVEL_VERSION; 6666 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6667 info.level = mddev->level; 6668 info.size = mddev->dev_sectors / 2; 6669 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6670 info.size = -1; 6671 info.nr_disks = nr; 6672 info.raid_disks = mddev->raid_disks; 6673 info.md_minor = mddev->md_minor; 6674 info.not_persistent= !mddev->persistent; 6675 6676 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6677 info.state = 0; 6678 if (mddev->in_sync) 6679 info.state = (1<<MD_SB_CLEAN); 6680 if (mddev->bitmap && mddev->bitmap_info.offset) 6681 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6682 if (mddev_is_clustered(mddev)) 6683 info.state |= (1<<MD_SB_CLUSTERED); 6684 info.active_disks = insync; 6685 info.working_disks = working; 6686 info.failed_disks = failed; 6687 info.spare_disks = spare; 6688 6689 info.layout = mddev->layout; 6690 info.chunk_size = mddev->chunk_sectors << 9; 6691 6692 if (copy_to_user(arg, &info, sizeof(info))) 6693 return -EFAULT; 6694 6695 return 0; 6696 } 6697 6698 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6699 { 6700 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6701 char *ptr; 6702 int err; 6703 6704 file = kzalloc(sizeof(*file), GFP_NOIO); 6705 if (!file) 6706 return -ENOMEM; 6707 6708 err = 0; 6709 spin_lock(&mddev->lock); 6710 /* bitmap enabled */ 6711 if (mddev->bitmap_info.file) { 6712 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6713 sizeof(file->pathname)); 6714 if (IS_ERR(ptr)) 6715 err = PTR_ERR(ptr); 6716 else 6717 memmove(file->pathname, ptr, 6718 sizeof(file->pathname)-(ptr-file->pathname)); 6719 } 6720 spin_unlock(&mddev->lock); 6721 6722 if (err == 0 && 6723 copy_to_user(arg, file, sizeof(*file))) 6724 err = -EFAULT; 6725 6726 kfree(file); 6727 return err; 6728 } 6729 6730 static int get_disk_info(struct mddev *mddev, void __user * arg) 6731 { 6732 mdu_disk_info_t info; 6733 struct md_rdev *rdev; 6734 6735 if (copy_from_user(&info, arg, sizeof(info))) 6736 return -EFAULT; 6737 6738 rcu_read_lock(); 6739 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6740 if (rdev) { 6741 info.major = MAJOR(rdev->bdev->bd_dev); 6742 info.minor = MINOR(rdev->bdev->bd_dev); 6743 info.raid_disk = rdev->raid_disk; 6744 info.state = 0; 6745 if (test_bit(Faulty, &rdev->flags)) 6746 info.state |= (1<<MD_DISK_FAULTY); 6747 else if (test_bit(In_sync, &rdev->flags)) { 6748 info.state |= (1<<MD_DISK_ACTIVE); 
6749 info.state |= (1<<MD_DISK_SYNC); 6750 } 6751 if (test_bit(Journal, &rdev->flags)) 6752 info.state |= (1<<MD_DISK_JOURNAL); 6753 if (test_bit(WriteMostly, &rdev->flags)) 6754 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6755 if (test_bit(FailFast, &rdev->flags)) 6756 info.state |= (1<<MD_DISK_FAILFAST); 6757 } else { 6758 info.major = info.minor = 0; 6759 info.raid_disk = -1; 6760 info.state = (1<<MD_DISK_REMOVED); 6761 } 6762 rcu_read_unlock(); 6763 6764 if (copy_to_user(arg, &info, sizeof(info))) 6765 return -EFAULT; 6766 6767 return 0; 6768 } 6769 6770 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6771 { 6772 struct md_rdev *rdev; 6773 dev_t dev = MKDEV(info->major,info->minor); 6774 6775 if (mddev_is_clustered(mddev) && 6776 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6777 pr_warn("%s: Cannot add to clustered mddev.\n", 6778 mdname(mddev)); 6779 return -EINVAL; 6780 } 6781 6782 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6783 return -EOVERFLOW; 6784 6785 if (!mddev->raid_disks) { 6786 int err; 6787 /* expecting a device which has a superblock */ 6788 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6789 if (IS_ERR(rdev)) { 6790 pr_warn("md: md_import_device returned %ld\n", 6791 PTR_ERR(rdev)); 6792 return PTR_ERR(rdev); 6793 } 6794 if (!list_empty(&mddev->disks)) { 6795 struct md_rdev *rdev0 6796 = list_entry(mddev->disks.next, 6797 struct md_rdev, same_set); 6798 err = super_types[mddev->major_version] 6799 .load_super(rdev, rdev0, mddev->minor_version); 6800 if (err < 0) { 6801 pr_warn("md: %pg has different UUID to %pg\n", 6802 rdev->bdev, 6803 rdev0->bdev); 6804 export_rdev(rdev, mddev); 6805 return -EINVAL; 6806 } 6807 } 6808 err = bind_rdev_to_array(rdev, mddev); 6809 if (err) 6810 export_rdev(rdev, mddev); 6811 return err; 6812 } 6813 6814 /* 6815 * md_add_new_disk can be used once the array is assembled 6816 * to add "hot spares". They must already have a superblock 6817 * written 6818 */ 6819 if (mddev->pers) { 6820 int err; 6821 if (!mddev->pers->hot_add_disk) { 6822 pr_warn("%s: personality does not support diskops!\n", 6823 mdname(mddev)); 6824 return -EINVAL; 6825 } 6826 if (mddev->persistent) 6827 rdev = md_import_device(dev, mddev->major_version, 6828 mddev->minor_version); 6829 else 6830 rdev = md_import_device(dev, -1, -1); 6831 if (IS_ERR(rdev)) { 6832 pr_warn("md: md_import_device returned %ld\n", 6833 PTR_ERR(rdev)); 6834 return PTR_ERR(rdev); 6835 } 6836 /* set saved_raid_disk if appropriate */ 6837 if (!mddev->persistent) { 6838 if (info->state & (1<<MD_DISK_SYNC) && 6839 info->raid_disk < mddev->raid_disks) { 6840 rdev->raid_disk = info->raid_disk; 6841 clear_bit(Bitmap_sync, &rdev->flags); 6842 } else 6843 rdev->raid_disk = -1; 6844 rdev->saved_raid_disk = rdev->raid_disk; 6845 } else 6846 super_types[mddev->major_version]. 6847 validate_super(mddev, NULL/*freshest*/, rdev); 6848 if ((info->state & (1<<MD_DISK_SYNC)) && 6849 rdev->raid_disk != info->raid_disk) { 6850 /* This was a hot-add request, but events doesn't 6851 * match, so reject it. 
6852 */ 6853 export_rdev(rdev, mddev); 6854 return -EINVAL; 6855 } 6856 6857 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6858 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6859 set_bit(WriteMostly, &rdev->flags); 6860 else 6861 clear_bit(WriteMostly, &rdev->flags); 6862 if (info->state & (1<<MD_DISK_FAILFAST)) 6863 set_bit(FailFast, &rdev->flags); 6864 else 6865 clear_bit(FailFast, &rdev->flags); 6866 6867 if (info->state & (1<<MD_DISK_JOURNAL)) { 6868 struct md_rdev *rdev2; 6869 bool has_journal = false; 6870 6871 /* make sure no existing journal disk */ 6872 rdev_for_each(rdev2, mddev) { 6873 if (test_bit(Journal, &rdev2->flags)) { 6874 has_journal = true; 6875 break; 6876 } 6877 } 6878 if (has_journal || mddev->bitmap) { 6879 export_rdev(rdev, mddev); 6880 return -EBUSY; 6881 } 6882 set_bit(Journal, &rdev->flags); 6883 } 6884 /* 6885 * check whether the device shows up in other nodes 6886 */ 6887 if (mddev_is_clustered(mddev)) { 6888 if (info->state & (1 << MD_DISK_CANDIDATE)) 6889 set_bit(Candidate, &rdev->flags); 6890 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6891 /* --add initiated by this node */ 6892 err = md_cluster_ops->add_new_disk(mddev, rdev); 6893 if (err) { 6894 export_rdev(rdev, mddev); 6895 return err; 6896 } 6897 } 6898 } 6899 6900 rdev->raid_disk = -1; 6901 err = bind_rdev_to_array(rdev, mddev); 6902 6903 if (err) 6904 export_rdev(rdev, mddev); 6905 6906 if (mddev_is_clustered(mddev)) { 6907 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6908 if (!err) { 6909 err = md_cluster_ops->new_disk_ack(mddev, 6910 err == 0); 6911 if (err) 6912 md_kick_rdev_from_array(rdev); 6913 } 6914 } else { 6915 if (err) 6916 md_cluster_ops->add_new_disk_cancel(mddev); 6917 else 6918 err = add_bound_rdev(rdev); 6919 } 6920 6921 } else if (!err) 6922 err = add_bound_rdev(rdev); 6923 6924 return err; 6925 } 6926 6927 /* otherwise, md_add_new_disk is only allowed 6928 * for major_version==0 superblocks 6929 */ 6930 if (mddev->major_version != 0) { 6931 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6932 return -EINVAL; 6933 } 6934 6935 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6936 int err; 6937 rdev = md_import_device(dev, -1, 0); 6938 if (IS_ERR(rdev)) { 6939 pr_warn("md: error, md_import_device() returned %ld\n", 6940 PTR_ERR(rdev)); 6941 return PTR_ERR(rdev); 6942 } 6943 rdev->desc_nr = info->number; 6944 if (info->raid_disk < mddev->raid_disks) 6945 rdev->raid_disk = info->raid_disk; 6946 else 6947 rdev->raid_disk = -1; 6948 6949 if (rdev->raid_disk < mddev->raid_disks) 6950 if (info->state & (1<<MD_DISK_SYNC)) 6951 set_bit(In_sync, &rdev->flags); 6952 6953 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6954 set_bit(WriteMostly, &rdev->flags); 6955 if (info->state & (1<<MD_DISK_FAILFAST)) 6956 set_bit(FailFast, &rdev->flags); 6957 6958 if (!mddev->persistent) { 6959 pr_debug("md: nonpersistent superblock ...\n"); 6960 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6961 } else 6962 rdev->sb_start = calc_dev_sboffset(rdev); 6963 rdev->sectors = rdev->sb_start; 6964 6965 err = bind_rdev_to_array(rdev, mddev); 6966 if (err) { 6967 export_rdev(rdev, mddev); 6968 return err; 6969 } 6970 } 6971 6972 return 0; 6973 } 6974 6975 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6976 { 6977 struct md_rdev *rdev; 6978 6979 if (!mddev->pers) 6980 return -ENODEV; 6981 6982 rdev = find_rdev(mddev, dev); 6983 if (!rdev) 6984 return -ENXIO; 6985 6986 if (rdev->raid_disk < 0) 6987 goto kick_rdev; 6988 6989 clear_bit(Blocked, &rdev->flags); 6990 
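	/* Try to detach the device from its slot; if it is still in use,
	 * raid_disk stays >= 0 and we return -EBUSY below. */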
remove_and_add_spares(mddev, rdev); 6991 6992 if (rdev->raid_disk >= 0) 6993 goto busy; 6994 6995 kick_rdev: 6996 if (mddev_is_clustered(mddev)) { 6997 if (md_cluster_ops->remove_disk(mddev, rdev)) 6998 goto busy; 6999 } 7000 7001 md_kick_rdev_from_array(rdev); 7002 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7003 if (mddev->thread) 7004 md_wakeup_thread(mddev->thread); 7005 else 7006 md_update_sb(mddev, 1); 7007 md_new_event(); 7008 7009 return 0; 7010 busy: 7011 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7012 rdev->bdev, mdname(mddev)); 7013 return -EBUSY; 7014 } 7015 7016 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7017 { 7018 int err; 7019 struct md_rdev *rdev; 7020 7021 if (!mddev->pers) 7022 return -ENODEV; 7023 7024 if (mddev->major_version != 0) { 7025 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7026 mdname(mddev)); 7027 return -EINVAL; 7028 } 7029 if (!mddev->pers->hot_add_disk) { 7030 pr_warn("%s: personality does not support diskops!\n", 7031 mdname(mddev)); 7032 return -EINVAL; 7033 } 7034 7035 rdev = md_import_device(dev, -1, 0); 7036 if (IS_ERR(rdev)) { 7037 pr_warn("md: error, md_import_device() returned %ld\n", 7038 PTR_ERR(rdev)); 7039 return -EINVAL; 7040 } 7041 7042 if (mddev->persistent) 7043 rdev->sb_start = calc_dev_sboffset(rdev); 7044 else 7045 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7046 7047 rdev->sectors = rdev->sb_start; 7048 7049 if (test_bit(Faulty, &rdev->flags)) { 7050 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7051 rdev->bdev, mdname(mddev)); 7052 err = -EINVAL; 7053 goto abort_export; 7054 } 7055 7056 clear_bit(In_sync, &rdev->flags); 7057 rdev->desc_nr = -1; 7058 rdev->saved_raid_disk = -1; 7059 err = bind_rdev_to_array(rdev, mddev); 7060 if (err) 7061 goto abort_export; 7062 7063 /* 7064 * The rest should better be atomic, we can have disk failures 7065 * noticed in interrupt contexts ... 7066 */ 7067 7068 rdev->raid_disk = -1; 7069 7070 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7071 if (!mddev->thread) 7072 md_update_sb(mddev, 1); 7073 /* 7074 * If the new disk does not support REQ_NOWAIT, 7075 * disable on the whole MD. 7076 */ 7077 if (!bdev_nowait(rdev->bdev)) { 7078 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7079 mdname(mddev), rdev->bdev); 7080 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7081 } 7082 /* 7083 * Kick recovery, maybe this spare has to be added to the 7084 * array immediately. 7085 */ 7086 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7087 md_wakeup_thread(mddev->thread); 7088 md_new_event(); 7089 return 0; 7090 7091 abort_export: 7092 export_rdev(rdev, mddev); 7093 return err; 7094 } 7095 7096 static int set_bitmap_file(struct mddev *mddev, int fd) 7097 { 7098 int err = 0; 7099 7100 if (mddev->pers) { 7101 if (!mddev->pers->quiesce || !mddev->thread) 7102 return -EBUSY; 7103 if (mddev->recovery || mddev->sync_thread) 7104 return -EBUSY; 7105 /* we should be able to change the bitmap.. 
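		 * while the array stays active, as long as no resync or
		 * recovery is currently running (checked above).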
		 */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */

		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
			pr_warn("%s: bitmap files not supported by this kernel\n",
				mdname(mddev));
			return -EINVAL;
		}
		pr_warn("%s: using deprecated bitmap file support\n",
			mdname(mddev));

		f = fget(fd);

		if (f == NULL) {
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			pr_warn("%s: error: bitmap file must be open for write\n",
				mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = md_bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
			if (err) {
				md_bitmap_destroy(mddev);
				fd = -1;
			}
			mddev_resume(mddev);
		} else if (fd < 0) {
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		}
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * md_set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 * level, size, not_persistent, layout and chunksize determine the
 * shape of the array.
 * This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 * In this case raid_disks will be 0, and the major_version field is
 * used to determine which style super-blocks are to be found on the devices.
 * The minor and patch _version numbers are also kept in case the
 * super_block handler wishes to interpret them.
 */
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
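		 * (mddev_put() treats a non-zero ctime as "configured" and
		 * will not free the array.)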
		 */
		mddev->ctime = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = ktime_get_real_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->dev_sectors = 2 * (sector_t)info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;
	mddev->external = 0;

	mddev->layout = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used. This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device. If num_sectors is zero, we find the largest size
	 * that fits.
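	 * E.g. (illustrative numbers) calling this with num_sectors == 0 on
	 * an array whose smallest member has 1000000 usable sectors resizes
	 * every member to 1000000 sectors.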
7304 */ 7305 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7306 mddev->sync_thread) 7307 return -EBUSY; 7308 if (!md_is_rdwr(mddev)) 7309 return -EROFS; 7310 7311 rdev_for_each(rdev, mddev) { 7312 sector_t avail = rdev->sectors; 7313 7314 if (fit && (num_sectors == 0 || num_sectors > avail)) 7315 num_sectors = avail; 7316 if (avail < num_sectors) 7317 return -ENOSPC; 7318 } 7319 rv = mddev->pers->resize(mddev, num_sectors); 7320 if (!rv) { 7321 if (mddev_is_clustered(mddev)) 7322 md_cluster_ops->update_size(mddev, old_dev_sectors); 7323 else if (mddev->queue) { 7324 set_capacity_and_notify(mddev->gendisk, 7325 mddev->array_sectors); 7326 } 7327 } 7328 return rv; 7329 } 7330 7331 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7332 { 7333 int rv; 7334 struct md_rdev *rdev; 7335 /* change the number of raid disks */ 7336 if (mddev->pers->check_reshape == NULL) 7337 return -EINVAL; 7338 if (!md_is_rdwr(mddev)) 7339 return -EROFS; 7340 if (raid_disks <= 0 || 7341 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7342 return -EINVAL; 7343 if (mddev->sync_thread || 7344 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7345 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7346 mddev->reshape_position != MaxSector) 7347 return -EBUSY; 7348 7349 rdev_for_each(rdev, mddev) { 7350 if (mddev->raid_disks < raid_disks && 7351 rdev->data_offset < rdev->new_data_offset) 7352 return -EINVAL; 7353 if (mddev->raid_disks > raid_disks && 7354 rdev->data_offset > rdev->new_data_offset) 7355 return -EINVAL; 7356 } 7357 7358 mddev->delta_disks = raid_disks - mddev->raid_disks; 7359 if (mddev->delta_disks < 0) 7360 mddev->reshape_backwards = 1; 7361 else if (mddev->delta_disks > 0) 7362 mddev->reshape_backwards = 0; 7363 7364 rv = mddev->pers->check_reshape(mddev); 7365 if (rv < 0) { 7366 mddev->delta_disks = 0; 7367 mddev->reshape_backwards = 0; 7368 } 7369 return rv; 7370 } 7371 7372 /* 7373 * update_array_info is used to change the configuration of an 7374 * on-line array. 7375 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7376 * fields in the info are checked against the array. 7377 * Any differences that cannot be handled will cause an error. 7378 * Normally, only one change can be managed at a time. 
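 * The changes handled one at a time are: size, raid_disks, layout, and
 * adding or removing a write-intent bitmap.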
7379 */ 7380 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7381 { 7382 int rv = 0; 7383 int cnt = 0; 7384 int state = 0; 7385 7386 /* calculate expected state,ignoring low bits */ 7387 if (mddev->bitmap && mddev->bitmap_info.offset) 7388 state |= (1 << MD_SB_BITMAP_PRESENT); 7389 7390 if (mddev->major_version != info->major_version || 7391 mddev->minor_version != info->minor_version || 7392 /* mddev->patch_version != info->patch_version || */ 7393 mddev->ctime != info->ctime || 7394 mddev->level != info->level || 7395 /* mddev->layout != info->layout || */ 7396 mddev->persistent != !info->not_persistent || 7397 mddev->chunk_sectors != info->chunk_size >> 9 || 7398 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7399 ((state^info->state) & 0xfffffe00) 7400 ) 7401 return -EINVAL; 7402 /* Check there is only one change */ 7403 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7404 cnt++; 7405 if (mddev->raid_disks != info->raid_disks) 7406 cnt++; 7407 if (mddev->layout != info->layout) 7408 cnt++; 7409 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7410 cnt++; 7411 if (cnt == 0) 7412 return 0; 7413 if (cnt > 1) 7414 return -EINVAL; 7415 7416 if (mddev->layout != info->layout) { 7417 /* Change layout 7418 * we don't need to do anything at the md level, the 7419 * personality will take care of it all. 7420 */ 7421 if (mddev->pers->check_reshape == NULL) 7422 return -EINVAL; 7423 else { 7424 mddev->new_layout = info->layout; 7425 rv = mddev->pers->check_reshape(mddev); 7426 if (rv) 7427 mddev->new_layout = mddev->layout; 7428 return rv; 7429 } 7430 } 7431 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7432 rv = update_size(mddev, (sector_t)info->size * 2); 7433 7434 if (mddev->raid_disks != info->raid_disks) 7435 rv = update_raid_disks(mddev, info->raid_disks); 7436 7437 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7438 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7439 rv = -EINVAL; 7440 goto err; 7441 } 7442 if (mddev->recovery || mddev->sync_thread) { 7443 rv = -EBUSY; 7444 goto err; 7445 } 7446 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7447 struct bitmap *bitmap; 7448 /* add the bitmap */ 7449 if (mddev->bitmap) { 7450 rv = -EEXIST; 7451 goto err; 7452 } 7453 if (mddev->bitmap_info.default_offset == 0) { 7454 rv = -EINVAL; 7455 goto err; 7456 } 7457 mddev->bitmap_info.offset = 7458 mddev->bitmap_info.default_offset; 7459 mddev->bitmap_info.space = 7460 mddev->bitmap_info.default_space; 7461 bitmap = md_bitmap_create(mddev, -1); 7462 mddev_suspend(mddev); 7463 if (!IS_ERR(bitmap)) { 7464 mddev->bitmap = bitmap; 7465 rv = md_bitmap_load(mddev); 7466 } else 7467 rv = PTR_ERR(bitmap); 7468 if (rv) 7469 md_bitmap_destroy(mddev); 7470 mddev_resume(mddev); 7471 } else { 7472 /* remove the bitmap */ 7473 if (!mddev->bitmap) { 7474 rv = -ENOENT; 7475 goto err; 7476 } 7477 if (mddev->bitmap->storage.file) { 7478 rv = -EINVAL; 7479 goto err; 7480 } 7481 if (mddev->bitmap_info.nodes) { 7482 /* hold PW on all the bitmap lock */ 7483 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7484 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7485 rv = -EPERM; 7486 md_cluster_ops->unlock_all_bitmaps(mddev); 7487 goto err; 7488 } 7489 7490 mddev->bitmap_info.nodes = 0; 7491 md_cluster_ops->leave(mddev); 7492 module_put(md_cluster_mod); 7493 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7494 } 7495 mddev_suspend(mddev); 7496 
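			/* tear the bitmap down while the array is quiesced */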
md_bitmap_destroy(mddev); 7497 mddev_resume(mddev); 7498 mddev->bitmap_info.offset = 0; 7499 } 7500 } 7501 md_update_sb(mddev, 1); 7502 return rv; 7503 err: 7504 return rv; 7505 } 7506 7507 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7508 { 7509 struct md_rdev *rdev; 7510 int err = 0; 7511 7512 if (mddev->pers == NULL) 7513 return -ENODEV; 7514 7515 rcu_read_lock(); 7516 rdev = md_find_rdev_rcu(mddev, dev); 7517 if (!rdev) 7518 err = -ENODEV; 7519 else { 7520 md_error(mddev, rdev); 7521 if (test_bit(MD_BROKEN, &mddev->flags)) 7522 err = -EBUSY; 7523 } 7524 rcu_read_unlock(); 7525 return err; 7526 } 7527 7528 /* 7529 * We have a problem here : there is no easy way to give a CHS 7530 * virtual geometry. We currently pretend that we have a 2 heads 7531 * 4 sectors (with a BIG number of cylinders...). This drives 7532 * dosfs just mad... ;-) 7533 */ 7534 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7535 { 7536 struct mddev *mddev = bdev->bd_disk->private_data; 7537 7538 geo->heads = 2; 7539 geo->sectors = 4; 7540 geo->cylinders = mddev->array_sectors / 8; 7541 return 0; 7542 } 7543 7544 static inline bool md_ioctl_valid(unsigned int cmd) 7545 { 7546 switch (cmd) { 7547 case ADD_NEW_DISK: 7548 case GET_ARRAY_INFO: 7549 case GET_BITMAP_FILE: 7550 case GET_DISK_INFO: 7551 case HOT_ADD_DISK: 7552 case HOT_REMOVE_DISK: 7553 case RAID_VERSION: 7554 case RESTART_ARRAY_RW: 7555 case RUN_ARRAY: 7556 case SET_ARRAY_INFO: 7557 case SET_BITMAP_FILE: 7558 case SET_DISK_FAULTY: 7559 case STOP_ARRAY: 7560 case STOP_ARRAY_RO: 7561 case CLUSTERED_DISK_NACK: 7562 return true; 7563 default: 7564 return false; 7565 } 7566 } 7567 7568 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7569 { 7570 mdu_array_info_t info; 7571 int err; 7572 7573 if (!argp) 7574 memset(&info, 0, sizeof(info)); 7575 else if (copy_from_user(&info, argp, sizeof(info))) 7576 return -EFAULT; 7577 7578 if (mddev->pers) { 7579 err = update_array_info(mddev, &info); 7580 if (err) 7581 pr_warn("md: couldn't update array info. %d\n", err); 7582 return err; 7583 } 7584 7585 if (!list_empty(&mddev->disks)) { 7586 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7587 return -EBUSY; 7588 } 7589 7590 if (mddev->raid_disks) { 7591 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7592 return -EBUSY; 7593 } 7594 7595 err = md_set_array_info(mddev, &info); 7596 if (err) 7597 pr_warn("md: couldn't set array info. 
%d\n", err); 7598 7599 return err; 7600 } 7601 7602 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7603 unsigned int cmd, unsigned long arg) 7604 { 7605 int err = 0; 7606 void __user *argp = (void __user *)arg; 7607 struct mddev *mddev = NULL; 7608 bool did_set_md_closing = false; 7609 7610 if (!md_ioctl_valid(cmd)) 7611 return -ENOTTY; 7612 7613 switch (cmd) { 7614 case RAID_VERSION: 7615 case GET_ARRAY_INFO: 7616 case GET_DISK_INFO: 7617 break; 7618 default: 7619 if (!capable(CAP_SYS_ADMIN)) 7620 return -EACCES; 7621 } 7622 7623 /* 7624 * Commands dealing with the RAID driver but not any 7625 * particular array: 7626 */ 7627 switch (cmd) { 7628 case RAID_VERSION: 7629 err = get_version(argp); 7630 goto out; 7631 default:; 7632 } 7633 7634 /* 7635 * Commands creating/starting a new array: 7636 */ 7637 7638 mddev = bdev->bd_disk->private_data; 7639 7640 if (!mddev) { 7641 BUG(); 7642 goto out; 7643 } 7644 7645 /* Some actions do not requires the mutex */ 7646 switch (cmd) { 7647 case GET_ARRAY_INFO: 7648 if (!mddev->raid_disks && !mddev->external) 7649 err = -ENODEV; 7650 else 7651 err = get_array_info(mddev, argp); 7652 goto out; 7653 7654 case GET_DISK_INFO: 7655 if (!mddev->raid_disks && !mddev->external) 7656 err = -ENODEV; 7657 else 7658 err = get_disk_info(mddev, argp); 7659 goto out; 7660 7661 case SET_DISK_FAULTY: 7662 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7663 goto out; 7664 7665 case GET_BITMAP_FILE: 7666 err = get_bitmap_file(mddev, argp); 7667 goto out; 7668 7669 } 7670 7671 if (cmd == HOT_REMOVE_DISK) 7672 /* need to ensure recovery thread has run */ 7673 wait_event_interruptible_timeout(mddev->sb_wait, 7674 !test_bit(MD_RECOVERY_NEEDED, 7675 &mddev->recovery), 7676 msecs_to_jiffies(5000)); 7677 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7678 /* Need to flush page cache, and ensure no-one else opens 7679 * and writes 7680 */ 7681 mutex_lock(&mddev->open_mutex); 7682 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7683 mutex_unlock(&mddev->open_mutex); 7684 err = -EBUSY; 7685 goto out; 7686 } 7687 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7688 mutex_unlock(&mddev->open_mutex); 7689 err = -EBUSY; 7690 goto out; 7691 } 7692 did_set_md_closing = true; 7693 mutex_unlock(&mddev->open_mutex); 7694 sync_blockdev(bdev); 7695 } 7696 err = mddev_lock(mddev); 7697 if (err) { 7698 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7699 err, cmd); 7700 goto out; 7701 } 7702 7703 if (cmd == SET_ARRAY_INFO) { 7704 err = __md_set_array_info(mddev, argp); 7705 goto unlock; 7706 } 7707 7708 /* 7709 * Commands querying/configuring an existing array: 7710 */ 7711 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7712 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7713 if ((!mddev->raid_disks && !mddev->external) 7714 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7715 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7716 && cmd != GET_BITMAP_FILE) { 7717 err = -ENODEV; 7718 goto unlock; 7719 } 7720 7721 /* 7722 * Commands even a read-only array can execute: 7723 */ 7724 switch (cmd) { 7725 case RESTART_ARRAY_RW: 7726 err = restart_array(mddev); 7727 goto unlock; 7728 7729 case STOP_ARRAY: 7730 err = do_md_stop(mddev, 0, bdev); 7731 goto unlock; 7732 7733 case STOP_ARRAY_RO: 7734 err = md_set_readonly(mddev, bdev); 7735 goto unlock; 7736 7737 case HOT_REMOVE_DISK: 7738 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7739 goto unlock; 7740 7741 case ADD_NEW_DISK: 7742 /* We can support ADD_NEW_DISK on read-only arrays 
7743 * only if we are re-adding a preexisting device. 7744 * So require mddev->pers and MD_DISK_SYNC. 7745 */ 7746 if (mddev->pers) { 7747 mdu_disk_info_t info; 7748 if (copy_from_user(&info, argp, sizeof(info))) 7749 err = -EFAULT; 7750 else if (!(info.state & (1<<MD_DISK_SYNC))) 7751 /* Need to clear read-only for this */ 7752 break; 7753 else 7754 err = md_add_new_disk(mddev, &info); 7755 goto unlock; 7756 } 7757 break; 7758 } 7759 7760 /* 7761 * The remaining ioctls are changing the state of the 7762 * superblock, so we do not allow them on read-only arrays. 7763 */ 7764 if (!md_is_rdwr(mddev) && mddev->pers) { 7765 if (mddev->ro != MD_AUTO_READ) { 7766 err = -EROFS; 7767 goto unlock; 7768 } 7769 mddev->ro = MD_RDWR; 7770 sysfs_notify_dirent_safe(mddev->sysfs_state); 7771 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7772 /* mddev_unlock will wake thread */ 7773 /* If a device failed while we were read-only, we 7774 * need to make sure the metadata is updated now. 7775 */ 7776 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7777 mddev_unlock(mddev); 7778 wait_event(mddev->sb_wait, 7779 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7780 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7781 mddev_lock_nointr(mddev); 7782 } 7783 } 7784 7785 switch (cmd) { 7786 case ADD_NEW_DISK: 7787 { 7788 mdu_disk_info_t info; 7789 if (copy_from_user(&info, argp, sizeof(info))) 7790 err = -EFAULT; 7791 else 7792 err = md_add_new_disk(mddev, &info); 7793 goto unlock; 7794 } 7795 7796 case CLUSTERED_DISK_NACK: 7797 if (mddev_is_clustered(mddev)) 7798 md_cluster_ops->new_disk_ack(mddev, false); 7799 else 7800 err = -EINVAL; 7801 goto unlock; 7802 7803 case HOT_ADD_DISK: 7804 err = hot_add_disk(mddev, new_decode_dev(arg)); 7805 goto unlock; 7806 7807 case RUN_ARRAY: 7808 err = do_md_run(mddev); 7809 goto unlock; 7810 7811 case SET_BITMAP_FILE: 7812 err = set_bitmap_file(mddev, (int)arg); 7813 goto unlock; 7814 7815 default: 7816 err = -EINVAL; 7817 goto unlock; 7818 } 7819 7820 unlock: 7821 if (mddev->hold_active == UNTIL_IOCTL && 7822 err != -EINVAL) 7823 mddev->hold_active = 0; 7824 mddev_unlock(mddev); 7825 out: 7826 if(did_set_md_closing) 7827 clear_bit(MD_CLOSING, &mddev->flags); 7828 return err; 7829 } 7830 #ifdef CONFIG_COMPAT 7831 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7832 unsigned int cmd, unsigned long arg) 7833 { 7834 switch (cmd) { 7835 case HOT_REMOVE_DISK: 7836 case HOT_ADD_DISK: 7837 case SET_DISK_FAULTY: 7838 case SET_BITMAP_FILE: 7839 /* These take in integer arg, do not convert */ 7840 break; 7841 default: 7842 arg = (unsigned long)compat_ptr(arg); 7843 break; 7844 } 7845 7846 return md_ioctl(bdev, mode, cmd, arg); 7847 } 7848 #endif /* CONFIG_COMPAT */ 7849 7850 static int md_set_read_only(struct block_device *bdev, bool ro) 7851 { 7852 struct mddev *mddev = bdev->bd_disk->private_data; 7853 int err; 7854 7855 err = mddev_lock(mddev); 7856 if (err) 7857 return err; 7858 7859 if (!mddev->raid_disks && !mddev->external) { 7860 err = -ENODEV; 7861 goto out_unlock; 7862 } 7863 7864 /* 7865 * Transitioning to read-auto need only happen for arrays that call 7866 * md_write_start and which are not ready for writes yet. 
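	 * Clearing the read-only flag on such an array therefore lands it in
	 * MD_AUTO_READ (read-auto) rather than fully read-write.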
7867 */ 7868 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7869 err = restart_array(mddev); 7870 if (err) 7871 goto out_unlock; 7872 mddev->ro = MD_AUTO_READ; 7873 } 7874 7875 out_unlock: 7876 mddev_unlock(mddev); 7877 return err; 7878 } 7879 7880 static int md_open(struct gendisk *disk, blk_mode_t mode) 7881 { 7882 struct mddev *mddev; 7883 int err; 7884 7885 spin_lock(&all_mddevs_lock); 7886 mddev = mddev_get(disk->private_data); 7887 spin_unlock(&all_mddevs_lock); 7888 if (!mddev) 7889 return -ENODEV; 7890 7891 err = mutex_lock_interruptible(&mddev->open_mutex); 7892 if (err) 7893 goto out; 7894 7895 err = -ENODEV; 7896 if (test_bit(MD_CLOSING, &mddev->flags)) 7897 goto out_unlock; 7898 7899 atomic_inc(&mddev->openers); 7900 mutex_unlock(&mddev->open_mutex); 7901 7902 disk_check_media_change(disk); 7903 return 0; 7904 7905 out_unlock: 7906 mutex_unlock(&mddev->open_mutex); 7907 out: 7908 mddev_put(mddev); 7909 return err; 7910 } 7911 7912 static void md_release(struct gendisk *disk) 7913 { 7914 struct mddev *mddev = disk->private_data; 7915 7916 BUG_ON(!mddev); 7917 atomic_dec(&mddev->openers); 7918 mddev_put(mddev); 7919 } 7920 7921 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7922 { 7923 struct mddev *mddev = disk->private_data; 7924 unsigned int ret = 0; 7925 7926 if (mddev->changed) 7927 ret = DISK_EVENT_MEDIA_CHANGE; 7928 mddev->changed = 0; 7929 return ret; 7930 } 7931 7932 static void md_free_disk(struct gendisk *disk) 7933 { 7934 struct mddev *mddev = disk->private_data; 7935 7936 percpu_ref_exit(&mddev->writes_pending); 7937 mddev_free(mddev); 7938 } 7939 7940 const struct block_device_operations md_fops = 7941 { 7942 .owner = THIS_MODULE, 7943 .submit_bio = md_submit_bio, 7944 .open = md_open, 7945 .release = md_release, 7946 .ioctl = md_ioctl, 7947 #ifdef CONFIG_COMPAT 7948 .compat_ioctl = md_compat_ioctl, 7949 #endif 7950 .getgeo = md_getgeo, 7951 .check_events = md_check_events, 7952 .set_read_only = md_set_read_only, 7953 .free_disk = md_free_disk, 7954 }; 7955 7956 static int md_thread(void *arg) 7957 { 7958 struct md_thread *thread = arg; 7959 7960 /* 7961 * md_thread is a 'system-thread', it's priority should be very 7962 * high. We avoid resource deadlocks individually in each 7963 * raid personality. (RAID5 does preallocation) We also use RR and 7964 * the very same RT priority as kswapd, thus we will never get 7965 * into a priority inversion deadlock. 7966 * 7967 * we definitely have to have equal or higher priority than 7968 * bdflush, otherwise bdflush will deadlock if there are too 7969 * many dirty RAID5 blocks. 7970 */ 7971 7972 allow_signal(SIGKILL); 7973 while (!kthread_should_stop()) { 7974 7975 /* We need to wait INTERRUPTIBLE so that 7976 * we don't add to the load-average. 
7977 * That means we need to be sure no signals are 7978 * pending 7979 */ 7980 if (signal_pending(current)) 7981 flush_signals(current); 7982 7983 wait_event_interruptible_timeout 7984 (thread->wqueue, 7985 test_bit(THREAD_WAKEUP, &thread->flags) 7986 || kthread_should_stop() || kthread_should_park(), 7987 thread->timeout); 7988 7989 clear_bit(THREAD_WAKEUP, &thread->flags); 7990 if (kthread_should_park()) 7991 kthread_parkme(); 7992 if (!kthread_should_stop()) 7993 thread->run(thread); 7994 } 7995 7996 return 0; 7997 } 7998 7999 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8000 { 8001 struct md_thread *t; 8002 8003 rcu_read_lock(); 8004 t = rcu_dereference(thread); 8005 if (t) 8006 wake_up_process(t->tsk); 8007 rcu_read_unlock(); 8008 } 8009 8010 void md_wakeup_thread(struct md_thread __rcu *thread) 8011 { 8012 struct md_thread *t; 8013 8014 rcu_read_lock(); 8015 t = rcu_dereference(thread); 8016 if (t) { 8017 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8018 set_bit(THREAD_WAKEUP, &t->flags); 8019 wake_up(&t->wqueue); 8020 } 8021 rcu_read_unlock(); 8022 } 8023 EXPORT_SYMBOL(md_wakeup_thread); 8024 8025 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8026 struct mddev *mddev, const char *name) 8027 { 8028 struct md_thread *thread; 8029 8030 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8031 if (!thread) 8032 return NULL; 8033 8034 init_waitqueue_head(&thread->wqueue); 8035 8036 thread->run = run; 8037 thread->mddev = mddev; 8038 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8039 thread->tsk = kthread_run(md_thread, thread, 8040 "%s_%s", 8041 mdname(thread->mddev), 8042 name); 8043 if (IS_ERR(thread->tsk)) { 8044 kfree(thread); 8045 return NULL; 8046 } 8047 return thread; 8048 } 8049 EXPORT_SYMBOL(md_register_thread); 8050 8051 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8052 { 8053 struct md_thread *thread = rcu_dereference_protected(*threadp, 8054 lockdep_is_held(&mddev->reconfig_mutex)); 8055 8056 if (!thread) 8057 return; 8058 8059 rcu_assign_pointer(*threadp, NULL); 8060 synchronize_rcu(); 8061 8062 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8063 kthread_stop(thread->tsk); 8064 kfree(thread); 8065 } 8066 EXPORT_SYMBOL(md_unregister_thread); 8067 8068 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8069 { 8070 if (!rdev || test_bit(Faulty, &rdev->flags)) 8071 return; 8072 8073 if (!mddev->pers || !mddev->pers->error_handler) 8074 return; 8075 mddev->pers->error_handler(mddev, rdev); 8076 8077 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8078 return; 8079 8080 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8081 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8082 sysfs_notify_dirent_safe(rdev->sysfs_state); 8083 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8084 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8085 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8086 md_wakeup_thread(mddev->thread); 8087 } 8088 if (mddev->event_work.func) 8089 queue_work(md_misc_wq, &mddev->event_work); 8090 md_new_event(); 8091 } 8092 EXPORT_SYMBOL(md_error); 8093 8094 /* seq_file implementation /proc/mdstat */ 8095 8096 static void status_unused(struct seq_file *seq) 8097 { 8098 int i = 0; 8099 struct md_rdev *rdev; 8100 8101 seq_printf(seq, "unused devices: "); 8102 8103 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8104 i++; 8105 seq_printf(seq, "%pg ", rdev->bdev); 8106 } 8107 if (!i) 8108 seq_printf(seq, "<none>"); 8109 
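	/*
	 * Illustrative /proc/mdstat line emitted by this function (device
	 * names invented for the example):
	 *   unused devices: sdc1 sdd1
	 * or, when no disks are pending:
	 *   unused devices: <none>
	 */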
8110 seq_printf(seq, "\n"); 8111 } 8112 8113 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8114 { 8115 sector_t max_sectors, resync, res; 8116 unsigned long dt, db = 0; 8117 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8118 int scale, recovery_active; 8119 unsigned int per_milli; 8120 8121 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8122 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8123 max_sectors = mddev->resync_max_sectors; 8124 else 8125 max_sectors = mddev->dev_sectors; 8126 8127 resync = mddev->curr_resync; 8128 if (resync < MD_RESYNC_ACTIVE) { 8129 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8130 /* Still cleaning up */ 8131 resync = max_sectors; 8132 } else if (resync > max_sectors) { 8133 resync = max_sectors; 8134 } else { 8135 res = atomic_read(&mddev->recovery_active); 8136 /* 8137 * Resync has started, but the subtraction has overflowed or 8138 * yielded one of the special values. Force it to active to 8139 * ensure the status reports an active resync. 8140 */ 8141 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8142 resync = MD_RESYNC_ACTIVE; 8143 else 8144 resync -= res; 8145 } 8146 8147 if (resync == MD_RESYNC_NONE) { 8148 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8149 struct md_rdev *rdev; 8150 8151 rdev_for_each(rdev, mddev) 8152 if (rdev->raid_disk >= 0 && 8153 !test_bit(Faulty, &rdev->flags) && 8154 rdev->recovery_offset != MaxSector && 8155 rdev->recovery_offset) { 8156 seq_printf(seq, "\trecover=REMOTE"); 8157 return 1; 8158 } 8159 if (mddev->reshape_position != MaxSector) 8160 seq_printf(seq, "\treshape=REMOTE"); 8161 else 8162 seq_printf(seq, "\tresync=REMOTE"); 8163 return 1; 8164 } 8165 if (mddev->recovery_cp < MaxSector) { 8166 seq_printf(seq, "\tresync=PENDING"); 8167 return 1; 8168 } 8169 return 0; 8170 } 8171 if (resync < MD_RESYNC_ACTIVE) { 8172 seq_printf(seq, "\tresync=DELAYED"); 8173 return 1; 8174 } 8175 8176 WARN_ON(max_sectors == 0); 8177 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8178 * in a sector_t, and (max_sectors>>scale) will fit in a 8179 * u32, as those are the requirements for sector_div. 8180 * Thus 'scale' must be at least 10 8181 */ 8182 scale = 10; 8183 if (sizeof(sector_t) > sizeof(unsigned long)) { 8184 while ( max_sectors/2 > (1ULL<<(scale+32))) 8185 scale++; 8186 } 8187 res = (resync>>scale)*1000; 8188 sector_div(res, (u32)((max_sectors>>scale)+1)); 8189 8190 per_milli = res; 8191 { 8192 int i, x = per_milli/50, y = 20-x; 8193 seq_printf(seq, "["); 8194 for (i = 0; i < x; i++) 8195 seq_printf(seq, "="); 8196 seq_printf(seq, ">"); 8197 for (i = 0; i < y; i++) 8198 seq_printf(seq, "."); 8199 seq_printf(seq, "] "); 8200 } 8201 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8202 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8203 "reshape" : 8204 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8205 "check" : 8206 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 8207 "resync" : "recovery"))), 8208 per_milli/10, per_milli % 10, 8209 (unsigned long long) resync/2, 8210 (unsigned long long) max_sectors/2); 8211 8212 /* 8213 * dt: time from mark until now 8214 * db: blocks written from mark until now 8215 * rt: remaining time 8216 * 8217 * rt is a sector_t, which is always 64bit now. We are keeping 8218 * the original algorithm, but it is not really necessary. 8219 * 8220 * Original algorithm: 8221 * So we divide before multiply in case it is 32bit and close 8222 * to the limit. 
8223 * We scale the divisor (db) by 32 to avoid losing precision 8224 * near the end of resync when the number of remaining sectors 8225 * is close to 'db'. 8226 * We then divide rt by 32 after multiplying by db to compensate. 8227 * The '+1' avoids division by zero if db is very small. 8228 */ 8229 dt = ((jiffies - mddev->resync_mark) / HZ); 8230 if (!dt) dt++; 8231 8232 curr_mark_cnt = mddev->curr_mark_cnt; 8233 recovery_active = atomic_read(&mddev->recovery_active); 8234 resync_mark_cnt = mddev->resync_mark_cnt; 8235 8236 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8237 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8238 8239 rt = max_sectors - resync; /* number of remaining sectors */ 8240 rt = div64_u64(rt, db/32+1); 8241 rt *= dt; 8242 rt >>= 5; 8243 8244 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8245 ((unsigned long)rt % 60)/6); 8246 8247 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8248 return 1; 8249 } 8250 8251 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8252 { 8253 struct list_head *tmp; 8254 loff_t l = *pos; 8255 struct mddev *mddev; 8256 8257 if (l == 0x10000) { 8258 ++*pos; 8259 return (void *)2; 8260 } 8261 if (l > 0x10000) 8262 return NULL; 8263 if (!l--) 8264 /* header */ 8265 return (void*)1; 8266 8267 spin_lock(&all_mddevs_lock); 8268 list_for_each(tmp,&all_mddevs) 8269 if (!l--) { 8270 mddev = list_entry(tmp, struct mddev, all_mddevs); 8271 if (!mddev_get(mddev)) 8272 continue; 8273 spin_unlock(&all_mddevs_lock); 8274 return mddev; 8275 } 8276 spin_unlock(&all_mddevs_lock); 8277 if (!l--) 8278 return (void*)2;/* tail */ 8279 return NULL; 8280 } 8281 8282 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8283 { 8284 struct list_head *tmp; 8285 struct mddev *next_mddev, *mddev = v; 8286 struct mddev *to_put = NULL; 8287 8288 ++*pos; 8289 if (v == (void*)2) 8290 return NULL; 8291 8292 spin_lock(&all_mddevs_lock); 8293 if (v == (void*)1) { 8294 tmp = all_mddevs.next; 8295 } else { 8296 to_put = mddev; 8297 tmp = mddev->all_mddevs.next; 8298 } 8299 8300 for (;;) { 8301 if (tmp == &all_mddevs) { 8302 next_mddev = (void*)2; 8303 *pos = 0x10000; 8304 break; 8305 } 8306 next_mddev = list_entry(tmp, struct mddev, all_mddevs); 8307 if (mddev_get(next_mddev)) 8308 break; 8309 mddev = next_mddev; 8310 tmp = mddev->all_mddevs.next; 8311 } 8312 spin_unlock(&all_mddevs_lock); 8313 8314 if (to_put) 8315 mddev_put(to_put); 8316 return next_mddev; 8317 8318 } 8319 8320 static void md_seq_stop(struct seq_file *seq, void *v) 8321 { 8322 struct mddev *mddev = v; 8323 8324 if (mddev && v != (void*)1 && v != (void*)2) 8325 mddev_put(mddev); 8326 } 8327 8328 static int md_seq_show(struct seq_file *seq, void *v) 8329 { 8330 struct mddev *mddev = v; 8331 sector_t sectors; 8332 struct md_rdev *rdev; 8333 8334 if (v == (void*)1) { 8335 struct md_personality *pers; 8336 seq_printf(seq, "Personalities : "); 8337 spin_lock(&pers_lock); 8338 list_for_each_entry(pers, &pers_list, list) 8339 seq_printf(seq, "[%s] ", pers->name); 8340 8341 spin_unlock(&pers_lock); 8342 seq_printf(seq, "\n"); 8343 seq->poll_event = atomic_read(&md_event_count); 8344 return 0; 8345 } 8346 if (v == (void*)2) { 8347 status_unused(seq); 8348 return 0; 8349 } 8350 8351 spin_lock(&mddev->lock); 8352 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8353 seq_printf(seq, "%s : %sactive", mdname(mddev), 8354 mddev->pers ? 
"" : "in"); 8355 if (mddev->pers) { 8356 if (mddev->ro == MD_RDONLY) 8357 seq_printf(seq, " (read-only)"); 8358 if (mddev->ro == MD_AUTO_READ) 8359 seq_printf(seq, " (auto-read-only)"); 8360 seq_printf(seq, " %s", mddev->pers->name); 8361 } 8362 8363 sectors = 0; 8364 rcu_read_lock(); 8365 rdev_for_each_rcu(rdev, mddev) { 8366 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8367 8368 if (test_bit(WriteMostly, &rdev->flags)) 8369 seq_printf(seq, "(W)"); 8370 if (test_bit(Journal, &rdev->flags)) 8371 seq_printf(seq, "(J)"); 8372 if (test_bit(Faulty, &rdev->flags)) { 8373 seq_printf(seq, "(F)"); 8374 continue; 8375 } 8376 if (rdev->raid_disk < 0) 8377 seq_printf(seq, "(S)"); /* spare */ 8378 if (test_bit(Replacement, &rdev->flags)) 8379 seq_printf(seq, "(R)"); 8380 sectors += rdev->sectors; 8381 } 8382 rcu_read_unlock(); 8383 8384 if (!list_empty(&mddev->disks)) { 8385 if (mddev->pers) 8386 seq_printf(seq, "\n %llu blocks", 8387 (unsigned long long) 8388 mddev->array_sectors / 2); 8389 else 8390 seq_printf(seq, "\n %llu blocks", 8391 (unsigned long long)sectors / 2); 8392 } 8393 if (mddev->persistent) { 8394 if (mddev->major_version != 0 || 8395 mddev->minor_version != 90) { 8396 seq_printf(seq," super %d.%d", 8397 mddev->major_version, 8398 mddev->minor_version); 8399 } 8400 } else if (mddev->external) 8401 seq_printf(seq, " super external:%s", 8402 mddev->metadata_type); 8403 else 8404 seq_printf(seq, " super non-persistent"); 8405 8406 if (mddev->pers) { 8407 mddev->pers->status(seq, mddev); 8408 seq_printf(seq, "\n "); 8409 if (mddev->pers->sync_request) { 8410 if (status_resync(seq, mddev)) 8411 seq_printf(seq, "\n "); 8412 } 8413 } else 8414 seq_printf(seq, "\n "); 8415 8416 md_bitmap_status(seq, mddev->bitmap); 8417 8418 seq_printf(seq, "\n"); 8419 } 8420 spin_unlock(&mddev->lock); 8421 8422 return 0; 8423 } 8424 8425 static const struct seq_operations md_seq_ops = { 8426 .start = md_seq_start, 8427 .next = md_seq_next, 8428 .stop = md_seq_stop, 8429 .show = md_seq_show, 8430 }; 8431 8432 static int md_seq_open(struct inode *inode, struct file *file) 8433 { 8434 struct seq_file *seq; 8435 int error; 8436 8437 error = seq_open(file, &md_seq_ops); 8438 if (error) 8439 return error; 8440 8441 seq = file->private_data; 8442 seq->poll_event = atomic_read(&md_event_count); 8443 return error; 8444 } 8445 8446 static int md_unloading; 8447 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8448 { 8449 struct seq_file *seq = filp->private_data; 8450 __poll_t mask; 8451 8452 if (md_unloading) 8453 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8454 poll_wait(filp, &md_event_waiters, wait); 8455 8456 /* always allow read */ 8457 mask = EPOLLIN | EPOLLRDNORM; 8458 8459 if (seq->poll_event != atomic_read(&md_event_count)) 8460 mask |= EPOLLERR | EPOLLPRI; 8461 return mask; 8462 } 8463 8464 static const struct proc_ops mdstat_proc_ops = { 8465 .proc_open = md_seq_open, 8466 .proc_read = seq_read, 8467 .proc_lseek = seq_lseek, 8468 .proc_release = seq_release, 8469 .proc_poll = mdstat_poll, 8470 }; 8471 8472 int register_md_personality(struct md_personality *p) 8473 { 8474 pr_debug("md: %s personality registered for level %d\n", 8475 p->name, p->level); 8476 spin_lock(&pers_lock); 8477 list_add_tail(&p->list, &pers_list); 8478 spin_unlock(&pers_lock); 8479 return 0; 8480 } 8481 EXPORT_SYMBOL(register_md_personality); 8482 8483 int unregister_md_personality(struct md_personality *p) 8484 { 8485 pr_debug("md: %s personality unregistered\n", p->name); 8486 spin_lock(&pers_lock); 
8487 list_del_init(&p->list); 8488 spin_unlock(&pers_lock); 8489 return 0; 8490 } 8491 EXPORT_SYMBOL(unregister_md_personality); 8492 8493 int register_md_cluster_operations(struct md_cluster_operations *ops, 8494 struct module *module) 8495 { 8496 int ret = 0; 8497 spin_lock(&pers_lock); 8498 if (md_cluster_ops != NULL) 8499 ret = -EALREADY; 8500 else { 8501 md_cluster_ops = ops; 8502 md_cluster_mod = module; 8503 } 8504 spin_unlock(&pers_lock); 8505 return ret; 8506 } 8507 EXPORT_SYMBOL(register_md_cluster_operations); 8508 8509 int unregister_md_cluster_operations(void) 8510 { 8511 spin_lock(&pers_lock); 8512 md_cluster_ops = NULL; 8513 spin_unlock(&pers_lock); 8514 return 0; 8515 } 8516 EXPORT_SYMBOL(unregister_md_cluster_operations); 8517 8518 int md_setup_cluster(struct mddev *mddev, int nodes) 8519 { 8520 int ret; 8521 if (!md_cluster_ops) 8522 request_module("md-cluster"); 8523 spin_lock(&pers_lock); 8524 /* ensure module won't be unloaded */ 8525 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8526 pr_warn("can't find md-cluster module or get its reference.\n"); 8527 spin_unlock(&pers_lock); 8528 return -ENOENT; 8529 } 8530 spin_unlock(&pers_lock); 8531 8532 ret = md_cluster_ops->join(mddev, nodes); 8533 if (!ret) 8534 mddev->safemode_delay = 0; 8535 return ret; 8536 } 8537 8538 void md_cluster_stop(struct mddev *mddev) 8539 { 8540 if (!md_cluster_ops) 8541 return; 8542 md_cluster_ops->leave(mddev); 8543 module_put(md_cluster_mod); 8544 } 8545 8546 static int is_mddev_idle(struct mddev *mddev, int init) 8547 { 8548 struct md_rdev *rdev; 8549 int idle; 8550 int curr_events; 8551 8552 idle = 1; 8553 rcu_read_lock(); 8554 rdev_for_each_rcu(rdev, mddev) { 8555 struct gendisk *disk = rdev->bdev->bd_disk; 8556 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8557 atomic_read(&disk->sync_io); 8558 /* sync IO will cause sync_io to increase before the disk_stats 8559 * as sync_io is counted when a request starts, and 8560 * disk_stats is counted when it completes. 8561 * So resync activity will cause curr_events to be smaller than 8562 * when there was no such activity. 8563 * non-sync IO will cause disk_stat to increase without 8564 * increasing sync_io so curr_events will (eventually) 8565 * be larger than it was before. Once it becomes 8566 * substantially larger, the test below will cause 8567 * the array to appear non-idle, and resync will slow 8568 * down. 8569 * If there is a lot of outstanding resync activity when 8570 * we set last_event to curr_events, then all that activity 8571 * completing might cause the array to appear non-idle 8572 * and resync will be slowed down even though there might 8573 * not have been non-resync activity. This will only 8574 * happen once though. 'last_events' will soon reflect 8575 * the state where there is little or no outstanding 8576 * resync requests, and further resync activity will 8577 * always make curr_events less than last_events. 
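		 * Worked example (numbers invented for illustration): with
		 * last_events at 10000, a curr_events of 10050 leaves a
		 * delta of 50, inside the slack of 64 allowed below, so the
		 * rdev still looks idle; at 10100 the delta of 100 exceeds
		 * 64, last_events is reset to 10100 and the array is
		 * reported busy.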
		 *
		 */
		if (init || curr_events - rdev->last_events > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	rcu_read_unlock();
	return idle;
}

void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
 */
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return true;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* need to switch to read/write */
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
		return true;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   is_md_suspended(mddev));
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
}
EXPORT_SYMBOL(md_write_start);

/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts. Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
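 *
 * Illustrative call pattern (a sketch, not taken from any particular
 * personality) for one write request split into two parts:
 *
 *	if (!md_write_start(mddev, bio))
 *		return;				// array suspended, not recorded
 *	md_write_inc(mddev, bio);		// account the second part
 *	... submit both parts ...
 *	md_write_end(mddev);			// one md_write_end() per part
 *	md_write_end(mddev);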
8666 */ 8667 void md_write_inc(struct mddev *mddev, struct bio *bi) 8668 { 8669 if (bio_data_dir(bi) != WRITE) 8670 return; 8671 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8672 percpu_ref_get(&mddev->writes_pending); 8673 } 8674 EXPORT_SYMBOL(md_write_inc); 8675 8676 void md_write_end(struct mddev *mddev) 8677 { 8678 percpu_ref_put(&mddev->writes_pending); 8679 8680 if (mddev->safemode == 2) 8681 md_wakeup_thread(mddev->thread); 8682 else if (mddev->safemode_delay) 8683 /* The roundup() ensures this only performs locking once 8684 * every ->safemode_delay jiffies 8685 */ 8686 mod_timer(&mddev->safemode_timer, 8687 roundup(jiffies, mddev->safemode_delay) + 8688 mddev->safemode_delay); 8689 } 8690 8691 EXPORT_SYMBOL(md_write_end); 8692 8693 /* This is used by raid0 and raid10 */ 8694 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8695 struct bio *bio, sector_t start, sector_t size) 8696 { 8697 struct bio *discard_bio = NULL; 8698 8699 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8700 &discard_bio) || !discard_bio) 8701 return; 8702 8703 bio_chain(discard_bio, bio); 8704 bio_clone_blkg_association(discard_bio, bio); 8705 if (mddev->gendisk) 8706 trace_block_bio_remap(discard_bio, 8707 disk_devt(mddev->gendisk), 8708 bio->bi_iter.bi_sector); 8709 submit_bio_noacct(discard_bio); 8710 } 8711 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8712 8713 static void md_end_clone_io(struct bio *bio) 8714 { 8715 struct md_io_clone *md_io_clone = bio->bi_private; 8716 struct bio *orig_bio = md_io_clone->orig_bio; 8717 struct mddev *mddev = md_io_clone->mddev; 8718 8719 if (bio->bi_status && !orig_bio->bi_status) 8720 orig_bio->bi_status = bio->bi_status; 8721 8722 if (md_io_clone->start_time) 8723 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8724 8725 bio_put(bio); 8726 bio_endio(orig_bio); 8727 percpu_ref_put(&mddev->active_io); 8728 } 8729 8730 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8731 { 8732 struct block_device *bdev = (*bio)->bi_bdev; 8733 struct md_io_clone *md_io_clone; 8734 struct bio *clone = 8735 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8736 8737 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8738 md_io_clone->orig_bio = *bio; 8739 md_io_clone->mddev = mddev; 8740 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8741 md_io_clone->start_time = bio_start_io_acct(*bio); 8742 8743 clone->bi_end_io = md_end_clone_io; 8744 clone->bi_private = md_io_clone; 8745 *bio = clone; 8746 } 8747 8748 void md_account_bio(struct mddev *mddev, struct bio **bio) 8749 { 8750 percpu_ref_get(&mddev->active_io); 8751 md_clone_bio(mddev, bio); 8752 } 8753 EXPORT_SYMBOL_GPL(md_account_bio); 8754 8755 /* md_allow_write(mddev) 8756 * Calling this ensures that the array is marked 'active' so that writes 8757 * may proceed without blocking. It is important to call this before 8758 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8759 * Must be called with mddev_lock held. 
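 *
 * Illustrative call site (a sketch only; "conf" is a made-up name):
 *
 *	// reconfig_mutex held by the caller
 *	md_allow_write(mddev);
 *	conf = kzalloc(sizeof(*conf), GFP_KERNEL);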
8760 */ 8761 void md_allow_write(struct mddev *mddev) 8762 { 8763 if (!mddev->pers) 8764 return; 8765 if (!md_is_rdwr(mddev)) 8766 return; 8767 if (!mddev->pers->sync_request) 8768 return; 8769 8770 spin_lock(&mddev->lock); 8771 if (mddev->in_sync) { 8772 mddev->in_sync = 0; 8773 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8774 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8775 if (mddev->safemode_delay && 8776 mddev->safemode == 0) 8777 mddev->safemode = 1; 8778 spin_unlock(&mddev->lock); 8779 md_update_sb(mddev, 0); 8780 sysfs_notify_dirent_safe(mddev->sysfs_state); 8781 /* wait for the dirty state to be recorded in the metadata */ 8782 wait_event(mddev->sb_wait, 8783 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8784 } else 8785 spin_unlock(&mddev->lock); 8786 } 8787 EXPORT_SYMBOL_GPL(md_allow_write); 8788 8789 #define SYNC_MARKS 10 8790 #define SYNC_MARK_STEP (3*HZ) 8791 #define UPDATE_FREQUENCY (5*60*HZ) 8792 void md_do_sync(struct md_thread *thread) 8793 { 8794 struct mddev *mddev = thread->mddev; 8795 struct mddev *mddev2; 8796 unsigned int currspeed = 0, window; 8797 sector_t max_sectors,j, io_sectors, recovery_done; 8798 unsigned long mark[SYNC_MARKS]; 8799 unsigned long update_time; 8800 sector_t mark_cnt[SYNC_MARKS]; 8801 int last_mark,m; 8802 sector_t last_check; 8803 int skipped = 0; 8804 struct md_rdev *rdev; 8805 char *desc, *action = NULL; 8806 struct blk_plug plug; 8807 int ret; 8808 8809 /* just incase thread restarts... */ 8810 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8811 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8812 return; 8813 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8814 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8815 return; 8816 } 8817 8818 if (mddev_is_clustered(mddev)) { 8819 ret = md_cluster_ops->resync_start(mddev); 8820 if (ret) 8821 goto skip; 8822 8823 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8824 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8825 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8826 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8827 && ((unsigned long long)mddev->curr_resync_completed 8828 < (unsigned long long)mddev->resync_max_sectors)) 8829 goto skip; 8830 } 8831 8832 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8833 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8834 desc = "data-check"; 8835 action = "check"; 8836 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8837 desc = "requested-resync"; 8838 action = "repair"; 8839 } else 8840 desc = "resync"; 8841 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8842 desc = "reshape"; 8843 else 8844 desc = "recovery"; 8845 8846 mddev->last_sync_action = action ?: desc; 8847 8848 /* 8849 * Before starting a resync we must have set curr_resync to 8850 * 2, and then checked that every "conflicting" array has curr_resync 8851 * less than ours. When we find one that is the same or higher 8852 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8853 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8854 * This will mean we have to start checking from the beginning again. 
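 *
 * Illustrative example, assuming md0 and md1 share a physical device and
 * both want to resync at once: the mddev with the lower address drops
 * from MD_RESYNC_DELAYED to MD_RESYNC_YIELDED and sleeps on resync_wait
 * while the other array proceeds; when the winner finishes it wakes
 * resync_wait and the yielded array restarts these checks from the top.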
8855 * 8856 */ 8857 8858 do { 8859 int mddev2_minor = -1; 8860 mddev->curr_resync = MD_RESYNC_DELAYED; 8861 8862 try_again: 8863 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8864 goto skip; 8865 spin_lock(&all_mddevs_lock); 8866 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8867 if (test_bit(MD_DELETED, &mddev2->flags)) 8868 continue; 8869 if (mddev2 == mddev) 8870 continue; 8871 if (!mddev->parallel_resync 8872 && mddev2->curr_resync 8873 && match_mddev_units(mddev, mddev2)) { 8874 DEFINE_WAIT(wq); 8875 if (mddev < mddev2 && 8876 mddev->curr_resync == MD_RESYNC_DELAYED) { 8877 /* arbitrarily yield */ 8878 mddev->curr_resync = MD_RESYNC_YIELDED; 8879 wake_up(&resync_wait); 8880 } 8881 if (mddev > mddev2 && 8882 mddev->curr_resync == MD_RESYNC_YIELDED) 8883 /* no need to wait here, we can wait the next 8884 * time 'round when curr_resync == 2 8885 */ 8886 continue; 8887 /* We need to wait 'interruptible' so as not to 8888 * contribute to the load average, and not to 8889 * be caught by 'softlockup' 8890 */ 8891 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8892 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8893 mddev2->curr_resync >= mddev->curr_resync) { 8894 if (mddev2_minor != mddev2->md_minor) { 8895 mddev2_minor = mddev2->md_minor; 8896 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8897 desc, mdname(mddev), 8898 mdname(mddev2)); 8899 } 8900 spin_unlock(&all_mddevs_lock); 8901 8902 if (signal_pending(current)) 8903 flush_signals(current); 8904 schedule(); 8905 finish_wait(&resync_wait, &wq); 8906 goto try_again; 8907 } 8908 finish_wait(&resync_wait, &wq); 8909 } 8910 } 8911 spin_unlock(&all_mddevs_lock); 8912 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8913 8914 j = 0; 8915 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8916 /* resync follows the size requested by the personality, 8917 * which defaults to physical size, but can be virtual size 8918 */ 8919 max_sectors = mddev->resync_max_sectors; 8920 atomic64_set(&mddev->resync_mismatches, 0); 8921 /* we don't use the checkpoint if there's a bitmap */ 8922 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8923 j = mddev->resync_min; 8924 else if (!mddev->bitmap) 8925 j = mddev->recovery_cp; 8926 8927 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8928 max_sectors = mddev->resync_max_sectors; 8929 /* 8930 * If the original node aborts reshaping then we continue the 8931 * reshaping, so set j again to avoid restart reshape from the 8932 * first beginning 8933 */ 8934 if (mddev_is_clustered(mddev) && 8935 mddev->reshape_position != MaxSector) 8936 j = mddev->reshape_position; 8937 } else { 8938 /* recovery follows the physical size of devices */ 8939 max_sectors = mddev->dev_sectors; 8940 j = MaxSector; 8941 rcu_read_lock(); 8942 rdev_for_each_rcu(rdev, mddev) 8943 if (rdev->raid_disk >= 0 && 8944 !test_bit(Journal, &rdev->flags) && 8945 !test_bit(Faulty, &rdev->flags) && 8946 !test_bit(In_sync, &rdev->flags) && 8947 rdev->recovery_offset < j) 8948 j = rdev->recovery_offset; 8949 rcu_read_unlock(); 8950 8951 /* If there is a bitmap, we need to make sure all 8952 * writes that started before we added a spare 8953 * complete before we start doing a recovery. 8954 * Otherwise the write might complete and (via 8955 * bitmap_endwrite) set a bit in the bitmap after the 8956 * recovery has checked that bit and skipped that 8957 * region. 
8958 */ 8959 if (mddev->bitmap) { 8960 mddev->pers->quiesce(mddev, 1); 8961 mddev->pers->quiesce(mddev, 0); 8962 } 8963 } 8964 8965 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8966 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8967 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8968 speed_max(mddev), desc); 8969 8970 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8971 8972 io_sectors = 0; 8973 for (m = 0; m < SYNC_MARKS; m++) { 8974 mark[m] = jiffies; 8975 mark_cnt[m] = io_sectors; 8976 } 8977 last_mark = 0; 8978 mddev->resync_mark = mark[last_mark]; 8979 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8980 8981 /* 8982 * Tune reconstruction: 8983 */ 8984 window = 32 * (PAGE_SIZE / 512); 8985 pr_debug("md: using %dk window, over a total of %lluk.\n", 8986 window/2, (unsigned long long)max_sectors/2); 8987 8988 atomic_set(&mddev->recovery_active, 0); 8989 last_check = 0; 8990 8991 if (j >= MD_RESYNC_ACTIVE) { 8992 pr_debug("md: resuming %s of %s from checkpoint.\n", 8993 desc, mdname(mddev)); 8994 mddev->curr_resync = j; 8995 } else 8996 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8997 mddev->curr_resync_completed = j; 8998 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8999 md_new_event(); 9000 update_time = jiffies; 9001 9002 blk_start_plug(&plug); 9003 while (j < max_sectors) { 9004 sector_t sectors; 9005 9006 skipped = 0; 9007 9008 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9009 ((mddev->curr_resync > mddev->curr_resync_completed && 9010 (mddev->curr_resync - mddev->curr_resync_completed) 9011 > (max_sectors >> 4)) || 9012 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9013 (j - mddev->curr_resync_completed)*2 9014 >= mddev->resync_max - mddev->curr_resync_completed || 9015 mddev->curr_resync_completed > mddev->resync_max 9016 )) { 9017 /* time to update curr_resync_completed */ 9018 wait_event(mddev->recovery_wait, 9019 atomic_read(&mddev->recovery_active) == 0); 9020 mddev->curr_resync_completed = j; 9021 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9022 j > mddev->recovery_cp) 9023 mddev->recovery_cp = j; 9024 update_time = jiffies; 9025 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9026 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9027 } 9028 9029 while (j >= mddev->resync_max && 9030 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9031 /* As this condition is controlled by user-space, 9032 * we can block indefinitely, so use '_interruptible' 9033 * to avoid triggering warnings. 9034 */ 9035 flush_signals(current); /* just in case */ 9036 wait_event_interruptible(mddev->recovery_wait, 9037 mddev->resync_max > j 9038 || test_bit(MD_RECOVERY_INTR, 9039 &mddev->recovery)); 9040 } 9041 9042 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9043 break; 9044 9045 sectors = mddev->pers->sync_request(mddev, j, &skipped); 9046 if (sectors == 0) { 9047 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9048 break; 9049 } 9050 9051 if (!skipped) { /* actual IO requested */ 9052 io_sectors += sectors; 9053 atomic_add(sectors, &mddev->recovery_active); 9054 } 9055 9056 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9057 break; 9058 9059 j += sectors; 9060 if (j > max_sectors) 9061 /* when skipping, extra large numbers can be returned. 
*/ 9062 j = max_sectors; 9063 if (j >= MD_RESYNC_ACTIVE) 9064 mddev->curr_resync = j; 9065 mddev->curr_mark_cnt = io_sectors; 9066 if (last_check == 0) 9067 /* this is the earliest that rebuild will be 9068 * visible in /proc/mdstat 9069 */ 9070 md_new_event(); 9071 9072 if (last_check + window > io_sectors || j == max_sectors) 9073 continue; 9074 9075 last_check = io_sectors; 9076 repeat: 9077 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9078 /* step marks */ 9079 int next = (last_mark+1) % SYNC_MARKS; 9080 9081 mddev->resync_mark = mark[next]; 9082 mddev->resync_mark_cnt = mark_cnt[next]; 9083 mark[next] = jiffies; 9084 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9085 last_mark = next; 9086 } 9087 9088 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9089 break; 9090 9091 /* 9092 * this loop exits only if either when we are slower than 9093 * the 'hard' speed limit, or the system was IO-idle for 9094 * a jiffy. 9095 * the system might be non-idle CPU-wise, but we only care 9096 * about not overloading the IO subsystem. (things like an 9097 * e2fsck being done on the RAID array should execute fast) 9098 */ 9099 cond_resched(); 9100 9101 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9102 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9103 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9104 9105 if (currspeed > speed_min(mddev)) { 9106 if (currspeed > speed_max(mddev)) { 9107 msleep(500); 9108 goto repeat; 9109 } 9110 if (!is_mddev_idle(mddev, 0)) { 9111 /* 9112 * Give other IO more of a chance. 9113 * The faster the devices, the less we wait. 9114 */ 9115 wait_event(mddev->recovery_wait, 9116 !atomic_read(&mddev->recovery_active)); 9117 } 9118 } 9119 } 9120 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9121 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9122 ? 
"interrupted" : "done"); 9123 /* 9124 * this also signals 'finished resyncing' to md_stop 9125 */ 9126 blk_finish_plug(&plug); 9127 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9128 9129 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9130 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9131 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9132 mddev->curr_resync_completed = mddev->curr_resync; 9133 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9134 } 9135 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9136 9137 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9138 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9139 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9140 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9141 if (mddev->curr_resync >= mddev->recovery_cp) { 9142 pr_debug("md: checkpointing %s of %s.\n", 9143 desc, mdname(mddev)); 9144 if (test_bit(MD_RECOVERY_ERROR, 9145 &mddev->recovery)) 9146 mddev->recovery_cp = 9147 mddev->curr_resync_completed; 9148 else 9149 mddev->recovery_cp = 9150 mddev->curr_resync; 9151 } 9152 } else 9153 mddev->recovery_cp = MaxSector; 9154 } else { 9155 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9156 mddev->curr_resync = MaxSector; 9157 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9158 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9159 rcu_read_lock(); 9160 rdev_for_each_rcu(rdev, mddev) 9161 if (rdev->raid_disk >= 0 && 9162 mddev->delta_disks >= 0 && 9163 !test_bit(Journal, &rdev->flags) && 9164 !test_bit(Faulty, &rdev->flags) && 9165 !test_bit(In_sync, &rdev->flags) && 9166 rdev->recovery_offset < mddev->curr_resync) 9167 rdev->recovery_offset = mddev->curr_resync; 9168 rcu_read_unlock(); 9169 } 9170 } 9171 } 9172 skip: 9173 /* set CHANGE_PENDING here since maybe another update is needed, 9174 * so other nodes are informed. It should be harmless for normal 9175 * raid */ 9176 set_mask_bits(&mddev->sb_flags, 0, 9177 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9178 9179 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9180 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9181 mddev->delta_disks > 0 && 9182 mddev->pers->finish_reshape && 9183 mddev->pers->size && 9184 mddev->queue) { 9185 mddev_lock_nointr(mddev); 9186 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9187 mddev_unlock(mddev); 9188 if (!mddev_is_clustered(mddev)) 9189 set_capacity_and_notify(mddev->gendisk, 9190 mddev->array_sectors); 9191 } 9192 9193 spin_lock(&mddev->lock); 9194 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9195 /* We completed so min/max setting can be forgotten if used. 
*/ 9196 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9197 mddev->resync_min = 0; 9198 mddev->resync_max = MaxSector; 9199 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9200 mddev->resync_min = mddev->curr_resync_completed; 9201 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9202 mddev->curr_resync = MD_RESYNC_NONE; 9203 spin_unlock(&mddev->lock); 9204 9205 wake_up(&resync_wait); 9206 wake_up(&mddev->sb_wait); 9207 md_wakeup_thread(mddev->thread); 9208 return; 9209 } 9210 EXPORT_SYMBOL_GPL(md_do_sync); 9211 9212 static int remove_and_add_spares(struct mddev *mddev, 9213 struct md_rdev *this) 9214 { 9215 struct md_rdev *rdev; 9216 int spares = 0; 9217 int removed = 0; 9218 bool remove_some = false; 9219 9220 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9221 /* Mustn't remove devices when resync thread is running */ 9222 return 0; 9223 9224 rdev_for_each(rdev, mddev) { 9225 if ((this == NULL || rdev == this) && 9226 rdev->raid_disk >= 0 && 9227 !test_bit(Blocked, &rdev->flags) && 9228 test_bit(Faulty, &rdev->flags) && 9229 atomic_read(&rdev->nr_pending)==0) { 9230 /* Faulty non-Blocked devices with nr_pending == 0 9231 * never get nr_pending incremented, 9232 * never get Faulty cleared, and never get Blocked set. 9233 * So we can synchronize_rcu now rather than once per device 9234 */ 9235 remove_some = true; 9236 set_bit(RemoveSynchronized, &rdev->flags); 9237 } 9238 } 9239 9240 if (remove_some) 9241 synchronize_rcu(); 9242 rdev_for_each(rdev, mddev) { 9243 if ((this == NULL || rdev == this) && 9244 rdev->raid_disk >= 0 && 9245 !test_bit(Blocked, &rdev->flags) && 9246 ((test_bit(RemoveSynchronized, &rdev->flags) || 9247 (!test_bit(In_sync, &rdev->flags) && 9248 !test_bit(Journal, &rdev->flags))) && 9249 atomic_read(&rdev->nr_pending)==0)) { 9250 if (mddev->pers->hot_remove_disk( 9251 mddev, rdev) == 0) { 9252 sysfs_unlink_rdev(mddev, rdev); 9253 rdev->saved_raid_disk = rdev->raid_disk; 9254 rdev->raid_disk = -1; 9255 removed++; 9256 } 9257 } 9258 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 9259 clear_bit(RemoveSynchronized, &rdev->flags); 9260 } 9261 9262 if (removed && mddev->kobj.sd) 9263 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9264 9265 if (this && removed) 9266 goto no_add; 9267 9268 rdev_for_each(rdev, mddev) { 9269 if (this && this != rdev) 9270 continue; 9271 if (test_bit(Candidate, &rdev->flags)) 9272 continue; 9273 if (rdev->raid_disk >= 0 && 9274 !test_bit(In_sync, &rdev->flags) && 9275 !test_bit(Journal, &rdev->flags) && 9276 !test_bit(Faulty, &rdev->flags)) 9277 spares++; 9278 if (rdev->raid_disk >= 0) 9279 continue; 9280 if (test_bit(Faulty, &rdev->flags)) 9281 continue; 9282 if (!test_bit(Journal, &rdev->flags)) { 9283 if (!md_is_rdwr(mddev) && 9284 !(rdev->saved_raid_disk >= 0 && 9285 !test_bit(Bitmap_sync, &rdev->flags))) 9286 continue; 9287 9288 rdev->recovery_offset = 0; 9289 } 9290 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9291 /* failure here is OK */ 9292 sysfs_link_rdev(mddev, rdev); 9293 if (!test_bit(Journal, &rdev->flags)) 9294 spares++; 9295 md_new_event(); 9296 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9297 } 9298 } 9299 no_add: 9300 if (removed) 9301 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9302 return spares; 9303 } 9304 9305 static void md_start_sync(struct work_struct *ws) 9306 { 9307 struct mddev *mddev = container_of(ws, struct mddev, del_work); 9308 9309 rcu_assign_pointer(mddev->sync_thread, 9310 md_register_thread(md_do_sync, mddev, "resync")); 9311 if 
(!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ If the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If the array is degraded, try to add spare devices.
 *  6/ If the array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (is_md_suspended(mddev))
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (!md_is_rdwr(mddev) &&
	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( !
( 9386 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9387 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9388 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9389 (mddev->external == 0 && mddev->safemode == 1) || 9390 (mddev->safemode == 2 9391 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9392 )) 9393 return; 9394 9395 if (mddev_trylock(mddev)) { 9396 int spares = 0; 9397 bool try_set_sync = mddev->safemode != 0; 9398 9399 if (!mddev->external && mddev->safemode == 1) 9400 mddev->safemode = 0; 9401 9402 if (!md_is_rdwr(mddev)) { 9403 struct md_rdev *rdev; 9404 if (!mddev->external && mddev->in_sync) 9405 /* 'Blocked' flag not needed as failed devices 9406 * will be recorded if array switched to read/write. 9407 * Leaving it set will prevent the device 9408 * from being removed. 9409 */ 9410 rdev_for_each(rdev, mddev) 9411 clear_bit(Blocked, &rdev->flags); 9412 /* On a read-only array we can: 9413 * - remove failed devices 9414 * - add already-in_sync devices if the array itself 9415 * is in-sync. 9416 * As we only add devices that are already in-sync, 9417 * we can activate the spares immediately. 9418 */ 9419 remove_and_add_spares(mddev, NULL); 9420 /* There is no thread, but we need to call 9421 * ->spare_active and clear saved_raid_disk 9422 */ 9423 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9424 md_reap_sync_thread(mddev); 9425 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9426 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9427 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9428 goto unlock; 9429 } 9430 9431 if (mddev_is_clustered(mddev)) { 9432 struct md_rdev *rdev, *tmp; 9433 /* kick the device if another node issued a 9434 * remove disk. 9435 */ 9436 rdev_for_each_safe(rdev, tmp, mddev) { 9437 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9438 rdev->raid_disk < 0) 9439 md_kick_rdev_from_array(rdev); 9440 } 9441 } 9442 9443 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9444 spin_lock(&mddev->lock); 9445 set_in_sync(mddev); 9446 spin_unlock(&mddev->lock); 9447 } 9448 9449 if (mddev->sb_flags) 9450 md_update_sb(mddev, 0); 9451 9452 /* 9453 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9454 * still set. 9455 */ 9456 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9457 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9458 /* resync/recovery still happening */ 9459 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9460 goto unlock; 9461 } 9462 9463 if (WARN_ON_ONCE(!mddev->sync_thread)) 9464 goto unlock; 9465 9466 md_reap_sync_thread(mddev); 9467 goto unlock; 9468 } 9469 9470 /* Set RUNNING before clearing NEEDED to avoid 9471 * any transients in the value of "sync_action". 9472 */ 9473 mddev->curr_resync_completed = 0; 9474 spin_lock(&mddev->lock); 9475 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9476 spin_unlock(&mddev->lock); 9477 /* Clear some bits that don't mean anything, but 9478 * might be left set 9479 */ 9480 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9481 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9482 9483 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9484 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 9485 goto not_running; 9486 /* no recovery is running. 9487 * remove any failed drives, then 9488 * add spares if possible. 9489 * Spares are also removed and re-added, to allow 9490 * the personality to fail the re-add. 
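		 * Informal summary of the branches below: a pending reshape
		 * is started first; otherwise newly usable spares trigger a
		 * recovery; otherwise an incomplete recovery_cp triggers a
		 * resync; and if none of those apply (and no sync was
		 * explicitly requested) the attempt ends at not_running.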
9491 */ 9492 9493 if (mddev->reshape_position != MaxSector) { 9494 if (mddev->pers->check_reshape == NULL || 9495 mddev->pers->check_reshape(mddev) != 0) 9496 /* Cannot proceed */ 9497 goto not_running; 9498 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9499 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9500 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 9501 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9502 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9503 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9504 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9505 } else if (mddev->recovery_cp < MaxSector) { 9506 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9507 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9508 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9509 /* nothing to be done ... */ 9510 goto not_running; 9511 9512 if (mddev->pers->sync_request) { 9513 if (spares) { 9514 /* We are adding a device or devices to an array 9515 * which has the bitmap stored on all devices. 9516 * So make sure all bitmap pages get written 9517 */ 9518 md_bitmap_write_all(mddev->bitmap); 9519 } 9520 INIT_WORK(&mddev->del_work, md_start_sync); 9521 queue_work(md_misc_wq, &mddev->del_work); 9522 goto unlock; 9523 } 9524 not_running: 9525 if (!mddev->sync_thread) { 9526 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9527 wake_up(&resync_wait); 9528 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 9529 &mddev->recovery)) 9530 if (mddev->sysfs_action) 9531 sysfs_notify_dirent_safe(mddev->sysfs_action); 9532 } 9533 unlock: 9534 wake_up(&mddev->sb_wait); 9535 mddev_unlock(mddev); 9536 } 9537 } 9538 EXPORT_SYMBOL(md_check_recovery); 9539 9540 void md_reap_sync_thread(struct mddev *mddev) 9541 { 9542 struct md_rdev *rdev; 9543 sector_t old_dev_sectors = mddev->dev_sectors; 9544 bool is_reshaped = false; 9545 9546 /* resync has finished, collect result */ 9547 md_unregister_thread(mddev, &mddev->sync_thread); 9548 atomic_inc(&mddev->sync_seq); 9549 9550 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9551 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9552 mddev->degraded != mddev->raid_disks) { 9553 /* success...*/ 9554 /* activate any spares */ 9555 if (mddev->pers->spare_active(mddev)) { 9556 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9557 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9558 } 9559 } 9560 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9561 mddev->pers->finish_reshape) { 9562 mddev->pers->finish_reshape(mddev); 9563 if (mddev_is_clustered(mddev)) 9564 is_reshaped = true; 9565 } 9566 9567 /* If array is no-longer degraded, then any saved_raid_disk 9568 * information must be scrapped. 
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across the cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by the personality module when reshape completes.
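	 *
	 * Worked example (illustrative numbers): if the reshape moved the
	 * data area 1024 sectors closer to the start of the device, i.e.
	 * new_data_offset == data_offset - 1024, the loop below grows
	 * rdev->sectors by those 1024 sectors before updating data_offset;
	 * a move in the other direction shrinks it instead.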
*/ 9619 struct md_rdev *rdev; 9620 9621 rdev_for_each(rdev, mddev) { 9622 if (rdev->data_offset > rdev->new_data_offset) 9623 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9624 else 9625 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9626 rdev->data_offset = rdev->new_data_offset; 9627 } 9628 } 9629 EXPORT_SYMBOL(md_finish_reshape); 9630 9631 /* Bad block management */ 9632 9633 /* Returns 1 on success, 0 on failure */ 9634 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9635 int is_new) 9636 { 9637 struct mddev *mddev = rdev->mddev; 9638 int rv; 9639 if (is_new) 9640 s += rdev->new_data_offset; 9641 else 9642 s += rdev->data_offset; 9643 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9644 if (rv == 0) { 9645 /* Make sure they get written out promptly */ 9646 if (test_bit(ExternalBbl, &rdev->flags)) 9647 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9648 sysfs_notify_dirent_safe(rdev->sysfs_state); 9649 set_mask_bits(&mddev->sb_flags, 0, 9650 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9651 md_wakeup_thread(rdev->mddev->thread); 9652 return 1; 9653 } else 9654 return 0; 9655 } 9656 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9657 9658 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9659 int is_new) 9660 { 9661 int rv; 9662 if (is_new) 9663 s += rdev->new_data_offset; 9664 else 9665 s += rdev->data_offset; 9666 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9667 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9668 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9669 return rv; 9670 } 9671 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9672 9673 static int md_notify_reboot(struct notifier_block *this, 9674 unsigned long code, void *x) 9675 { 9676 struct mddev *mddev, *n; 9677 int need_delay = 0; 9678 9679 spin_lock(&all_mddevs_lock); 9680 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9681 if (!mddev_get(mddev)) 9682 continue; 9683 spin_unlock(&all_mddevs_lock); 9684 if (mddev_trylock(mddev)) { 9685 if (mddev->pers) 9686 __md_stop_writes(mddev); 9687 if (mddev->persistent) 9688 mddev->safemode = 2; 9689 mddev_unlock(mddev); 9690 } 9691 need_delay = 1; 9692 mddev_put(mddev); 9693 spin_lock(&all_mddevs_lock); 9694 } 9695 spin_unlock(&all_mddevs_lock); 9696 9697 /* 9698 * certain more exotic SCSI devices are known to be 9699 * volatile wrt too early system reboots. While the 9700 * right place to handle this issue is the given 9701 * driver, we do want to have a safe RAID driver ... 
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev, *n;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to lose data if the
	 * system reboots too soon after the last write.  The right place to
	 * handle this is the individual device driver, but we still want a
	 * safe RAID driver, so give them a moment to settle.
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node, we need to
	 * resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for a change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device was activated on the remote node,
			 * unless a reshape is happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* Wake up mddev->thread so the array can
				 * resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* The device is faulty.
			 * Do just the minimum needed to mark the disk as
			 * faulty; the recovery is performed by the node
			 * that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * mddev->delta_disks has already been updated by update_raid_disks(),
	 * so now check the reshape state.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node; update
		 * reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally, bring the event count up to date */
	mddev->events = le64_to_cpu(sb->events);
}
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the rdev's current sb page in 'swapout' so it can be
	 * restored if the reload fails.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Pick up the recovery offset if the other node has set
	 * MD_FEATURE_RECOVERY_OFFSET.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * the device In_sync and update mddev->degraded.
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);
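/*
 * Illustrative sketch, not part of md.c: md_reload_sb() is exported for
 * md-cluster, which calls it roughly like this when another node announces
 * a metadata update, so that check_sb_changes() can propagate any size or
 * role changes locally.  The function name below is hypothetical.
 */
static void example_process_metadata_update(struct mddev *mddev)
{
	md_reload_sb(mddev, mddev->good_device_nr);
}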
#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
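/*
 * Usage note (illustrative, not from the original source): the parameters
 * above belong to the md_mod module, so they can be set on the kernel
 * command line, e.g. "md_mod.start_ro=1", or at runtime through
 * /sys/module/md_mod/parameters/.  For example, mdadm pre-creates an array
 * node by writing a name such as "md127" to
 * /sys/module/md_mod/parameters/new_array (handled by add_named_array()).
 */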