// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.
37 38 */ 39 40 #include <linux/sched/mm.h> 41 #include <linux/sched/signal.h> 42 #include <linux/kthread.h> 43 #include <linux/blkdev.h> 44 #include <linux/blk-integrity.h> 45 #include <linux/badblocks.h> 46 #include <linux/sysctl.h> 47 #include <linux/seq_file.h> 48 #include <linux/fs.h> 49 #include <linux/poll.h> 50 #include <linux/ctype.h> 51 #include <linux/string.h> 52 #include <linux/hdreg.h> 53 #include <linux/proc_fs.h> 54 #include <linux/random.h> 55 #include <linux/major.h> 56 #include <linux/module.h> 57 #include <linux/reboot.h> 58 #include <linux/file.h> 59 #include <linux/compat.h> 60 #include <linux/delay.h> 61 #include <linux/raid/md_p.h> 62 #include <linux/raid/md_u.h> 63 #include <linux/raid/detect.h> 64 #include <linux/slab.h> 65 #include <linux/percpu-refcount.h> 66 #include <linux/part_stat.h> 67 68 #include <trace/events/block.h> 69 #include "md.h" 70 #include "md-bitmap.h" 71 #include "md-cluster.h" 72 73 /* pers_list is a list of registered personalities protected 74 * by pers_lock. 75 * pers_lock does extra service to protect accesses to 76 * mddev->thread when the mutex cannot be held. 77 */ 78 static LIST_HEAD(pers_list); 79 static DEFINE_SPINLOCK(pers_lock); 80 81 static struct kobj_type md_ktype; 82 83 struct md_cluster_operations *md_cluster_ops; 84 EXPORT_SYMBOL(md_cluster_ops); 85 static struct module *md_cluster_mod; 86 87 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 88 static struct workqueue_struct *md_wq; 89 static struct workqueue_struct *md_misc_wq; 90 static struct workqueue_struct *md_rdev_misc_wq; 91 92 static int remove_and_add_spares(struct mddev *mddev, 93 struct md_rdev *this); 94 static void mddev_detach(struct mddev *mddev); 95 96 /* 97 * Default number of read corrections we'll attempt on an rdev 98 * before ejecting it from the array. We divide the read error 99 * count by 2 for every hour elapsed between read errors. 
100 */ 101 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 102 /* Default safemode delay: 200 msec */ 103 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 104 /* 105 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 106 * is 1000 KB/sec, so the extra system load does not show up that much. 107 * Increase it if you want to have more _guaranteed_ speed. Note that 108 * the RAID driver will use the maximum available bandwidth if the IO 109 * subsystem is idle. There is also an 'absolute maximum' reconstruction 110 * speed limit - in case reconstruction slows down your system despite 111 * idle IO detection. 112 * 113 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 114 * or /sys/block/mdX/md/sync_speed_{min,max} 115 */ 116 117 static int sysctl_speed_limit_min = 1000; 118 static int sysctl_speed_limit_max = 200000; 119 static inline int speed_min(struct mddev *mddev) 120 { 121 return mddev->sync_speed_min ? 122 mddev->sync_speed_min : sysctl_speed_limit_min; 123 } 124 125 static inline int speed_max(struct mddev *mddev) 126 { 127 return mddev->sync_speed_max ? 
128 mddev->sync_speed_max : sysctl_speed_limit_max; 129 } 130 131 static void rdev_uninit_serial(struct md_rdev *rdev) 132 { 133 if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) 134 return; 135 136 kvfree(rdev->serial); 137 rdev->serial = NULL; 138 } 139 140 static void rdevs_uninit_serial(struct mddev *mddev) 141 { 142 struct md_rdev *rdev; 143 144 rdev_for_each(rdev, mddev) 145 rdev_uninit_serial(rdev); 146 } 147 148 static int rdev_init_serial(struct md_rdev *rdev) 149 { 150 /* serial_nums equals with BARRIER_BUCKETS_NR */ 151 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); 152 struct serial_in_rdev *serial = NULL; 153 154 if (test_bit(CollisionCheck, &rdev->flags)) 155 return 0; 156 157 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, 158 GFP_KERNEL); 159 if (!serial) 160 return -ENOMEM; 161 162 for (i = 0; i < serial_nums; i++) { 163 struct serial_in_rdev *serial_tmp = &serial[i]; 164 165 spin_lock_init(&serial_tmp->serial_lock); 166 serial_tmp->serial_rb = RB_ROOT_CACHED; 167 init_waitqueue_head(&serial_tmp->serial_io_wait); 168 } 169 170 rdev->serial = serial; 171 set_bit(CollisionCheck, &rdev->flags); 172 173 return 0; 174 } 175 176 static int rdevs_init_serial(struct mddev *mddev) 177 { 178 struct md_rdev *rdev; 179 int ret = 0; 180 181 rdev_for_each(rdev, mddev) { 182 ret = rdev_init_serial(rdev); 183 if (ret) 184 break; 185 } 186 187 /* Free all resources if pool is not existed */ 188 if (ret && !mddev->serial_info_pool) 189 rdevs_uninit_serial(mddev); 190 191 return ret; 192 } 193 194 /* 195 * rdev needs to enable serial stuffs if it meets the conditions: 196 * 1. it is multi-queue device flaged with writemostly. 197 * 2. the write-behind mode is enabled. 
198 */ 199 static int rdev_need_serial(struct md_rdev *rdev) 200 { 201 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 202 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 203 test_bit(WriteMostly, &rdev->flags)); 204 } 205 206 /* 207 * Init resource for rdev(s), then create serial_info_pool if: 208 * 1. rdev is the first device which return true from rdev_enable_serial. 209 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 210 */ 211 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, 212 bool is_suspend) 213 { 214 int ret = 0; 215 216 if (rdev && !rdev_need_serial(rdev) && 217 !test_bit(CollisionCheck, &rdev->flags)) 218 return; 219 220 if (!is_suspend) 221 mddev_suspend(mddev); 222 223 if (!rdev) 224 ret = rdevs_init_serial(mddev); 225 else 226 ret = rdev_init_serial(rdev); 227 if (ret) 228 goto abort; 229 230 if (mddev->serial_info_pool == NULL) { 231 /* 232 * already in memalloc noio context by 233 * mddev_suspend() 234 */ 235 mddev->serial_info_pool = 236 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 237 sizeof(struct serial_info)); 238 if (!mddev->serial_info_pool) { 239 rdevs_uninit_serial(mddev); 240 pr_err("can't alloc memory pool for serialization\n"); 241 } 242 } 243 244 abort: 245 if (!is_suspend) 246 mddev_resume(mddev); 247 } 248 249 /* 250 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 251 * 1. rdev is the last device flaged with CollisionCheck. 252 * 2. when bitmap is destroyed while policy is not enabled. 253 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 
254 */ 255 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, 256 bool is_suspend) 257 { 258 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 259 return; 260 261 if (mddev->serial_info_pool) { 262 struct md_rdev *temp; 263 int num = 0; /* used to track if other rdevs need the pool */ 264 265 if (!is_suspend) 266 mddev_suspend(mddev); 267 rdev_for_each(temp, mddev) { 268 if (!rdev) { 269 if (!mddev->serialize_policy || 270 !rdev_need_serial(temp)) 271 rdev_uninit_serial(temp); 272 else 273 num++; 274 } else if (temp != rdev && 275 test_bit(CollisionCheck, &temp->flags)) 276 num++; 277 } 278 279 if (rdev) 280 rdev_uninit_serial(rdev); 281 282 if (num) 283 pr_info("The mempool could be used by other devices\n"); 284 else { 285 mempool_destroy(mddev->serial_info_pool); 286 mddev->serial_info_pool = NULL; 287 } 288 if (!is_suspend) 289 mddev_resume(mddev); 290 } 291 } 292 293 static struct ctl_table_header *raid_table_header; 294 295 static struct ctl_table raid_table[] = { 296 { 297 .procname = "speed_limit_min", 298 .data = &sysctl_speed_limit_min, 299 .maxlen = sizeof(int), 300 .mode = S_IRUGO|S_IWUSR, 301 .proc_handler = proc_dointvec, 302 }, 303 { 304 .procname = "speed_limit_max", 305 .data = &sysctl_speed_limit_max, 306 .maxlen = sizeof(int), 307 .mode = S_IRUGO|S_IWUSR, 308 .proc_handler = proc_dointvec, 309 }, 310 { } 311 }; 312 313 static struct ctl_table raid_dir_table[] = { 314 { 315 .procname = "raid", 316 .maxlen = 0, 317 .mode = S_IRUGO|S_IXUGO, 318 .child = raid_table, 319 }, 320 { } 321 }; 322 323 static struct ctl_table raid_root_table[] = { 324 { 325 .procname = "dev", 326 .maxlen = 0, 327 .mode = 0555, 328 .child = raid_dir_table, 329 }, 330 { } 331 }; 332 333 static int start_readonly; 334 335 /* 336 * The original mechanism for creating an md device is to create 337 * a device node in /dev and to open it. This causes races with device-close. 338 * The preferred method is to write to the "new_array" module parameter. 
* This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/*
 * Record a new event: bump the global event counter and wake everybody
 * sleeping on md_event_waiters.  The @mddev argument is not used here.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop will own
 * a reference to the current mddev and must mddev_put it.
 *
 * How the three for-clauses cooperate:
 *  - init:      take the lock, point _tmp at the first list node.
 *  - condition: with the lock still held, take a reference on the entry
 *               _tmp points at (unless it is the list head), drop the lock,
 *               release the reference on the previous _mddev, then make
 *               _mddev the new entry.
 *  - step:      re-take the lock and advance _tmp; the lock is released
 *               again by the condition clause.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
396 * We hold a refcount over the call to ->make_request. By the time that 397 * call has finished, the bio has been linked into some internal structure 398 * and so is visible to ->quiesce(), so we don't need the refcount any more. 399 */ 400 static bool is_suspended(struct mddev *mddev, struct bio *bio) 401 { 402 if (mddev->suspended) 403 return true; 404 if (bio_data_dir(bio) != WRITE) 405 return false; 406 if (mddev->suspend_lo >= mddev->suspend_hi) 407 return false; 408 if (bio->bi_iter.bi_sector >= mddev->suspend_hi) 409 return false; 410 if (bio_end_sector(bio) < mddev->suspend_lo) 411 return false; 412 return true; 413 } 414 415 void md_handle_request(struct mddev *mddev, struct bio *bio) 416 { 417 check_suspended: 418 rcu_read_lock(); 419 if (is_suspended(mddev, bio)) { 420 DEFINE_WAIT(__wait); 421 for (;;) { 422 prepare_to_wait(&mddev->sb_wait, &__wait, 423 TASK_UNINTERRUPTIBLE); 424 if (!is_suspended(mddev, bio)) 425 break; 426 rcu_read_unlock(); 427 schedule(); 428 rcu_read_lock(); 429 } 430 finish_wait(&mddev->sb_wait, &__wait); 431 } 432 atomic_inc(&mddev->active_io); 433 rcu_read_unlock(); 434 435 if (!mddev->pers->make_request(mddev, bio)) { 436 atomic_dec(&mddev->active_io); 437 wake_up(&mddev->sb_wait); 438 goto check_suspended; 439 } 440 441 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 442 wake_up(&mddev->sb_wait); 443 } 444 EXPORT_SYMBOL(md_handle_request); 445 446 static blk_qc_t md_submit_bio(struct bio *bio) 447 { 448 const int rw = bio_data_dir(bio); 449 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 450 451 if (mddev == NULL || mddev->pers == NULL) { 452 bio_io_error(bio); 453 return BLK_QC_T_NONE; 454 } 455 456 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 457 bio_io_error(bio); 458 return BLK_QC_T_NONE; 459 } 460 461 blk_queue_split(&bio); 462 463 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 464 if (bio_sectors(bio) != 0) 465 bio->bi_status = BLK_STS_IOERR; 466 bio_endio(bio); 467 
return BLK_QC_T_NONE; 468 } 469 470 /* bio could be mergeable after passing to underlayer */ 471 bio->bi_opf &= ~REQ_NOMERGE; 472 473 md_handle_request(mddev, bio); 474 475 return BLK_QC_T_NONE; 476 } 477 478 /* mddev_suspend makes sure no new requests are submitted 479 * to the device, and that any requests that have been submitted 480 * are completely handled. 481 * Once mddev_detach() is called and completes, the module will be 482 * completely unused. 483 */ 484 void mddev_suspend(struct mddev *mddev) 485 { 486 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); 487 lockdep_assert_held(&mddev->reconfig_mutex); 488 if (mddev->suspended++) 489 return; 490 synchronize_rcu(); 491 wake_up(&mddev->sb_wait); 492 set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); 493 smp_mb__after_atomic(); 494 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 495 mddev->pers->quiesce(mddev, 1); 496 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); 497 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); 498 499 del_timer_sync(&mddev->safemode_timer); 500 /* restrict memory reclaim I/O during raid array is suspend */ 501 mddev->noio_flag = memalloc_noio_save(); 502 } 503 EXPORT_SYMBOL_GPL(mddev_suspend); 504 505 void mddev_resume(struct mddev *mddev) 506 { 507 /* entred the memalloc scope from mddev_suspend() */ 508 memalloc_noio_restore(mddev->noio_flag); 509 lockdep_assert_held(&mddev->reconfig_mutex); 510 if (--mddev->suspended) 511 return; 512 wake_up(&mddev->sb_wait); 513 mddev->pers->quiesce(mddev, 0); 514 515 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 516 md_wakeup_thread(mddev->thread); 517 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 518 } 519 EXPORT_SYMBOL_GPL(mddev_resume); 520 521 /* 522 * Generic flush handling for md 523 */ 524 525 static void md_end_flush(struct bio *bio) 526 { 527 struct md_rdev *rdev = bio->bi_private; 528 struct mddev *mddev = rdev->mddev; 529 530 rdev_dec_pending(rdev, mddev); 531 
532 if (atomic_dec_and_test(&mddev->flush_pending)) { 533 /* The pre-request flush has finished */ 534 queue_work(md_wq, &mddev->flush_work); 535 } 536 bio_put(bio); 537 } 538 539 static void md_submit_flush_data(struct work_struct *ws); 540 541 static void submit_flushes(struct work_struct *ws) 542 { 543 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 544 struct md_rdev *rdev; 545 546 mddev->start_flush = ktime_get_boottime(); 547 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 548 atomic_set(&mddev->flush_pending, 1); 549 rcu_read_lock(); 550 rdev_for_each_rcu(rdev, mddev) 551 if (rdev->raid_disk >= 0 && 552 !test_bit(Faulty, &rdev->flags)) { 553 /* Take two references, one is dropped 554 * when request finishes, one after 555 * we reclaim rcu_read_lock 556 */ 557 struct bio *bi; 558 atomic_inc(&rdev->nr_pending); 559 atomic_inc(&rdev->nr_pending); 560 rcu_read_unlock(); 561 bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set); 562 bi->bi_end_io = md_end_flush; 563 bi->bi_private = rdev; 564 bio_set_dev(bi, rdev->bdev); 565 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 566 atomic_inc(&mddev->flush_pending); 567 submit_bio(bi); 568 rcu_read_lock(); 569 rdev_dec_pending(rdev, mddev); 570 } 571 rcu_read_unlock(); 572 if (atomic_dec_and_test(&mddev->flush_pending)) 573 queue_work(md_wq, &mddev->flush_work); 574 } 575 576 static void md_submit_flush_data(struct work_struct *ws) 577 { 578 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 579 struct bio *bio = mddev->flush_bio; 580 581 /* 582 * must reset flush_bio before calling into md_handle_request to avoid a 583 * deadlock, because other bios passed md_handle_request suspend check 584 * could wait for this and below md_handle_request could wait for those 585 * bios because of suspend check 586 */ 587 spin_lock_irq(&mddev->lock); 588 mddev->prev_flush_start = mddev->start_flush; 589 mddev->flush_bio = NULL; 590 spin_unlock_irq(&mddev->lock); 591 wake_up(&mddev->sb_wait); 592 593 if 
(bio->bi_iter.bi_size == 0) { 594 /* an empty barrier - all done */ 595 bio_endio(bio); 596 } else { 597 bio->bi_opf &= ~REQ_PREFLUSH; 598 md_handle_request(mddev, bio); 599 } 600 } 601 602 /* 603 * Manages consolidation of flushes and submitting any flushes needed for 604 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is 605 * being finished in another context. Returns false if the flushing is 606 * complete but still needs the I/O portion of the bio to be processed. 607 */ 608 bool md_flush_request(struct mddev *mddev, struct bio *bio) 609 { 610 ktime_t req_start = ktime_get_boottime(); 611 spin_lock_irq(&mddev->lock); 612 /* flush requests wait until ongoing flush completes, 613 * hence coalescing all the pending requests. 614 */ 615 wait_event_lock_irq(mddev->sb_wait, 616 !mddev->flush_bio || 617 ktime_before(req_start, mddev->prev_flush_start), 618 mddev->lock); 619 /* new request after previous flush is completed */ 620 if (ktime_after(req_start, mddev->prev_flush_start)) { 621 WARN_ON(mddev->flush_bio); 622 mddev->flush_bio = bio; 623 bio = NULL; 624 } 625 spin_unlock_irq(&mddev->lock); 626 627 if (!bio) { 628 INIT_WORK(&mddev->flush_work, submit_flushes); 629 queue_work(md_wq, &mddev->flush_work); 630 } else { 631 /* flush was performed for some other bio while we waited. 
*/ 632 if (bio->bi_iter.bi_size == 0) 633 /* an empty barrier - all done */ 634 bio_endio(bio); 635 else { 636 bio->bi_opf &= ~REQ_PREFLUSH; 637 return false; 638 } 639 } 640 return true; 641 } 642 EXPORT_SYMBOL(md_flush_request); 643 644 static inline struct mddev *mddev_get(struct mddev *mddev) 645 { 646 atomic_inc(&mddev->active); 647 return mddev; 648 } 649 650 static void mddev_delayed_delete(struct work_struct *ws); 651 652 static void mddev_put(struct mddev *mddev) 653 { 654 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 655 return; 656 if (!mddev->raid_disks && list_empty(&mddev->disks) && 657 mddev->ctime == 0 && !mddev->hold_active) { 658 /* Array is not configured at all, and not held active, 659 * so destroy it */ 660 list_del_init(&mddev->all_mddevs); 661 662 /* 663 * Call queue_work inside the spinlock so that 664 * flush_workqueue() after mddev_find will succeed in waiting 665 * for the work to be done. 666 */ 667 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 668 queue_work(md_misc_wq, &mddev->del_work); 669 } 670 spin_unlock(&all_mddevs_lock); 671 } 672 673 static void md_safemode_timeout(struct timer_list *t); 674 675 void mddev_init(struct mddev *mddev) 676 { 677 kobject_init(&mddev->kobj, &md_ktype); 678 mutex_init(&mddev->open_mutex); 679 mutex_init(&mddev->reconfig_mutex); 680 mutex_init(&mddev->bitmap_info.mutex); 681 INIT_LIST_HEAD(&mddev->disks); 682 INIT_LIST_HEAD(&mddev->all_mddevs); 683 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 684 atomic_set(&mddev->active, 1); 685 atomic_set(&mddev->openers, 0); 686 atomic_set(&mddev->active_io, 0); 687 spin_lock_init(&mddev->lock); 688 atomic_set(&mddev->flush_pending, 0); 689 init_waitqueue_head(&mddev->sb_wait); 690 init_waitqueue_head(&mddev->recovery_wait); 691 mddev->reshape_position = MaxSector; 692 mddev->reshape_backwards = 0; 693 mddev->last_sync_action = "none"; 694 mddev->resync_min = 0; 695 mddev->resync_max = MaxSector; 696 mddev->level = LEVEL_NONE; 
697 } 698 EXPORT_SYMBOL_GPL(mddev_init); 699 700 static struct mddev *mddev_find_locked(dev_t unit) 701 { 702 struct mddev *mddev; 703 704 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 705 if (mddev->unit == unit) 706 return mddev; 707 708 return NULL; 709 } 710 711 /* find an unused unit number */ 712 static dev_t mddev_alloc_unit(void) 713 { 714 static int next_minor = 512; 715 int start = next_minor; 716 bool is_free = 0; 717 dev_t dev = 0; 718 719 while (!is_free) { 720 dev = MKDEV(MD_MAJOR, next_minor); 721 next_minor++; 722 if (next_minor > MINORMASK) 723 next_minor = 0; 724 if (next_minor == start) 725 return 0; /* Oh dear, all in use. */ 726 is_free = !mddev_find_locked(dev); 727 } 728 729 return dev; 730 } 731 732 static struct mddev *mddev_find(dev_t unit) 733 { 734 struct mddev *mddev; 735 736 if (MAJOR(unit) != MD_MAJOR) 737 unit &= ~((1 << MdpMinorShift) - 1); 738 739 spin_lock(&all_mddevs_lock); 740 mddev = mddev_find_locked(unit); 741 if (mddev) 742 mddev_get(mddev); 743 spin_unlock(&all_mddevs_lock); 744 745 return mddev; 746 } 747 748 static struct mddev *mddev_alloc(dev_t unit) 749 { 750 struct mddev *new; 751 int error; 752 753 if (unit && MAJOR(unit) != MD_MAJOR) 754 unit &= ~((1 << MdpMinorShift) - 1); 755 756 new = kzalloc(sizeof(*new), GFP_KERNEL); 757 if (!new) 758 return ERR_PTR(-ENOMEM); 759 mddev_init(new); 760 761 spin_lock(&all_mddevs_lock); 762 if (unit) { 763 error = -EEXIST; 764 if (mddev_find_locked(unit)) 765 goto out_free_new; 766 new->unit = unit; 767 if (MAJOR(unit) == MD_MAJOR) 768 new->md_minor = MINOR(unit); 769 else 770 new->md_minor = MINOR(unit) >> MdpMinorShift; 771 new->hold_active = UNTIL_IOCTL; 772 } else { 773 error = -ENODEV; 774 new->unit = mddev_alloc_unit(); 775 if (!new->unit) 776 goto out_free_new; 777 new->md_minor = MINOR(new->unit); 778 new->hold_active = UNTIL_STOP; 779 } 780 781 list_add(&new->all_mddevs, &all_mddevs); 782 spin_unlock(&all_mddevs_lock); 783 return new; 784 out_free_new: 785 
spin_unlock(&all_mddevs_lock); 786 kfree(new); 787 return ERR_PTR(error); 788 } 789 790 static const struct attribute_group md_redundancy_group; 791 792 void mddev_unlock(struct mddev *mddev) 793 { 794 if (mddev->to_remove) { 795 /* These cannot be removed under reconfig_mutex as 796 * an access to the files will try to take reconfig_mutex 797 * while holding the file unremovable, which leads to 798 * a deadlock. 799 * So hold set sysfs_active while the remove in happeing, 800 * and anything else which might set ->to_remove or my 801 * otherwise change the sysfs namespace will fail with 802 * -EBUSY if sysfs_active is still set. 803 * We set sysfs_active under reconfig_mutex and elsewhere 804 * test it under the same mutex to ensure its correct value 805 * is seen. 806 */ 807 const struct attribute_group *to_remove = mddev->to_remove; 808 mddev->to_remove = NULL; 809 mddev->sysfs_active = 1; 810 mutex_unlock(&mddev->reconfig_mutex); 811 812 if (mddev->kobj.sd) { 813 if (to_remove != &md_redundancy_group) 814 sysfs_remove_group(&mddev->kobj, to_remove); 815 if (mddev->pers == NULL || 816 mddev->pers->sync_request == NULL) { 817 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 818 if (mddev->sysfs_action) 819 sysfs_put(mddev->sysfs_action); 820 if (mddev->sysfs_completed) 821 sysfs_put(mddev->sysfs_completed); 822 if (mddev->sysfs_degraded) 823 sysfs_put(mddev->sysfs_degraded); 824 mddev->sysfs_action = NULL; 825 mddev->sysfs_completed = NULL; 826 mddev->sysfs_degraded = NULL; 827 } 828 } 829 mddev->sysfs_active = 0; 830 } else 831 mutex_unlock(&mddev->reconfig_mutex); 832 833 /* As we've dropped the mutex we need a spinlock to 834 * make sure the thread doesn't disappear 835 */ 836 spin_lock(&pers_lock); 837 md_wakeup_thread(mddev->thread); 838 wake_up(&mddev->sb_wait); 839 spin_unlock(&pers_lock); 840 } 841 EXPORT_SYMBOL_GPL(mddev_unlock); 842 843 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 844 { 845 struct md_rdev *rdev; 846 847 
rdev_for_each_rcu(rdev, mddev) 848 if (rdev->desc_nr == nr) 849 return rdev; 850 851 return NULL; 852 } 853 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 854 855 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 856 { 857 struct md_rdev *rdev; 858 859 rdev_for_each(rdev, mddev) 860 if (rdev->bdev->bd_dev == dev) 861 return rdev; 862 863 return NULL; 864 } 865 866 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 867 { 868 struct md_rdev *rdev; 869 870 rdev_for_each_rcu(rdev, mddev) 871 if (rdev->bdev->bd_dev == dev) 872 return rdev; 873 874 return NULL; 875 } 876 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 877 878 static struct md_personality *find_pers(int level, char *clevel) 879 { 880 struct md_personality *pers; 881 list_for_each_entry(pers, &pers_list, list) { 882 if (level != LEVEL_NONE && pers->level == level) 883 return pers; 884 if (strcmp(pers->name, clevel)==0) 885 return pers; 886 } 887 return NULL; 888 } 889 890 /* return the offset of the super block in 512byte sectors */ 891 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 892 { 893 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 894 return MD_NEW_SIZE_SECTORS(num_sectors); 895 } 896 897 static int alloc_disk_sb(struct md_rdev *rdev) 898 { 899 rdev->sb_page = alloc_page(GFP_KERNEL); 900 if (!rdev->sb_page) 901 return -ENOMEM; 902 return 0; 903 } 904 905 void md_rdev_clear(struct md_rdev *rdev) 906 { 907 if (rdev->sb_page) { 908 put_page(rdev->sb_page); 909 rdev->sb_loaded = 0; 910 rdev->sb_page = NULL; 911 rdev->sb_start = 0; 912 rdev->sectors = 0; 913 } 914 if (rdev->bb_page) { 915 put_page(rdev->bb_page); 916 rdev->bb_page = NULL; 917 } 918 badblocks_exit(&rdev->badblocks); 919 } 920 EXPORT_SYMBOL_GPL(md_rdev_clear); 921 922 static void super_written(struct bio *bio) 923 { 924 struct md_rdev *rdev = bio->bi_private; 925 struct mddev *mddev = rdev->mddev; 926 927 if (bio->bi_status) { 928 pr_err("md: %s gets error=%d\n", __func__, 929 
blk_status_to_errno(bio->bi_status)); 930 md_error(mddev, rdev); 931 if (!test_bit(Faulty, &rdev->flags) 932 && (bio->bi_opf & MD_FAILFAST)) { 933 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 934 set_bit(LastDev, &rdev->flags); 935 } 936 } else 937 clear_bit(LastDev, &rdev->flags); 938 939 if (atomic_dec_and_test(&mddev->pending_writes)) 940 wake_up(&mddev->sb_wait); 941 rdev_dec_pending(rdev, mddev); 942 bio_put(bio); 943 } 944 945 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 946 sector_t sector, int size, struct page *page) 947 { 948 /* write first size bytes of page to sector of rdev 949 * Increment mddev->pending_writes before returning 950 * and decrement it on completion, waking up sb_wait 951 * if zero is reached. 952 * If an error occurred, call md_error 953 */ 954 struct bio *bio; 955 int ff = 0; 956 957 if (!page) 958 return; 959 960 if (test_bit(Faulty, &rdev->flags)) 961 return; 962 963 bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); 964 965 atomic_inc(&rdev->nr_pending); 966 967 bio_set_dev(bio, rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev); 968 bio->bi_iter.bi_sector = sector; 969 bio_add_page(bio, page, size, 0); 970 bio->bi_private = rdev; 971 bio->bi_end_io = super_written; 972 973 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 974 test_bit(FailFast, &rdev->flags) && 975 !test_bit(LastDev, &rdev->flags)) 976 ff = MD_FAILFAST; 977 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff; 978 979 atomic_inc(&mddev->pending_writes); 980 submit_bio(bio); 981 } 982 983 int md_super_wait(struct mddev *mddev) 984 { 985 /* wait for all superblock writes that were scheduled to complete */ 986 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 987 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 988 return -EAGAIN; 989 return 0; 990 } 991 992 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 993 struct page *page, int op, int op_flags, bool metadata_op) 994 { 995 struct bio bio; 996 struct bio_vec bvec; 997 998 bio_init(&bio, &bvec, 1); 999 1000 if (metadata_op && rdev->meta_bdev) 1001 bio_set_dev(&bio, rdev->meta_bdev); 1002 else 1003 bio_set_dev(&bio, rdev->bdev); 1004 bio.bi_opf = op | op_flags; 1005 if (metadata_op) 1006 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1007 else if (rdev->mddev->reshape_position != MaxSector && 1008 (rdev->mddev->reshape_backwards == 1009 (sector >= rdev->mddev->reshape_position))) 1010 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1011 else 1012 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1013 bio_add_page(&bio, page, size, 0); 1014 1015 submit_bio_wait(&bio); 1016 1017 return !bio.bi_status; 1018 } 1019 EXPORT_SYMBOL_GPL(sync_page_io); 1020 1021 static int read_disk_sb(struct md_rdev *rdev, int size) 1022 { 1023 char b[BDEVNAME_SIZE]; 1024 1025 if (rdev->sb_loaded) 1026 return 0; 1027 1028 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) 1029 goto fail; 1030 rdev->sb_loaded = 1; 1031 return 0; 1032 1033 fail: 1034 pr_err("md: disabled 
device %s, could not read superblock.\n", 1035 bdevname(rdev->bdev,b)); 1036 return -EINVAL; 1037 } 1038 1039 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1040 { 1041 return sb1->set_uuid0 == sb2->set_uuid0 && 1042 sb1->set_uuid1 == sb2->set_uuid1 && 1043 sb1->set_uuid2 == sb2->set_uuid2 && 1044 sb1->set_uuid3 == sb2->set_uuid3; 1045 } 1046 1047 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1048 { 1049 int ret; 1050 mdp_super_t *tmp1, *tmp2; 1051 1052 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1053 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1054 1055 if (!tmp1 || !tmp2) { 1056 ret = 0; 1057 goto abort; 1058 } 1059 1060 *tmp1 = *sb1; 1061 *tmp2 = *sb2; 1062 1063 /* 1064 * nr_disks is not constant 1065 */ 1066 tmp1->nr_disks = 0; 1067 tmp2->nr_disks = 0; 1068 1069 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1070 abort: 1071 kfree(tmp1); 1072 kfree(tmp2); 1073 return ret; 1074 } 1075 1076 static u32 md_csum_fold(u32 csum) 1077 { 1078 csum = (csum & 0xffff) + (csum >> 16); 1079 return (csum & 0xffff) + (csum >> 16); 1080 } 1081 1082 static unsigned int calc_sb_csum(mdp_super_t *sb) 1083 { 1084 u64 newcsum = 0; 1085 u32 *sb32 = (u32*)sb; 1086 int i; 1087 unsigned int disk_csum, csum; 1088 1089 disk_csum = sb->sb_csum; 1090 sb->sb_csum = 0; 1091 1092 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1093 newcsum += sb32[i]; 1094 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1095 1096 #ifdef CONFIG_ALPHA 1097 /* This used to use csum_partial, which was wrong for several 1098 * reasons including that different results are returned on 1099 * different architectures. It isn't critical that we get exactly 1100 * the same return value as before (we always csum_fold before 1101 * testing, and that removes any differences). However as we 1102 * know that csum_partial always returned a 16bit value on 1103 * alphas, do a fold to maximise conformity to previous behaviour. 
1104 */ 1105 sb->sb_csum = md_csum_fold(disk_csum); 1106 #else 1107 sb->sb_csum = disk_csum; 1108 #endif 1109 return csum; 1110 } 1111 1112 /* 1113 * Handle superblock details. 1114 * We want to be able to handle multiple superblock formats 1115 * so we have a common interface to them all, and an array of 1116 * different handlers. 1117 * We rely on user-space to write the initial superblock, and support 1118 * reading and updating of superblocks. 1119 * Interface methods are: 1120 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1121 * loads and validates a superblock on dev. 1122 * if refdev != NULL, compare superblocks on both devices 1123 * Return: 1124 * 0 - dev has a superblock that is compatible with refdev 1125 * 1 - dev has a superblock that is compatible and newer than refdev 1126 * so dev should be used as the refdev in future 1127 * -EINVAL superblock incompatible or invalid 1128 * -othererror e.g. -EIO 1129 * 1130 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1131 * Verify that dev is acceptable into mddev. 1132 * The first time, mddev->raid_disks will be 0, and data from 1133 * dev should be merged in. Subsequent calls check that dev 1134 * is new enough. Return 0 or -EINVAL 1135 * 1136 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1137 * Update the superblock for rdev with data in mddev 1138 * This does not write to disc. 
1139 * 1140 */ 1141 1142 struct super_type { 1143 char *name; 1144 struct module *owner; 1145 int (*load_super)(struct md_rdev *rdev, 1146 struct md_rdev *refdev, 1147 int minor_version); 1148 int (*validate_super)(struct mddev *mddev, 1149 struct md_rdev *rdev); 1150 void (*sync_super)(struct mddev *mddev, 1151 struct md_rdev *rdev); 1152 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1153 sector_t num_sectors); 1154 int (*allow_new_offset)(struct md_rdev *rdev, 1155 unsigned long long new_offset); 1156 }; 1157 1158 /* 1159 * Check that the given mddev has no bitmap. 1160 * 1161 * This function is called from the run method of all personalities that do not 1162 * support bitmaps. It prints an error message and returns non-zero if mddev 1163 * has a bitmap. Otherwise, it returns 0. 1164 * 1165 */ 1166 int md_check_no_bitmap(struct mddev *mddev) 1167 { 1168 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1169 return 0; 1170 pr_warn("%s: bitmaps are not supported for %s\n", 1171 mdname(mddev), mddev->pers->name); 1172 return 1; 1173 } 1174 EXPORT_SYMBOL(md_check_no_bitmap); 1175 1176 /* 1177 * load_super for 0.90.0 1178 */ 1179 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1180 { 1181 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1182 mdp_super_t *sb; 1183 int ret; 1184 bool spare_disk = true; 1185 1186 /* 1187 * Calculate the position of the superblock (512byte sectors), 1188 * it's at the end of the disk. 1189 * 1190 * It also happens to be a multiple of 4Kb. 
1191 */ 1192 rdev->sb_start = calc_dev_sboffset(rdev); 1193 1194 ret = read_disk_sb(rdev, MD_SB_BYTES); 1195 if (ret) 1196 return ret; 1197 1198 ret = -EINVAL; 1199 1200 bdevname(rdev->bdev, b); 1201 sb = page_address(rdev->sb_page); 1202 1203 if (sb->md_magic != MD_SB_MAGIC) { 1204 pr_warn("md: invalid raid superblock magic on %s\n", b); 1205 goto abort; 1206 } 1207 1208 if (sb->major_version != 0 || 1209 sb->minor_version < 90 || 1210 sb->minor_version > 91) { 1211 pr_warn("Bad version number %d.%d on %s\n", 1212 sb->major_version, sb->minor_version, b); 1213 goto abort; 1214 } 1215 1216 if (sb->raid_disks <= 0) 1217 goto abort; 1218 1219 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1220 pr_warn("md: invalid superblock checksum on %s\n", b); 1221 goto abort; 1222 } 1223 1224 rdev->preferred_minor = sb->md_minor; 1225 rdev->data_offset = 0; 1226 rdev->new_data_offset = 0; 1227 rdev->sb_size = MD_SB_BYTES; 1228 rdev->badblocks.shift = -1; 1229 1230 if (sb->level == LEVEL_MULTIPATH) 1231 rdev->desc_nr = -1; 1232 else 1233 rdev->desc_nr = sb->this_disk.number; 1234 1235 /* not spare disk, or LEVEL_MULTIPATH */ 1236 if (sb->level == LEVEL_MULTIPATH || 1237 (rdev->desc_nr >= 0 && 1238 rdev->desc_nr < MD_SB_DISKS && 1239 sb->disks[rdev->desc_nr].state & 1240 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) 1241 spare_disk = false; 1242 1243 if (!refdev) { 1244 if (!spare_disk) 1245 ret = 1; 1246 else 1247 ret = 0; 1248 } else { 1249 __u64 ev1, ev2; 1250 mdp_super_t *refsb = page_address(refdev->sb_page); 1251 if (!md_uuid_equal(refsb, sb)) { 1252 pr_warn("md: %s has different UUID to %s\n", 1253 b, bdevname(refdev->bdev,b2)); 1254 goto abort; 1255 } 1256 if (!md_sb_equal(refsb, sb)) { 1257 pr_warn("md: %s has same UUID but different superblock to %s\n", 1258 b, bdevname(refdev->bdev, b2)); 1259 goto abort; 1260 } 1261 ev1 = md_event(sb); 1262 ev2 = md_event(refsb); 1263 1264 if (!spare_disk && ev1 > ev2) 1265 ret = 1; 1266 else 1267 ret = 0; 1268 } 
1269 rdev->sectors = rdev->sb_start; 1270 /* Limit to 4TB as metadata cannot record more than that. 1271 * (not needed for Linear and RAID0 as metadata doesn't 1272 * record this size) 1273 */ 1274 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1275 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1276 1277 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1278 /* "this cannot possibly happen" ... */ 1279 ret = -EINVAL; 1280 1281 abort: 1282 return ret; 1283 } 1284 1285 /* 1286 * validate_super for 0.90.0 1287 */ 1288 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1289 { 1290 mdp_disk_t *desc; 1291 mdp_super_t *sb = page_address(rdev->sb_page); 1292 __u64 ev1 = md_event(sb); 1293 1294 rdev->raid_disk = -1; 1295 clear_bit(Faulty, &rdev->flags); 1296 clear_bit(In_sync, &rdev->flags); 1297 clear_bit(Bitmap_sync, &rdev->flags); 1298 clear_bit(WriteMostly, &rdev->flags); 1299 1300 if (mddev->raid_disks == 0) { 1301 mddev->major_version = 0; 1302 mddev->minor_version = sb->minor_version; 1303 mddev->patch_version = sb->patch_version; 1304 mddev->external = 0; 1305 mddev->chunk_sectors = sb->chunk_size >> 9; 1306 mddev->ctime = sb->ctime; 1307 mddev->utime = sb->utime; 1308 mddev->level = sb->level; 1309 mddev->clevel[0] = 0; 1310 mddev->layout = sb->layout; 1311 mddev->raid_disks = sb->raid_disks; 1312 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1313 mddev->events = ev1; 1314 mddev->bitmap_info.offset = 0; 1315 mddev->bitmap_info.space = 0; 1316 /* bitmap can use 60 K after the 4K superblocks */ 1317 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1318 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1319 mddev->reshape_backwards = 0; 1320 1321 if (mddev->minor_version >= 91) { 1322 mddev->reshape_position = sb->reshape_position; 1323 mddev->delta_disks = sb->delta_disks; 1324 mddev->new_level = sb->new_level; 1325 mddev->new_layout = sb->new_layout; 1326 mddev->new_chunk_sectors = sb->new_chunk >> 9; 
1327 if (mddev->delta_disks < 0) 1328 mddev->reshape_backwards = 1; 1329 } else { 1330 mddev->reshape_position = MaxSector; 1331 mddev->delta_disks = 0; 1332 mddev->new_level = mddev->level; 1333 mddev->new_layout = mddev->layout; 1334 mddev->new_chunk_sectors = mddev->chunk_sectors; 1335 } 1336 if (mddev->level == 0) 1337 mddev->layout = -1; 1338 1339 if (sb->state & (1<<MD_SB_CLEAN)) 1340 mddev->recovery_cp = MaxSector; 1341 else { 1342 if (sb->events_hi == sb->cp_events_hi && 1343 sb->events_lo == sb->cp_events_lo) { 1344 mddev->recovery_cp = sb->recovery_cp; 1345 } else 1346 mddev->recovery_cp = 0; 1347 } 1348 1349 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1350 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1351 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1352 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1353 1354 mddev->max_disks = MD_SB_DISKS; 1355 1356 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1357 mddev->bitmap_info.file == NULL) { 1358 mddev->bitmap_info.offset = 1359 mddev->bitmap_info.default_offset; 1360 mddev->bitmap_info.space = 1361 mddev->bitmap_info.default_space; 1362 } 1363 1364 } else if (mddev->pers == NULL) { 1365 /* Insist on good event counter while assembling, except 1366 * for spares (which don't need an event count) */ 1367 ++ev1; 1368 if (sb->disks[rdev->desc_nr].state & ( 1369 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1370 if (ev1 < mddev->events) 1371 return -EINVAL; 1372 } else if (mddev->bitmap) { 1373 /* if adding to array with a bitmap, then we can accept an 1374 * older device ... but not too old. 
1375 */ 1376 if (ev1 < mddev->bitmap->events_cleared) 1377 return 0; 1378 if (ev1 < mddev->events) 1379 set_bit(Bitmap_sync, &rdev->flags); 1380 } else { 1381 if (ev1 < mddev->events) 1382 /* just a hot-add of a new device, leave raid_disk at -1 */ 1383 return 0; 1384 } 1385 1386 if (mddev->level != LEVEL_MULTIPATH) { 1387 desc = sb->disks + rdev->desc_nr; 1388 1389 if (desc->state & (1<<MD_DISK_FAULTY)) 1390 set_bit(Faulty, &rdev->flags); 1391 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1392 desc->raid_disk < mddev->raid_disks */) { 1393 set_bit(In_sync, &rdev->flags); 1394 rdev->raid_disk = desc->raid_disk; 1395 rdev->saved_raid_disk = desc->raid_disk; 1396 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1397 /* active but not in sync implies recovery up to 1398 * reshape position. We don't know exactly where 1399 * that is, so set to zero for now */ 1400 if (mddev->minor_version >= 91) { 1401 rdev->recovery_offset = 0; 1402 rdev->raid_disk = desc->raid_disk; 1403 } 1404 } 1405 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1406 set_bit(WriteMostly, &rdev->flags); 1407 if (desc->state & (1<<MD_DISK_FAILFAST)) 1408 set_bit(FailFast, &rdev->flags); 1409 } else /* MULTIPATH are always insync */ 1410 set_bit(In_sync, &rdev->flags); 1411 return 0; 1412 } 1413 1414 /* 1415 * sync_super for 0.90.0 1416 */ 1417 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1418 { 1419 mdp_super_t *sb; 1420 struct md_rdev *rdev2; 1421 int next_spare = mddev->raid_disks; 1422 1423 /* make rdev->sb match mddev data.. 1424 * 1425 * 1/ zero out disks 1426 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1427 * 3/ any empty disks < next_spare become removed 1428 * 1429 * disks[0] gets initialised to REMOVED because 1430 * we cannot be sure from other fields if it has 1431 * been initialised or not. 
1432 */ 1433 int i; 1434 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1435 1436 rdev->sb_size = MD_SB_BYTES; 1437 1438 sb = page_address(rdev->sb_page); 1439 1440 memset(sb, 0, sizeof(*sb)); 1441 1442 sb->md_magic = MD_SB_MAGIC; 1443 sb->major_version = mddev->major_version; 1444 sb->patch_version = mddev->patch_version; 1445 sb->gvalid_words = 0; /* ignored */ 1446 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1447 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1448 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1449 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1450 1451 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1452 sb->level = mddev->level; 1453 sb->size = mddev->dev_sectors / 2; 1454 sb->raid_disks = mddev->raid_disks; 1455 sb->md_minor = mddev->md_minor; 1456 sb->not_persistent = 0; 1457 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1458 sb->state = 0; 1459 sb->events_hi = (mddev->events>>32); 1460 sb->events_lo = (u32)mddev->events; 1461 1462 if (mddev->reshape_position == MaxSector) 1463 sb->minor_version = 90; 1464 else { 1465 sb->minor_version = 91; 1466 sb->reshape_position = mddev->reshape_position; 1467 sb->new_level = mddev->new_level; 1468 sb->delta_disks = mddev->delta_disks; 1469 sb->new_layout = mddev->new_layout; 1470 sb->new_chunk = mddev->new_chunk_sectors << 9; 1471 } 1472 mddev->minor_version = sb->minor_version; 1473 if (mddev->in_sync) 1474 { 1475 sb->recovery_cp = mddev->recovery_cp; 1476 sb->cp_events_hi = (mddev->events>>32); 1477 sb->cp_events_lo = (u32)mddev->events; 1478 if (mddev->recovery_cp == MaxSector) 1479 sb->state = (1<< MD_SB_CLEAN); 1480 } else 1481 sb->recovery_cp = 0; 1482 1483 sb->layout = mddev->layout; 1484 sb->chunk_size = mddev->chunk_sectors << 9; 1485 1486 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1487 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1488 1489 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1490 rdev_for_each(rdev2, mddev) { 1491 mdp_disk_t *d; 1492 int desc_nr; 1493 int is_active = 
test_bit(In_sync, &rdev2->flags); 1494 1495 if (rdev2->raid_disk >= 0 && 1496 sb->minor_version >= 91) 1497 /* we have nowhere to store the recovery_offset, 1498 * but if it is not below the reshape_position, 1499 * we can piggy-back on that. 1500 */ 1501 is_active = 1; 1502 if (rdev2->raid_disk < 0 || 1503 test_bit(Faulty, &rdev2->flags)) 1504 is_active = 0; 1505 if (is_active) 1506 desc_nr = rdev2->raid_disk; 1507 else 1508 desc_nr = next_spare++; 1509 rdev2->desc_nr = desc_nr; 1510 d = &sb->disks[rdev2->desc_nr]; 1511 nr_disks++; 1512 d->number = rdev2->desc_nr; 1513 d->major = MAJOR(rdev2->bdev->bd_dev); 1514 d->minor = MINOR(rdev2->bdev->bd_dev); 1515 if (is_active) 1516 d->raid_disk = rdev2->raid_disk; 1517 else 1518 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1519 if (test_bit(Faulty, &rdev2->flags)) 1520 d->state = (1<<MD_DISK_FAULTY); 1521 else if (is_active) { 1522 d->state = (1<<MD_DISK_ACTIVE); 1523 if (test_bit(In_sync, &rdev2->flags)) 1524 d->state |= (1<<MD_DISK_SYNC); 1525 active++; 1526 working++; 1527 } else { 1528 d->state = 0; 1529 spare++; 1530 working++; 1531 } 1532 if (test_bit(WriteMostly, &rdev2->flags)) 1533 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1534 if (test_bit(FailFast, &rdev2->flags)) 1535 d->state |= (1<<MD_DISK_FAILFAST); 1536 } 1537 /* now set the "removed" and "faulty" bits on any missing devices */ 1538 for (i=0 ; i < mddev->raid_disks ; i++) { 1539 mdp_disk_t *d = &sb->disks[i]; 1540 if (d->state == 0 && d->number == 0) { 1541 d->number = i; 1542 d->raid_disk = i; 1543 d->state = (1<<MD_DISK_REMOVED); 1544 d->state |= (1<<MD_DISK_FAULTY); 1545 failed++; 1546 } 1547 } 1548 sb->nr_disks = nr_disks; 1549 sb->active_disks = active; 1550 sb->working_disks = working; 1551 sb->failed_disks = failed; 1552 sb->spare_disks = spare; 1553 1554 sb->this_disk = sb->disks[rdev->desc_nr]; 1555 sb->sb_csum = calc_sb_csum(sb); 1556 } 1557 1558 /* 1559 * rdev_size_change for 0.90.0 1560 */ 1561 static unsigned long long 1562 
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1563 { 1564 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1565 return 0; /* component must fit device */ 1566 if (rdev->mddev->bitmap_info.offset) 1567 return 0; /* can't move bitmap */ 1568 rdev->sb_start = calc_dev_sboffset(rdev); 1569 if (!num_sectors || num_sectors > rdev->sb_start) 1570 num_sectors = rdev->sb_start; 1571 /* Limit to 4TB as metadata cannot record more than that. 1572 * 4TB == 2^32 KB, or 2*2^32 sectors. 1573 */ 1574 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1575 num_sectors = (sector_t)(2ULL << 32) - 2; 1576 do { 1577 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1578 rdev->sb_page); 1579 } while (md_super_wait(rdev->mddev) < 0); 1580 return num_sectors; 1581 } 1582 1583 static int 1584 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1585 { 1586 /* non-zero offset changes not possible with v0.90 */ 1587 return new_offset == 0; 1588 } 1589 1590 /* 1591 * version 1 superblock 1592 */ 1593 1594 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1595 { 1596 __le32 disk_csum; 1597 u32 csum; 1598 unsigned long long newcsum; 1599 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1600 __le32 *isuper = (__le32*)sb; 1601 1602 disk_csum = sb->sb_csum; 1603 sb->sb_csum = 0; 1604 newcsum = 0; 1605 for (; size >= 4; size -= 4) 1606 newcsum += le32_to_cpu(*isuper++); 1607 1608 if (size == 2) 1609 newcsum += le16_to_cpu(*(__le16*) isuper); 1610 1611 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1612 sb->sb_csum = disk_csum; 1613 return cpu_to_le32(csum); 1614 } 1615 1616 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1617 { 1618 struct mdp_superblock_1 *sb; 1619 int ret; 1620 sector_t sb_start; 1621 sector_t sectors; 1622 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1623 int bmask; 1624 bool spare_disk = true; 1625 1626 /* 1627 * Calculate the position of the 
superblock in 512byte sectors. 1628 * It is always aligned to a 4K boundary and 1629 * depeding on minor_version, it can be: 1630 * 0: At least 8K, but less than 12K, from end of device 1631 * 1: At start of device 1632 * 2: 4K from start of device. 1633 */ 1634 switch(minor_version) { 1635 case 0: 1636 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; 1637 sb_start -= 8*2; 1638 sb_start &= ~(sector_t)(4*2-1); 1639 break; 1640 case 1: 1641 sb_start = 0; 1642 break; 1643 case 2: 1644 sb_start = 8; 1645 break; 1646 default: 1647 return -EINVAL; 1648 } 1649 rdev->sb_start = sb_start; 1650 1651 /* superblock is rarely larger than 1K, but it can be larger, 1652 * and it is safe to read 4k, so we do that 1653 */ 1654 ret = read_disk_sb(rdev, 4096); 1655 if (ret) return ret; 1656 1657 sb = page_address(rdev->sb_page); 1658 1659 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1660 sb->major_version != cpu_to_le32(1) || 1661 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1662 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1663 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1664 return -EINVAL; 1665 1666 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1667 pr_warn("md: invalid superblock checksum on %s\n", 1668 bdevname(rdev->bdev,b)); 1669 return -EINVAL; 1670 } 1671 if (le64_to_cpu(sb->data_size) < 10) { 1672 pr_warn("md: data_size too small on %s\n", 1673 bdevname(rdev->bdev,b)); 1674 return -EINVAL; 1675 } 1676 if (sb->pad0 || 1677 sb->pad3[0] || 1678 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1679 /* Some padding is non-zero, might be a new feature */ 1680 return -EINVAL; 1681 1682 rdev->preferred_minor = 0xffff; 1683 rdev->data_offset = le64_to_cpu(sb->data_offset); 1684 rdev->new_data_offset = rdev->data_offset; 1685 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1686 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1687 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1688 
atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1689 1690 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1691 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1692 if (rdev->sb_size & bmask) 1693 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1694 1695 if (minor_version 1696 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1697 return -EINVAL; 1698 if (minor_version 1699 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1700 return -EINVAL; 1701 1702 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1703 rdev->desc_nr = -1; 1704 else 1705 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1706 1707 if (!rdev->bb_page) { 1708 rdev->bb_page = alloc_page(GFP_KERNEL); 1709 if (!rdev->bb_page) 1710 return -ENOMEM; 1711 } 1712 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1713 rdev->badblocks.count == 0) { 1714 /* need to load the bad block list. 1715 * Currently we limit it to one page. 1716 */ 1717 s32 offset; 1718 sector_t bb_sector; 1719 __le64 *bbp; 1720 int i; 1721 int sectors = le16_to_cpu(sb->bblog_size); 1722 if (sectors > (PAGE_SIZE / 512)) 1723 return -EINVAL; 1724 offset = le32_to_cpu(sb->bblog_offset); 1725 if (offset == 0) 1726 return -EINVAL; 1727 bb_sector = (long long)offset; 1728 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1729 rdev->bb_page, REQ_OP_READ, 0, true)) 1730 return -EIO; 1731 bbp = (__le64 *)page_address(rdev->bb_page); 1732 rdev->badblocks.shift = sb->bblog_shift; 1733 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1734 u64 bb = le64_to_cpu(*bbp); 1735 int count = bb & (0x3ff); 1736 u64 sector = bb >> 10; 1737 sector <<= sb->bblog_shift; 1738 count <<= sb->bblog_shift; 1739 if (bb + 1 == 0) 1740 break; 1741 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1742 return -EINVAL; 1743 } 1744 } else if (sb->bblog_offset != 0) 1745 rdev->badblocks.shift = 0; 1746 1747 if ((le32_to_cpu(sb->feature_map) & 1748 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1749 
/*
 * validate_super for version-1 superblocks.
 *
 * Decide whether @rdev, whose superblock is already in rdev->sb_page, is
 * acceptable in @mddev and derive its role and flags from that superblock.
 * rdev->raid_disk is reset to -1 and the Faulty, In_sync, Bitmap_sync and
 * WriteMostly flags are cleared before anything else is examined.
 *
 *  - mddev->raid_disks == 0: first device; array-wide fields of @mddev are
 *    filled in from the superblock.
 *  - mddev->pers == NULL: assembling; a device holding an active or journal
 *    role must have an event count of at least mddev->events - 1.
 *  - mddev->bitmap: adding to an array with a bitmap; an older device is
 *    accepted, as a plain spare if older than bitmap->events_cleared,
 *    otherwise flagged Bitmap_sync.
 *  - otherwise: an older device is left as a spare.
 *
 * Returns 0 if the device is accepted (possibly only as a spare with
 * raid_disk left at -1), -EINVAL if it is rejected.
 */
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* First device: take the array description from its superblock */
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		/* same limit that super_1_load() enforces on sb->max_dev */
		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			/* direction is implied by the sign of delta_disks and
			 * only needs the explicit flag when that is zero
			 */
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		/* RAID0 without the RAID0_LAYOUT feature has no recorded layout */
		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			/* PPL is rejected together with a bitmap or a journal */
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			/* PPL and MULTIPLE_PPLS are mutually exclusive */
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
					    MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		/* a desc_nr outside dev_roles[] is treated as a spare */
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			/* any other role value is the device's raid_disk slot */
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				/* partially recovered: not In_sync; resume
				 * from the recorded offset
				 */
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else {
				/*
				 * If the array is FROZEN, then the device can't
				 * be in_sync with rest of array.
				 */
				if (!test_bit(MD_RECOVERY_FROZEN,
					      &mddev->recovery))
					set_bit(In_sync, &rdev->flags);
			}
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}
mddev->bitmap_info.file == NULL) { 2019 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2020 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2021 } 2022 2023 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2024 !test_bit(In_sync, &rdev->flags)) { 2025 sb->feature_map |= 2026 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2027 sb->recovery_offset = 2028 cpu_to_le64(rdev->recovery_offset); 2029 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2030 sb->feature_map |= 2031 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2032 } 2033 /* Note: recovery_offset and journal_tail share space */ 2034 if (test_bit(Journal, &rdev->flags)) 2035 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2036 if (test_bit(Replacement, &rdev->flags)) 2037 sb->feature_map |= 2038 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2039 2040 if (mddev->reshape_position != MaxSector) { 2041 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2042 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2043 sb->new_layout = cpu_to_le32(mddev->new_layout); 2044 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2045 sb->new_level = cpu_to_le32(mddev->new_level); 2046 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2047 if (mddev->delta_disks == 0 && 2048 mddev->reshape_backwards) 2049 sb->feature_map 2050 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2051 if (rdev->new_data_offset != rdev->data_offset) { 2052 sb->feature_map 2053 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2054 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2055 - rdev->data_offset)); 2056 } 2057 } 2058 2059 if (mddev_is_clustered(mddev)) 2060 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2061 2062 if (rdev->badblocks.count == 0) 2063 /* Nothing to do for bad blocks*/ ; 2064 else if (sb->bblog_offset == 0) 2065 /* Cannot record bad blocks on this device */ 2066 md_error(mddev, rdev); 2067 else { 2068 struct badblocks *bb = &rdev->badblocks; 2069 __le64 *bbp = (__le64 
*)page_address(rdev->bb_page); 2070 u64 *p = bb->page; 2071 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2072 if (bb->changed) { 2073 unsigned seq; 2074 2075 retry: 2076 seq = read_seqbegin(&bb->lock); 2077 2078 memset(bbp, 0xff, PAGE_SIZE); 2079 2080 for (i = 0 ; i < bb->count ; i++) { 2081 u64 internal_bb = p[i]; 2082 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2083 | BB_LEN(internal_bb)); 2084 bbp[i] = cpu_to_le64(store_bb); 2085 } 2086 bb->changed = 0; 2087 if (read_seqretry(&bb->lock, seq)) 2088 goto retry; 2089 2090 bb->sector = (rdev->sb_start + 2091 (int)le32_to_cpu(sb->bblog_offset)); 2092 bb->size = le16_to_cpu(sb->bblog_size); 2093 } 2094 } 2095 2096 max_dev = 0; 2097 rdev_for_each(rdev2, mddev) 2098 if (rdev2->desc_nr+1 > max_dev) 2099 max_dev = rdev2->desc_nr+1; 2100 2101 if (max_dev > le32_to_cpu(sb->max_dev)) { 2102 int bmask; 2103 sb->max_dev = cpu_to_le32(max_dev); 2104 rdev->sb_size = max_dev * 2 + 256; 2105 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2106 if (rdev->sb_size & bmask) 2107 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2108 } else 2109 max_dev = le32_to_cpu(sb->max_dev); 2110 2111 for (i=0; i<max_dev;i++) 2112 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2113 2114 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2115 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2116 2117 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2118 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2119 sb->feature_map |= 2120 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2121 else 2122 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2123 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2124 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2125 } 2126 2127 rdev_for_each(rdev2, mddev) { 2128 i = rdev2->desc_nr; 2129 if (test_bit(Faulty, &rdev2->flags)) 2130 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2131 else if (test_bit(In_sync, &rdev2->flags)) 2132 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2133 else if 
(test_bit(Journal, &rdev2->flags)) 2134 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2135 else if (rdev2->raid_disk >= 0) 2136 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2137 else 2138 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2139 } 2140 2141 sb->sb_csum = calc_sb_1_csum(sb); 2142 } 2143 2144 static sector_t super_1_choose_bm_space(sector_t dev_size) 2145 { 2146 sector_t bm_space; 2147 2148 /* if the device is bigger than 8Gig, save 64k for bitmap 2149 * usage, if bigger than 200Gig, save 128k 2150 */ 2151 if (dev_size < 64*2) 2152 bm_space = 0; 2153 else if (dev_size - 64*2 >= 200*1024*1024*2) 2154 bm_space = 128*2; 2155 else if (dev_size - 4*2 > 8*1024*1024*2) 2156 bm_space = 64*2; 2157 else 2158 bm_space = 4*2; 2159 return bm_space; 2160 } 2161 2162 static unsigned long long 2163 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2164 { 2165 struct mdp_superblock_1 *sb; 2166 sector_t max_sectors; 2167 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2168 return 0; /* component must fit device */ 2169 if (rdev->data_offset != rdev->new_data_offset) 2170 return 0; /* too confusing */ 2171 if (rdev->sb_start < rdev->data_offset) { 2172 /* minor versions 1 and 2; superblock before data */ 2173 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 2174 max_sectors -= rdev->data_offset; 2175 if (!num_sectors || num_sectors > max_sectors) 2176 num_sectors = max_sectors; 2177 } else if (rdev->mddev->bitmap_info.offset) { 2178 /* minor version 0 with bitmap we can't move */ 2179 return 0; 2180 } else { 2181 /* minor version 0; superblock after data */ 2182 sector_t sb_start, bm_space; 2183 sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; 2184 2185 /* 8K is for superblock */ 2186 sb_start = dev_size - 8*2; 2187 sb_start &= ~(sector_t)(4*2 - 1); 2188 2189 bm_space = super_1_choose_bm_space(dev_size); 2190 2191 /* Space that can be used to store date needs to decrease 2192 * superblock bitmap space and bad 
block space(4K) 2193 */ 2194 max_sectors = sb_start - bm_space - 4*2; 2195 2196 if (!num_sectors || num_sectors > max_sectors) 2197 num_sectors = max_sectors; 2198 } 2199 sb = page_address(rdev->sb_page); 2200 sb->data_size = cpu_to_le64(num_sectors); 2201 sb->super_offset = cpu_to_le64(rdev->sb_start); 2202 sb->sb_csum = calc_sb_1_csum(sb); 2203 do { 2204 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2205 rdev->sb_page); 2206 } while (md_super_wait(rdev->mddev) < 0); 2207 return num_sectors; 2208 2209 } 2210 2211 static int 2212 super_1_allow_new_offset(struct md_rdev *rdev, 2213 unsigned long long new_offset) 2214 { 2215 /* All necessary checks on new >= old have been done */ 2216 struct bitmap *bitmap; 2217 if (new_offset >= rdev->data_offset) 2218 return 1; 2219 2220 /* with 1.0 metadata, there is no metadata to tread on 2221 * so we can always move back */ 2222 if (rdev->mddev->minor_version == 0) 2223 return 1; 2224 2225 /* otherwise we must be sure not to step on 2226 * any metadata, so stay: 2227 * 36K beyond start of superblock 2228 * beyond end of badblocks 2229 * beyond write-intent bitmap 2230 */ 2231 if (rdev->sb_start + (32+4)*2 > new_offset) 2232 return 0; 2233 bitmap = rdev->mddev->bitmap; 2234 if (bitmap && !rdev->mddev->bitmap_info.file && 2235 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2236 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2237 return 0; 2238 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2239 return 0; 2240 2241 return 1; 2242 } 2243 2244 static struct super_type super_types[] = { 2245 [0] = { 2246 .name = "0.90.0", 2247 .owner = THIS_MODULE, 2248 .load_super = super_90_load, 2249 .validate_super = super_90_validate, 2250 .sync_super = super_90_sync, 2251 .rdev_size_change = super_90_rdev_size_change, 2252 .allow_new_offset = super_90_allow_new_offset, 2253 }, 2254 [1] = { 2255 .name = "md-1", 2256 .owner = THIS_MODULE, 2257 .load_super = super_1_load, 2258 .validate_super = 
super_1_validate, 2259 .sync_super = super_1_sync, 2260 .rdev_size_change = super_1_rdev_size_change, 2261 .allow_new_offset = super_1_allow_new_offset, 2262 }, 2263 }; 2264 2265 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2266 { 2267 if (mddev->sync_super) { 2268 mddev->sync_super(mddev, rdev); 2269 return; 2270 } 2271 2272 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2273 2274 super_types[mddev->major_version].sync_super(mddev, rdev); 2275 } 2276 2277 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2278 { 2279 struct md_rdev *rdev, *rdev2; 2280 2281 rcu_read_lock(); 2282 rdev_for_each_rcu(rdev, mddev1) { 2283 if (test_bit(Faulty, &rdev->flags) || 2284 test_bit(Journal, &rdev->flags) || 2285 rdev->raid_disk == -1) 2286 continue; 2287 rdev_for_each_rcu(rdev2, mddev2) { 2288 if (test_bit(Faulty, &rdev2->flags) || 2289 test_bit(Journal, &rdev2->flags) || 2290 rdev2->raid_disk == -1) 2291 continue; 2292 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2293 rcu_read_unlock(); 2294 return 1; 2295 } 2296 } 2297 } 2298 rcu_read_unlock(); 2299 return 0; 2300 } 2301 2302 static LIST_HEAD(pending_raid_disks); 2303 2304 /* 2305 * Try to register data integrity profile for an mddev 2306 * 2307 * This is called when an array is started and after a disk has been kicked 2308 * from the array. It only succeeds if all working and active component devices 2309 * are integrity capable with matching profiles. 
2310 */ 2311 int md_integrity_register(struct mddev *mddev) 2312 { 2313 struct md_rdev *rdev, *reference = NULL; 2314 2315 if (list_empty(&mddev->disks)) 2316 return 0; /* nothing to do */ 2317 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2318 return 0; /* shouldn't register, or already is */ 2319 rdev_for_each(rdev, mddev) { 2320 /* skip spares and non-functional disks */ 2321 if (test_bit(Faulty, &rdev->flags)) 2322 continue; 2323 if (rdev->raid_disk < 0) 2324 continue; 2325 if (!reference) { 2326 /* Use the first rdev as the reference */ 2327 reference = rdev; 2328 continue; 2329 } 2330 /* does this rdev's profile match the reference profile? */ 2331 if (blk_integrity_compare(reference->bdev->bd_disk, 2332 rdev->bdev->bd_disk) < 0) 2333 return -EINVAL; 2334 } 2335 if (!reference || !bdev_get_integrity(reference->bdev)) 2336 return 0; 2337 /* 2338 * All component devices are integrity capable and have matching 2339 * profiles, register the common profile for the md device. 2340 */ 2341 blk_integrity_register(mddev->gendisk, 2342 bdev_get_integrity(reference->bdev)); 2343 2344 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2345 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2346 (mddev->level != 1 && mddev->level != 10 && 2347 bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { 2348 /* 2349 * No need to handle the failure of bioset_integrity_create, 2350 * because the function is called by md_run() -> pers->run(), 2351 * md_run calls bioset_exit -> bioset_integrity_free in case 2352 * of failure case. 
2353 */ 2354 pr_err("md: failed to create integrity pool for %s\n", 2355 mdname(mddev)); 2356 return -EINVAL; 2357 } 2358 return 0; 2359 } 2360 EXPORT_SYMBOL(md_integrity_register); 2361 2362 /* 2363 * Attempt to add an rdev, but only if it is consistent with the current 2364 * integrity profile 2365 */ 2366 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2367 { 2368 struct blk_integrity *bi_mddev; 2369 char name[BDEVNAME_SIZE]; 2370 2371 if (!mddev->gendisk) 2372 return 0; 2373 2374 bi_mddev = blk_get_integrity(mddev->gendisk); 2375 2376 if (!bi_mddev) /* nothing to do */ 2377 return 0; 2378 2379 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2380 pr_err("%s: incompatible integrity profile for %s\n", 2381 mdname(mddev), bdevname(rdev->bdev, name)); 2382 return -ENXIO; 2383 } 2384 2385 return 0; 2386 } 2387 EXPORT_SYMBOL(md_integrity_add_rdev); 2388 2389 static bool rdev_read_only(struct md_rdev *rdev) 2390 { 2391 return bdev_read_only(rdev->bdev) || 2392 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2393 } 2394 2395 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2396 { 2397 char b[BDEVNAME_SIZE]; 2398 int err; 2399 2400 /* prevent duplicates */ 2401 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2402 return -EEXIST; 2403 2404 if (rdev_read_only(rdev) && mddev->pers) 2405 return -EROFS; 2406 2407 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2408 if (!test_bit(Journal, &rdev->flags) && 2409 rdev->sectors && 2410 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2411 if (mddev->pers) { 2412 /* Cannot change size, so fail 2413 * If mddev->level <= 0, then we don't care 2414 * about aligning sizes (e.g. linear) 2415 */ 2416 if (mddev->level > 0) 2417 return -ENOSPC; 2418 } else 2419 mddev->dev_sectors = rdev->sectors; 2420 } 2421 2422 /* Verify rdev->desc_nr is unique. 
2423 * If it is -1, assign a free number, else 2424 * check number is not in use 2425 */ 2426 rcu_read_lock(); 2427 if (rdev->desc_nr < 0) { 2428 int choice = 0; 2429 if (mddev->pers) 2430 choice = mddev->raid_disks; 2431 while (md_find_rdev_nr_rcu(mddev, choice)) 2432 choice++; 2433 rdev->desc_nr = choice; 2434 } else { 2435 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2436 rcu_read_unlock(); 2437 return -EBUSY; 2438 } 2439 } 2440 rcu_read_unlock(); 2441 if (!test_bit(Journal, &rdev->flags) && 2442 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2443 pr_warn("md: %s: array is limited to %d devices\n", 2444 mdname(mddev), mddev->max_disks); 2445 return -EBUSY; 2446 } 2447 bdevname(rdev->bdev,b); 2448 strreplace(b, '/', '!'); 2449 2450 rdev->mddev = mddev; 2451 pr_debug("md: bind<%s>\n", b); 2452 2453 if (mddev->raid_disks) 2454 mddev_create_serial_pool(mddev, rdev, false); 2455 2456 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2457 goto fail; 2458 2459 /* failure here is OK */ 2460 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2461 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2462 rdev->sysfs_unack_badblocks = 2463 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2464 rdev->sysfs_badblocks = 2465 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2466 2467 list_add_rcu(&rdev->same_set, &mddev->disks); 2468 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2469 2470 /* May as well allow recovery to be retried once */ 2471 mddev->recovery_disabled++; 2472 2473 return 0; 2474 2475 fail: 2476 pr_warn("md: failed to register dev-%s for %s\n", 2477 b, mdname(mddev)); 2478 return err; 2479 } 2480 2481 static void rdev_delayed_delete(struct work_struct *ws) 2482 { 2483 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2484 kobject_del(&rdev->kobj); 2485 kobject_put(&rdev->kobj); 2486 } 2487 2488 static void unbind_rdev_from_array(struct md_rdev *rdev) 2489 { 
2490 char b[BDEVNAME_SIZE]; 2491 2492 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2493 list_del_rcu(&rdev->same_set); 2494 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2495 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2496 rdev->mddev = NULL; 2497 sysfs_remove_link(&rdev->kobj, "block"); 2498 sysfs_put(rdev->sysfs_state); 2499 sysfs_put(rdev->sysfs_unack_badblocks); 2500 sysfs_put(rdev->sysfs_badblocks); 2501 rdev->sysfs_state = NULL; 2502 rdev->sysfs_unack_badblocks = NULL; 2503 rdev->sysfs_badblocks = NULL; 2504 rdev->badblocks.count = 0; 2505 /* We need to delay this, otherwise we can deadlock when 2506 * writing to 'remove' to "dev/state". We also need 2507 * to delay it due to rcu usage. 2508 */ 2509 synchronize_rcu(); 2510 INIT_WORK(&rdev->del_work, rdev_delayed_delete); 2511 kobject_get(&rdev->kobj); 2512 queue_work(md_rdev_misc_wq, &rdev->del_work); 2513 } 2514 2515 /* 2516 * prevent the device from being mounted, repartitioned or 2517 * otherwise reused by a RAID array (or any other kernel 2518 * subsystem), by bd_claiming the device. 2519 */ 2520 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2521 { 2522 int err = 0; 2523 struct block_device *bdev; 2524 2525 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2526 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2527 if (IS_ERR(bdev)) { 2528 pr_warn("md: could not open device unknown-block(%u,%u).\n", 2529 MAJOR(dev), MINOR(dev)); 2530 return PTR_ERR(bdev); 2531 } 2532 rdev->bdev = bdev; 2533 return err; 2534 } 2535 2536 static void unlock_rdev(struct md_rdev *rdev) 2537 { 2538 struct block_device *bdev = rdev->bdev; 2539 rdev->bdev = NULL; 2540 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2541 } 2542 2543 void md_autodetect_dev(dev_t dev); 2544 2545 static void export_rdev(struct md_rdev *rdev) 2546 { 2547 char b[BDEVNAME_SIZE]; 2548 2549 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); 2550 md_rdev_clear(rdev); 2551 #ifndef MODULE 2552 if (test_bit(AutoDetected, &rdev->flags)) 2553 md_autodetect_dev(rdev->bdev->bd_dev); 2554 #endif 2555 unlock_rdev(rdev); 2556 kobject_put(&rdev->kobj); 2557 } 2558 2559 void md_kick_rdev_from_array(struct md_rdev *rdev) 2560 { 2561 unbind_rdev_from_array(rdev); 2562 export_rdev(rdev); 2563 } 2564 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2565 2566 static void export_array(struct mddev *mddev) 2567 { 2568 struct md_rdev *rdev; 2569 2570 while (!list_empty(&mddev->disks)) { 2571 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2572 same_set); 2573 md_kick_rdev_from_array(rdev); 2574 } 2575 mddev->raid_disks = 0; 2576 mddev->major_version = 0; 2577 } 2578 2579 static bool set_in_sync(struct mddev *mddev) 2580 { 2581 lockdep_assert_held(&mddev->lock); 2582 if (!mddev->in_sync) { 2583 mddev->sync_checkers++; 2584 spin_unlock(&mddev->lock); 2585 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2586 spin_lock(&mddev->lock); 2587 if (!mddev->in_sync && 2588 percpu_ref_is_zero(&mddev->writes_pending)) { 2589 mddev->in_sync = 1; 2590 /* 2591 * Ensure ->in_sync is visible before we clear 2592 * ->sync_checkers. 
2593 */ 2594 smp_mb(); 2595 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2596 sysfs_notify_dirent_safe(mddev->sysfs_state); 2597 } 2598 if (--mddev->sync_checkers == 0) 2599 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2600 } 2601 if (mddev->safemode == 1) 2602 mddev->safemode = 0; 2603 return mddev->in_sync; 2604 } 2605 2606 static void sync_sbs(struct mddev *mddev, int nospares) 2607 { 2608 /* Update each superblock (in-memory image), but 2609 * if we are allowed to, skip spares which already 2610 * have the right event counter, or have one earlier 2611 * (which would mean they aren't being marked as dirty 2612 * with the rest of the array) 2613 */ 2614 struct md_rdev *rdev; 2615 rdev_for_each(rdev, mddev) { 2616 if (rdev->sb_events == mddev->events || 2617 (nospares && 2618 rdev->raid_disk < 0 && 2619 rdev->sb_events+1 == mddev->events)) { 2620 /* Don't update this superblock */ 2621 rdev->sb_loaded = 2; 2622 } else { 2623 sync_super(mddev, rdev); 2624 rdev->sb_loaded = 1; 2625 } 2626 } 2627 } 2628 2629 static bool does_sb_need_changing(struct mddev *mddev) 2630 { 2631 struct md_rdev *rdev; 2632 struct mdp_superblock_1 *sb; 2633 int role; 2634 2635 /* Find a good rdev */ 2636 rdev_for_each(rdev, mddev) 2637 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2638 break; 2639 2640 /* No good device found. */ 2641 if (!rdev) 2642 return false; 2643 2644 sb = page_address(rdev->sb_page); 2645 /* Check if a device has become faulty or a spare become active */ 2646 rdev_for_each(rdev, mddev) { 2647 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2648 /* Device activated? */ 2649 if (role == 0xffff && rdev->raid_disk >=0 && 2650 !test_bit(Faulty, &rdev->flags)) 2651 return true; 2652 /* Device turned faulty? 
*/ 2653 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) 2654 return true; 2655 } 2656 2657 /* Check if any mddev parameters have changed */ 2658 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2659 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2660 (mddev->layout != le32_to_cpu(sb->layout)) || 2661 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2662 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2663 return true; 2664 2665 return false; 2666 } 2667 2668 void md_update_sb(struct mddev *mddev, int force_change) 2669 { 2670 struct md_rdev *rdev; 2671 int sync_req; 2672 int nospares = 0; 2673 int any_badblocks_changed = 0; 2674 int ret = -1; 2675 2676 if (mddev->ro) { 2677 if (force_change) 2678 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2679 return; 2680 } 2681 2682 repeat: 2683 if (mddev_is_clustered(mddev)) { 2684 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2685 force_change = 1; 2686 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2687 nospares = 1; 2688 ret = md_cluster_ops->metadata_update_start(mddev); 2689 /* Has someone else has updated the sb */ 2690 if (!does_sb_need_changing(mddev)) { 2691 if (ret == 0) 2692 md_cluster_ops->metadata_update_cancel(mddev); 2693 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2694 BIT(MD_SB_CHANGE_DEVS) | 2695 BIT(MD_SB_CHANGE_CLEAN)); 2696 return; 2697 } 2698 } 2699 2700 /* 2701 * First make sure individual recovery_offsets are correct 2702 * curr_resync_completed can only be used during recovery. 2703 * During reshape/resync it might use array-addresses rather 2704 * that device addresses. 
2705 */ 2706 rdev_for_each(rdev, mddev) { 2707 if (rdev->raid_disk >= 0 && 2708 mddev->delta_disks >= 0 && 2709 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2710 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2711 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2712 !test_bit(Journal, &rdev->flags) && 2713 !test_bit(In_sync, &rdev->flags) && 2714 mddev->curr_resync_completed > rdev->recovery_offset) 2715 rdev->recovery_offset = mddev->curr_resync_completed; 2716 2717 } 2718 if (!mddev->persistent) { 2719 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2720 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2721 if (!mddev->external) { 2722 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2723 rdev_for_each(rdev, mddev) { 2724 if (rdev->badblocks.changed) { 2725 rdev->badblocks.changed = 0; 2726 ack_all_badblocks(&rdev->badblocks); 2727 md_error(mddev, rdev); 2728 } 2729 clear_bit(Blocked, &rdev->flags); 2730 clear_bit(BlockedBadBlocks, &rdev->flags); 2731 wake_up(&rdev->blocked_wait); 2732 } 2733 } 2734 wake_up(&mddev->sb_wait); 2735 return; 2736 } 2737 2738 spin_lock(&mddev->lock); 2739 2740 mddev->utime = ktime_get_real_seconds(); 2741 2742 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2743 force_change = 1; 2744 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2745 /* just a clean<-> dirty transition, possibly leave spares alone, 2746 * though if events isn't the right even/odd, we will have to do 2747 * spares after all 2748 */ 2749 nospares = 1; 2750 if (force_change) 2751 nospares = 0; 2752 if (mddev->degraded) 2753 /* If the array is degraded, then skipping spares is both 2754 * dangerous and fairly pointless. 2755 * Dangerous because a device that was removed from the array 2756 * might have a event_count that still looks up-to-date, 2757 * so it can be re-added without a resync. 
2758 * Pointless because if there are any spares to skip, 2759 * then a recovery will happen and soon that array won't 2760 * be degraded any more and the spare can go back to sleep then. 2761 */ 2762 nospares = 0; 2763 2764 sync_req = mddev->in_sync; 2765 2766 /* If this is just a dirty<->clean transition, and the array is clean 2767 * and 'events' is odd, we can roll back to the previous clean state */ 2768 if (nospares 2769 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2770 && mddev->can_decrease_events 2771 && mddev->events != 1) { 2772 mddev->events--; 2773 mddev->can_decrease_events = 0; 2774 } else { 2775 /* otherwise we have to go forward and ... */ 2776 mddev->events ++; 2777 mddev->can_decrease_events = nospares; 2778 } 2779 2780 /* 2781 * This 64-bit counter should never wrap. 2782 * Either we are in around ~1 trillion A.C., assuming 2783 * 1 reboot per second, or we have a bug... 2784 */ 2785 WARN_ON(mddev->events == 0); 2786 2787 rdev_for_each(rdev, mddev) { 2788 if (rdev->badblocks.changed) 2789 any_badblocks_changed++; 2790 if (test_bit(Faulty, &rdev->flags)) 2791 set_bit(FaultRecorded, &rdev->flags); 2792 } 2793 2794 sync_sbs(mddev, nospares); 2795 spin_unlock(&mddev->lock); 2796 2797 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2798 mdname(mddev), mddev->in_sync); 2799 2800 if (mddev->queue) 2801 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2802 rewrite: 2803 md_bitmap_update_sb(mddev->bitmap); 2804 rdev_for_each(rdev, mddev) { 2805 char b[BDEVNAME_SIZE]; 2806 2807 if (rdev->sb_loaded != 1) 2808 continue; /* no noise on spare devices */ 2809 2810 if (!test_bit(Faulty, &rdev->flags)) { 2811 md_super_write(mddev,rdev, 2812 rdev->sb_start, rdev->sb_size, 2813 rdev->sb_page); 2814 pr_debug("md: (write) %s's sb offset: %llu\n", 2815 bdevname(rdev->bdev, b), 2816 (unsigned long long)rdev->sb_start); 2817 rdev->sb_events = mddev->events; 2818 if (rdev->badblocks.size) { 2819 md_super_write(mddev, rdev, 2820 
rdev->badblocks.sector, 2821 rdev->badblocks.size << 9, 2822 rdev->bb_page); 2823 rdev->badblocks.size = 0; 2824 } 2825 2826 } else 2827 pr_debug("md: %s (skipping faulty)\n", 2828 bdevname(rdev->bdev, b)); 2829 2830 if (mddev->level == LEVEL_MULTIPATH) 2831 /* only need to write one superblock... */ 2832 break; 2833 } 2834 if (md_super_wait(mddev) < 0) 2835 goto rewrite; 2836 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2837 2838 if (mddev_is_clustered(mddev) && ret == 0) 2839 md_cluster_ops->metadata_update_finish(mddev); 2840 2841 if (mddev->in_sync != sync_req || 2842 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2843 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2844 /* have to write it out again */ 2845 goto repeat; 2846 wake_up(&mddev->sb_wait); 2847 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2848 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2849 2850 rdev_for_each(rdev, mddev) { 2851 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2852 clear_bit(Blocked, &rdev->flags); 2853 2854 if (any_badblocks_changed) 2855 ack_all_badblocks(&rdev->badblocks); 2856 clear_bit(BlockedBadBlocks, &rdev->flags); 2857 wake_up(&rdev->blocked_wait); 2858 } 2859 } 2860 EXPORT_SYMBOL(md_update_sb); 2861 2862 static int add_bound_rdev(struct md_rdev *rdev) 2863 { 2864 struct mddev *mddev = rdev->mddev; 2865 int err = 0; 2866 bool add_journal = test_bit(Journal, &rdev->flags); 2867 2868 if (!mddev->pers->hot_remove_disk || add_journal) { 2869 /* If there is hot_add_disk but no hot_remove_disk 2870 * then added disks for geometry changes, 2871 * and should be added immediately. 2872 */ 2873 super_types[mddev->major_version]. 
2874 validate_super(mddev, rdev); 2875 if (add_journal) 2876 mddev_suspend(mddev); 2877 err = mddev->pers->hot_add_disk(mddev, rdev); 2878 if (add_journal) 2879 mddev_resume(mddev); 2880 if (err) { 2881 md_kick_rdev_from_array(rdev); 2882 return err; 2883 } 2884 } 2885 sysfs_notify_dirent_safe(rdev->sysfs_state); 2886 2887 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2888 if (mddev->degraded) 2889 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2890 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2891 md_new_event(mddev); 2892 md_wakeup_thread(mddev->thread); 2893 return 0; 2894 } 2895 2896 /* words written to sysfs files may, or may not, be \n terminated. 2897 * We want to accept with case. For this we use cmd_match. 2898 */ 2899 static int cmd_match(const char *cmd, const char *str) 2900 { 2901 /* See if cmd, written into a sysfs file, matches 2902 * str. They must either be the same, or cmd can 2903 * have a trailing newline 2904 */ 2905 while (*cmd && *str && *cmd == *str) { 2906 cmd++; 2907 str++; 2908 } 2909 if (*cmd == '\n') 2910 cmd++; 2911 if (*str || *cmd) 2912 return 0; 2913 return 1; 2914 } 2915 2916 struct rdev_sysfs_entry { 2917 struct attribute attr; 2918 ssize_t (*show)(struct md_rdev *, char *); 2919 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2920 }; 2921 2922 static ssize_t 2923 state_show(struct md_rdev *rdev, char *page) 2924 { 2925 char *sep = ","; 2926 size_t len = 0; 2927 unsigned long flags = READ_ONCE(rdev->flags); 2928 2929 if (test_bit(Faulty, &flags) || 2930 (!test_bit(ExternalBbl, &flags) && 2931 rdev->badblocks.unacked_exist)) 2932 len += sprintf(page+len, "faulty%s", sep); 2933 if (test_bit(In_sync, &flags)) 2934 len += sprintf(page+len, "in_sync%s", sep); 2935 if (test_bit(Journal, &flags)) 2936 len += sprintf(page+len, "journal%s", sep); 2937 if (test_bit(WriteMostly, &flags)) 2938 len += sprintf(page+len, "write_mostly%s", sep); 2939 if (test_bit(Blocked, &flags) || 2940 (rdev->badblocks.unacked_exist 2941 && 
!test_bit(Faulty, &flags))) 2942 len += sprintf(page+len, "blocked%s", sep); 2943 if (!test_bit(Faulty, &flags) && 2944 !test_bit(Journal, &flags) && 2945 !test_bit(In_sync, &flags)) 2946 len += sprintf(page+len, "spare%s", sep); 2947 if (test_bit(WriteErrorSeen, &flags)) 2948 len += sprintf(page+len, "write_error%s", sep); 2949 if (test_bit(WantReplacement, &flags)) 2950 len += sprintf(page+len, "want_replacement%s", sep); 2951 if (test_bit(Replacement, &flags)) 2952 len += sprintf(page+len, "replacement%s", sep); 2953 if (test_bit(ExternalBbl, &flags)) 2954 len += sprintf(page+len, "external_bbl%s", sep); 2955 if (test_bit(FailFast, &flags)) 2956 len += sprintf(page+len, "failfast%s", sep); 2957 2958 if (len) 2959 len -= strlen(sep); 2960 2961 return len+sprintf(page+len, "\n"); 2962 } 2963 2964 static ssize_t 2965 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2966 { 2967 /* can write 2968 * faulty - simulates an error 2969 * remove - disconnects the device 2970 * writemostly - sets write_mostly 2971 * -writemostly - clears write_mostly 2972 * blocked - sets the Blocked flags 2973 * -blocked - clears the Blocked and possibly simulates an error 2974 * insync - sets Insync providing device isn't active 2975 * -insync - clear Insync for a device with a slot assigned, 2976 * so that it gets rebuilt based on bitmap 2977 * write_error - sets WriteErrorSeen 2978 * -write_error - clears WriteErrorSeen 2979 * {,-}failfast - set/clear FailFast 2980 */ 2981 int err = -EINVAL; 2982 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2983 md_error(rdev->mddev, rdev); 2984 if (test_bit(Faulty, &rdev->flags)) 2985 err = 0; 2986 else 2987 err = -EBUSY; 2988 } else if (cmd_match(buf, "remove")) { 2989 if (rdev->mddev->pers) { 2990 clear_bit(Blocked, &rdev->flags); 2991 remove_and_add_spares(rdev->mddev, rdev); 2992 } 2993 if (rdev->raid_disk >= 0) 2994 err = -EBUSY; 2995 else { 2996 struct mddev *mddev = rdev->mddev; 2997 err = 0; 2998 if 
(mddev_is_clustered(mddev)) 2999 err = md_cluster_ops->remove_disk(mddev, rdev); 3000 3001 if (err == 0) { 3002 md_kick_rdev_from_array(rdev); 3003 if (mddev->pers) { 3004 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3005 md_wakeup_thread(mddev->thread); 3006 } 3007 md_new_event(mddev); 3008 } 3009 } 3010 } else if (cmd_match(buf, "writemostly")) { 3011 set_bit(WriteMostly, &rdev->flags); 3012 mddev_create_serial_pool(rdev->mddev, rdev, false); 3013 err = 0; 3014 } else if (cmd_match(buf, "-writemostly")) { 3015 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 3016 clear_bit(WriteMostly, &rdev->flags); 3017 err = 0; 3018 } else if (cmd_match(buf, "blocked")) { 3019 set_bit(Blocked, &rdev->flags); 3020 err = 0; 3021 } else if (cmd_match(buf, "-blocked")) { 3022 if (!test_bit(Faulty, &rdev->flags) && 3023 !test_bit(ExternalBbl, &rdev->flags) && 3024 rdev->badblocks.unacked_exist) { 3025 /* metadata handler doesn't understand badblocks, 3026 * so we need to fail the device 3027 */ 3028 md_error(rdev->mddev, rdev); 3029 } 3030 clear_bit(Blocked, &rdev->flags); 3031 clear_bit(BlockedBadBlocks, &rdev->flags); 3032 wake_up(&rdev->blocked_wait); 3033 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3034 md_wakeup_thread(rdev->mddev->thread); 3035 3036 err = 0; 3037 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3038 set_bit(In_sync, &rdev->flags); 3039 err = 0; 3040 } else if (cmd_match(buf, "failfast")) { 3041 set_bit(FailFast, &rdev->flags); 3042 err = 0; 3043 } else if (cmd_match(buf, "-failfast")) { 3044 clear_bit(FailFast, &rdev->flags); 3045 err = 0; 3046 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3047 !test_bit(Journal, &rdev->flags)) { 3048 if (rdev->mddev->pers == NULL) { 3049 clear_bit(In_sync, &rdev->flags); 3050 rdev->saved_raid_disk = rdev->raid_disk; 3051 rdev->raid_disk = -1; 3052 err = 0; 3053 } 3054 } else if (cmd_match(buf, "write_error")) { 3055 set_bit(WriteErrorSeen, &rdev->flags); 3056 err = 0; 3057 } 
else if (cmd_match(buf, "-write_error")) { 3058 clear_bit(WriteErrorSeen, &rdev->flags); 3059 err = 0; 3060 } else if (cmd_match(buf, "want_replacement")) { 3061 /* Any non-spare device that is not a replacement can 3062 * become want_replacement at any time, but we then need to 3063 * check if recovery is needed. 3064 */ 3065 if (rdev->raid_disk >= 0 && 3066 !test_bit(Journal, &rdev->flags) && 3067 !test_bit(Replacement, &rdev->flags)) 3068 set_bit(WantReplacement, &rdev->flags); 3069 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3070 md_wakeup_thread(rdev->mddev->thread); 3071 err = 0; 3072 } else if (cmd_match(buf, "-want_replacement")) { 3073 /* Clearing 'want_replacement' is always allowed. 3074 * Once replacements starts it is too late though. 3075 */ 3076 err = 0; 3077 clear_bit(WantReplacement, &rdev->flags); 3078 } else if (cmd_match(buf, "replacement")) { 3079 /* Can only set a device as a replacement when array has not 3080 * yet been started. Once running, replacement is automatic 3081 * from spares, or by assigning 'slot'. 3082 */ 3083 if (rdev->mddev->pers) 3084 err = -EBUSY; 3085 else { 3086 set_bit(Replacement, &rdev->flags); 3087 err = 0; 3088 } 3089 } else if (cmd_match(buf, "-replacement")) { 3090 /* Similarly, can only clear Replacement before start */ 3091 if (rdev->mddev->pers) 3092 err = -EBUSY; 3093 else { 3094 clear_bit(Replacement, &rdev->flags); 3095 err = 0; 3096 } 3097 } else if (cmd_match(buf, "re-add")) { 3098 if (!rdev->mddev->pers) 3099 err = -EINVAL; 3100 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3101 rdev->saved_raid_disk >= 0) { 3102 /* clear_bit is performed _after_ all the devices 3103 * have their local Faulty bit cleared. 
If any writes 3104 * happen in the meantime in the local node, they 3105 * will land in the local bitmap, which will be synced 3106 * by this node eventually 3107 */ 3108 if (!mddev_is_clustered(rdev->mddev) || 3109 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3110 clear_bit(Faulty, &rdev->flags); 3111 err = add_bound_rdev(rdev); 3112 } 3113 } else 3114 err = -EBUSY; 3115 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3116 set_bit(ExternalBbl, &rdev->flags); 3117 rdev->badblocks.shift = 0; 3118 err = 0; 3119 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3120 clear_bit(ExternalBbl, &rdev->flags); 3121 err = 0; 3122 } 3123 if (!err) 3124 sysfs_notify_dirent_safe(rdev->sysfs_state); 3125 return err ? err : len; 3126 } 3127 static struct rdev_sysfs_entry rdev_state = 3128 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3129 3130 static ssize_t 3131 errors_show(struct md_rdev *rdev, char *page) 3132 { 3133 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3134 } 3135 3136 static ssize_t 3137 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3138 { 3139 unsigned int n; 3140 int rv; 3141 3142 rv = kstrtouint(buf, 10, &n); 3143 if (rv < 0) 3144 return rv; 3145 atomic_set(&rdev->corrected_errors, n); 3146 return len; 3147 } 3148 static struct rdev_sysfs_entry rdev_errors = 3149 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3150 3151 static ssize_t 3152 slot_show(struct md_rdev *rdev, char *page) 3153 { 3154 if (test_bit(Journal, &rdev->flags)) 3155 return sprintf(page, "journal\n"); 3156 else if (rdev->raid_disk < 0) 3157 return sprintf(page, "none\n"); 3158 else 3159 return sprintf(page, "%d\n", rdev->raid_disk); 3160 } 3161 3162 static ssize_t 3163 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3164 { 3165 int slot; 3166 int err; 3167 3168 if (test_bit(Journal, &rdev->flags)) 3169 return -EBUSY; 3170 if (strncmp(buf, "none", 4)==0) 
3171 slot = -1; 3172 else { 3173 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3174 if (err < 0) 3175 return err; 3176 } 3177 if (rdev->mddev->pers && slot == -1) { 3178 /* Setting 'slot' on an active array requires also 3179 * updating the 'rd%d' link, and communicating 3180 * with the personality with ->hot_*_disk. 3181 * For now we only support removing 3182 * failed/spare devices. This normally happens automatically, 3183 * but not when the metadata is externally managed. 3184 */ 3185 if (rdev->raid_disk == -1) 3186 return -EEXIST; 3187 /* personality does all needed checks */ 3188 if (rdev->mddev->pers->hot_remove_disk == NULL) 3189 return -EINVAL; 3190 clear_bit(Blocked, &rdev->flags); 3191 remove_and_add_spares(rdev->mddev, rdev); 3192 if (rdev->raid_disk >= 0) 3193 return -EBUSY; 3194 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3195 md_wakeup_thread(rdev->mddev->thread); 3196 } else if (rdev->mddev->pers) { 3197 /* Activating a spare .. or possibly reactivating 3198 * if we ever get bitmaps working here. 3199 */ 3200 int err; 3201 3202 if (rdev->raid_disk != -1) 3203 return -EBUSY; 3204 3205 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3206 return -EBUSY; 3207 3208 if (rdev->mddev->pers->hot_add_disk == NULL) 3209 return -EINVAL; 3210 3211 if (slot >= rdev->mddev->raid_disks && 3212 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3213 return -ENOSPC; 3214 3215 rdev->raid_disk = slot; 3216 if (test_bit(In_sync, &rdev->flags)) 3217 rdev->saved_raid_disk = slot; 3218 else 3219 rdev->saved_raid_disk = -1; 3220 clear_bit(In_sync, &rdev->flags); 3221 clear_bit(Bitmap_sync, &rdev->flags); 3222 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3223 if (err) { 3224 rdev->raid_disk = -1; 3225 return err; 3226 } else 3227 sysfs_notify_dirent_safe(rdev->sysfs_state); 3228 /* failure here is OK */; 3229 sysfs_link_rdev(rdev->mddev, rdev); 3230 /* don't wakeup anyone, leave that to userspace. 
*/ 3231 } else { 3232 if (slot >= rdev->mddev->raid_disks && 3233 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3234 return -ENOSPC; 3235 rdev->raid_disk = slot; 3236 /* assume it is working */ 3237 clear_bit(Faulty, &rdev->flags); 3238 clear_bit(WriteMostly, &rdev->flags); 3239 set_bit(In_sync, &rdev->flags); 3240 sysfs_notify_dirent_safe(rdev->sysfs_state); 3241 } 3242 return len; 3243 } 3244 3245 static struct rdev_sysfs_entry rdev_slot = 3246 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3247 3248 static ssize_t 3249 offset_show(struct md_rdev *rdev, char *page) 3250 { 3251 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3252 } 3253 3254 static ssize_t 3255 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3256 { 3257 unsigned long long offset; 3258 if (kstrtoull(buf, 10, &offset) < 0) 3259 return -EINVAL; 3260 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3261 return -EBUSY; 3262 if (rdev->sectors && rdev->mddev->external) 3263 /* Must set offset before size, so overlap checks 3264 * can be sane */ 3265 return -EBUSY; 3266 rdev->data_offset = offset; 3267 rdev->new_data_offset = offset; 3268 return len; 3269 } 3270 3271 static struct rdev_sysfs_entry rdev_offset = 3272 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3273 3274 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3275 { 3276 return sprintf(page, "%llu\n", 3277 (unsigned long long)rdev->new_data_offset); 3278 } 3279 3280 static ssize_t new_offset_store(struct md_rdev *rdev, 3281 const char *buf, size_t len) 3282 { 3283 unsigned long long new_offset; 3284 struct mddev *mddev = rdev->mddev; 3285 3286 if (kstrtoull(buf, 10, &new_offset) < 0) 3287 return -EINVAL; 3288 3289 if (mddev->sync_thread || 3290 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3291 return -EBUSY; 3292 if (new_offset == rdev->data_offset) 3293 /* reset is always permitted */ 3294 ; 3295 else if (new_offset > rdev->data_offset) { 3296 /* must not 
push array size beyond rdev_sectors */ 3297 if (new_offset - rdev->data_offset 3298 + mddev->dev_sectors > rdev->sectors) 3299 return -E2BIG; 3300 } 3301 /* Metadata worries about other space details. */ 3302 3303 /* decreasing the offset is inconsistent with a backwards 3304 * reshape. 3305 */ 3306 if (new_offset < rdev->data_offset && 3307 mddev->reshape_backwards) 3308 return -EINVAL; 3309 /* Increasing offset is inconsistent with forwards 3310 * reshape. reshape_direction should be set to 3311 * 'backwards' first. 3312 */ 3313 if (new_offset > rdev->data_offset && 3314 !mddev->reshape_backwards) 3315 return -EINVAL; 3316 3317 if (mddev->pers && mddev->persistent && 3318 !super_types[mddev->major_version] 3319 .allow_new_offset(rdev, new_offset)) 3320 return -E2BIG; 3321 rdev->new_data_offset = new_offset; 3322 if (new_offset > rdev->data_offset) 3323 mddev->reshape_backwards = 1; 3324 else if (new_offset < rdev->data_offset) 3325 mddev->reshape_backwards = 0; 3326 3327 return len; 3328 } 3329 static struct rdev_sysfs_entry rdev_new_offset = 3330 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3331 3332 static ssize_t 3333 rdev_size_show(struct md_rdev *rdev, char *page) 3334 { 3335 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3336 } 3337 3338 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 3339 { 3340 /* check if two start/length pairs overlap */ 3341 if (s1+l1 <= s2) 3342 return 0; 3343 if (s2+l2 <= s1) 3344 return 0; 3345 return 1; 3346 } 3347 3348 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3349 { 3350 unsigned long long blocks; 3351 sector_t new; 3352 3353 if (kstrtoull(buf, 10, &blocks) < 0) 3354 return -EINVAL; 3355 3356 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3357 return -EINVAL; /* sector conversion overflow */ 3358 3359 new = blocks * 2; 3360 if (new != blocks * 2) 3361 return -EINVAL; /* unsigned long long to sector_t overflow */ 3362 3363 
*sectors = new; 3364 return 0; 3365 } 3366 3367 static ssize_t 3368 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3369 { 3370 struct mddev *my_mddev = rdev->mddev; 3371 sector_t oldsectors = rdev->sectors; 3372 sector_t sectors; 3373 3374 if (test_bit(Journal, &rdev->flags)) 3375 return -EBUSY; 3376 if (strict_blocks_to_sectors(buf, §ors) < 0) 3377 return -EINVAL; 3378 if (rdev->data_offset != rdev->new_data_offset) 3379 return -EINVAL; /* too confusing */ 3380 if (my_mddev->pers && rdev->raid_disk >= 0) { 3381 if (my_mddev->persistent) { 3382 sectors = super_types[my_mddev->major_version]. 3383 rdev_size_change(rdev, sectors); 3384 if (!sectors) 3385 return -EBUSY; 3386 } else if (!sectors) 3387 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 3388 rdev->data_offset; 3389 if (!my_mddev->pers->resize) 3390 /* Cannot change size for RAID0 or Linear etc */ 3391 return -EINVAL; 3392 } 3393 if (sectors < my_mddev->dev_sectors) 3394 return -EINVAL; /* component must fit device */ 3395 3396 rdev->sectors = sectors; 3397 if (sectors > oldsectors && my_mddev->external) { 3398 /* Need to check that all other rdevs with the same 3399 * ->bdev do not overlap. 'rcu' is sufficient to walk 3400 * the rdev lists safely. 3401 * This check does not provide a hard guarantee, it 3402 * just helps avoid dangerous mistakes. 3403 */ 3404 struct mddev *mddev; 3405 int overlap = 0; 3406 struct list_head *tmp; 3407 3408 rcu_read_lock(); 3409 for_each_mddev(mddev, tmp) { 3410 struct md_rdev *rdev2; 3411 3412 rdev_for_each(rdev2, mddev) 3413 if (rdev->bdev == rdev2->bdev && 3414 rdev != rdev2 && 3415 overlaps(rdev->data_offset, rdev->sectors, 3416 rdev2->data_offset, 3417 rdev2->sectors)) { 3418 overlap = 1; 3419 break; 3420 } 3421 if (overlap) { 3422 mddev_put(mddev); 3423 break; 3424 } 3425 } 3426 rcu_read_unlock(); 3427 if (overlap) { 3428 /* Someone else could have slipped in a size 3429 * change here, but doing so is just silly. 
3430 * We put oldsectors back because we *know* it is 3431 * safe, and trust userspace not to race with 3432 * itself 3433 */ 3434 rdev->sectors = oldsectors; 3435 return -EBUSY; 3436 } 3437 } 3438 return len; 3439 } 3440 3441 static struct rdev_sysfs_entry rdev_size = 3442 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3443 3444 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3445 { 3446 unsigned long long recovery_start = rdev->recovery_offset; 3447 3448 if (test_bit(In_sync, &rdev->flags) || 3449 recovery_start == MaxSector) 3450 return sprintf(page, "none\n"); 3451 3452 return sprintf(page, "%llu\n", recovery_start); 3453 } 3454 3455 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3456 { 3457 unsigned long long recovery_start; 3458 3459 if (cmd_match(buf, "none")) 3460 recovery_start = MaxSector; 3461 else if (kstrtoull(buf, 10, &recovery_start)) 3462 return -EINVAL; 3463 3464 if (rdev->mddev->pers && 3465 rdev->raid_disk >= 0) 3466 return -EBUSY; 3467 3468 rdev->recovery_offset = recovery_start; 3469 if (recovery_start == MaxSector) 3470 set_bit(In_sync, &rdev->flags); 3471 else 3472 clear_bit(In_sync, &rdev->flags); 3473 return len; 3474 } 3475 3476 static struct rdev_sysfs_entry rdev_recovery_start = 3477 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3478 3479 /* sysfs access to bad-blocks list. 3480 * We present two files. 3481 * 'bad-blocks' lists sector numbers and lengths of ranges that 3482 * are recorded as bad. The list is truncated to fit within 3483 * the one-page limit of sysfs. 3484 * Writing "sector length" to this file adds an acknowledged 3485 * bad block list. 3486 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3487 * been acknowledged. Writing to this file adds bad blocks 3488 * without acknowledging them. This is largely for testing. 
3489 */ 3490 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3491 { 3492 return badblocks_show(&rdev->badblocks, page, 0); 3493 } 3494 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3495 { 3496 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3497 /* Maybe that ack was all we needed */ 3498 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3499 wake_up(&rdev->blocked_wait); 3500 return rv; 3501 } 3502 static struct rdev_sysfs_entry rdev_bad_blocks = 3503 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3504 3505 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3506 { 3507 return badblocks_show(&rdev->badblocks, page, 1); 3508 } 3509 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3510 { 3511 return badblocks_store(&rdev->badblocks, page, len, 1); 3512 } 3513 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3514 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3515 3516 static ssize_t 3517 ppl_sector_show(struct md_rdev *rdev, char *page) 3518 { 3519 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3520 } 3521 3522 static ssize_t 3523 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3524 { 3525 unsigned long long sector; 3526 3527 if (kstrtoull(buf, 10, §or) < 0) 3528 return -EINVAL; 3529 if (sector != (sector_t)sector) 3530 return -EINVAL; 3531 3532 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3533 rdev->raid_disk >= 0) 3534 return -EBUSY; 3535 3536 if (rdev->mddev->persistent) { 3537 if (rdev->mddev->major_version == 0) 3538 return -EINVAL; 3539 if ((sector > rdev->sb_start && 3540 sector - rdev->sb_start > S16_MAX) || 3541 (sector < rdev->sb_start && 3542 rdev->sb_start - sector > -S16_MIN)) 3543 return -EINVAL; 3544 rdev->ppl.offset = sector - rdev->sb_start; 3545 } else if (!rdev->mddev->external) { 3546 return -EBUSY; 3547 } 3548 rdev->ppl.sector = sector; 3549 return 
len; 3550 } 3551 3552 static struct rdev_sysfs_entry rdev_ppl_sector = 3553 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3554 3555 static ssize_t 3556 ppl_size_show(struct md_rdev *rdev, char *page) 3557 { 3558 return sprintf(page, "%u\n", rdev->ppl.size); 3559 } 3560 3561 static ssize_t 3562 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3563 { 3564 unsigned int size; 3565 3566 if (kstrtouint(buf, 10, &size) < 0) 3567 return -EINVAL; 3568 3569 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3570 rdev->raid_disk >= 0) 3571 return -EBUSY; 3572 3573 if (rdev->mddev->persistent) { 3574 if (rdev->mddev->major_version == 0) 3575 return -EINVAL; 3576 if (size > U16_MAX) 3577 return -EINVAL; 3578 } else if (!rdev->mddev->external) { 3579 return -EBUSY; 3580 } 3581 rdev->ppl.size = size; 3582 return len; 3583 } 3584 3585 static struct rdev_sysfs_entry rdev_ppl_size = 3586 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3587 3588 static struct attribute *rdev_default_attrs[] = { 3589 &rdev_state.attr, 3590 &rdev_errors.attr, 3591 &rdev_slot.attr, 3592 &rdev_offset.attr, 3593 &rdev_new_offset.attr, 3594 &rdev_size.attr, 3595 &rdev_recovery_start.attr, 3596 &rdev_bad_blocks.attr, 3597 &rdev_unack_bad_blocks.attr, 3598 &rdev_ppl_sector.attr, 3599 &rdev_ppl_size.attr, 3600 NULL, 3601 }; 3602 static ssize_t 3603 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3604 { 3605 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3606 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3607 3608 if (!entry->show) 3609 return -EIO; 3610 if (!rdev->mddev) 3611 return -ENODEV; 3612 return entry->show(rdev, page); 3613 } 3614 3615 static ssize_t 3616 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3617 const char *page, size_t length) 3618 { 3619 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, 
attr); 3620 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3621 ssize_t rv; 3622 struct mddev *mddev = rdev->mddev; 3623 3624 if (!entry->store) 3625 return -EIO; 3626 if (!capable(CAP_SYS_ADMIN)) 3627 return -EACCES; 3628 rv = mddev ? mddev_lock(mddev) : -ENODEV; 3629 if (!rv) { 3630 if (rdev->mddev == NULL) 3631 rv = -ENODEV; 3632 else 3633 rv = entry->store(rdev, page, length); 3634 mddev_unlock(mddev); 3635 } 3636 return rv; 3637 } 3638 3639 static void rdev_free(struct kobject *ko) 3640 { 3641 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3642 kfree(rdev); 3643 } 3644 static const struct sysfs_ops rdev_sysfs_ops = { 3645 .show = rdev_attr_show, 3646 .store = rdev_attr_store, 3647 }; 3648 static struct kobj_type rdev_ktype = { 3649 .release = rdev_free, 3650 .sysfs_ops = &rdev_sysfs_ops, 3651 .default_attrs = rdev_default_attrs, 3652 }; 3653 3654 int md_rdev_init(struct md_rdev *rdev) 3655 { 3656 rdev->desc_nr = -1; 3657 rdev->saved_raid_disk = -1; 3658 rdev->raid_disk = -1; 3659 rdev->flags = 0; 3660 rdev->data_offset = 0; 3661 rdev->new_data_offset = 0; 3662 rdev->sb_events = 0; 3663 rdev->last_read_error = 0; 3664 rdev->sb_loaded = 0; 3665 rdev->bb_page = NULL; 3666 atomic_set(&rdev->nr_pending, 0); 3667 atomic_set(&rdev->read_errors, 0); 3668 atomic_set(&rdev->corrected_errors, 0); 3669 3670 INIT_LIST_HEAD(&rdev->same_set); 3671 init_waitqueue_head(&rdev->blocked_wait); 3672 3673 /* Add space to store bad block list. 3674 * This reserves the space even on arrays where it cannot 3675 * be used - I wonder if that matters 3676 */ 3677 return badblocks_init(&rdev->badblocks, 0); 3678 } 3679 EXPORT_SYMBOL_GPL(md_rdev_init); 3680 /* 3681 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3682 * 3683 * mark the device faulty if: 3684 * 3685 * - the device is nonexistent (zero size) 3686 * - the device has no valid superblock 3687 * 3688 * a faulty rdev _never_ has rdev->sb set. 
3689 */ 3690 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3691 { 3692 char b[BDEVNAME_SIZE]; 3693 int err; 3694 struct md_rdev *rdev; 3695 sector_t size; 3696 3697 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3698 if (!rdev) 3699 return ERR_PTR(-ENOMEM); 3700 3701 err = md_rdev_init(rdev); 3702 if (err) 3703 goto abort_free; 3704 err = alloc_disk_sb(rdev); 3705 if (err) 3706 goto abort_free; 3707 3708 err = lock_rdev(rdev, newdev, super_format == -2); 3709 if (err) 3710 goto abort_free; 3711 3712 kobject_init(&rdev->kobj, &rdev_ktype); 3713 3714 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3715 if (!size) { 3716 pr_warn("md: %s has zero or unknown size, marking faulty!\n", 3717 bdevname(rdev->bdev,b)); 3718 err = -EINVAL; 3719 goto abort_free; 3720 } 3721 3722 if (super_format >= 0) { 3723 err = super_types[super_format]. 3724 load_super(rdev, NULL, super_minor); 3725 if (err == -EINVAL) { 3726 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", 3727 bdevname(rdev->bdev,b), 3728 super_format, super_minor); 3729 goto abort_free; 3730 } 3731 if (err < 0) { 3732 pr_warn("md: could not read %s's sb, not importing!\n", 3733 bdevname(rdev->bdev,b)); 3734 goto abort_free; 3735 } 3736 } 3737 3738 return rdev; 3739 3740 abort_free: 3741 if (rdev->bdev) 3742 unlock_rdev(rdev); 3743 md_rdev_clear(rdev); 3744 kfree(rdev); 3745 return ERR_PTR(err); 3746 } 3747 3748 /* 3749 * Check a full RAID array for plausibility 3750 */ 3751 3752 static int analyze_sbs(struct mddev *mddev) 3753 { 3754 int i; 3755 struct md_rdev *rdev, *freshest, *tmp; 3756 char b[BDEVNAME_SIZE]; 3757 3758 freshest = NULL; 3759 rdev_for_each_safe(rdev, tmp, mddev) 3760 switch (super_types[mddev->major_version]. 
3761 load_super(rdev, freshest, mddev->minor_version)) { 3762 case 1: 3763 freshest = rdev; 3764 break; 3765 case 0: 3766 break; 3767 default: 3768 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", 3769 bdevname(rdev->bdev,b)); 3770 md_kick_rdev_from_array(rdev); 3771 } 3772 3773 /* Cannot find a valid fresh disk */ 3774 if (!freshest) { 3775 pr_warn("md: cannot find a valid disk\n"); 3776 return -EINVAL; 3777 } 3778 3779 super_types[mddev->major_version]. 3780 validate_super(mddev, freshest); 3781 3782 i = 0; 3783 rdev_for_each_safe(rdev, tmp, mddev) { 3784 if (mddev->max_disks && 3785 (rdev->desc_nr >= mddev->max_disks || 3786 i > mddev->max_disks)) { 3787 pr_warn("md: %s: %s: only %d devices permitted\n", 3788 mdname(mddev), bdevname(rdev->bdev, b), 3789 mddev->max_disks); 3790 md_kick_rdev_from_array(rdev); 3791 continue; 3792 } 3793 if (rdev != freshest) { 3794 if (super_types[mddev->major_version]. 3795 validate_super(mddev, rdev)) { 3796 pr_warn("md: kicking non-fresh %s from array!\n", 3797 bdevname(rdev->bdev,b)); 3798 md_kick_rdev_from_array(rdev); 3799 continue; 3800 } 3801 } 3802 if (mddev->level == LEVEL_MULTIPATH) { 3803 rdev->desc_nr = i++; 3804 rdev->raid_disk = rdev->desc_nr; 3805 set_bit(In_sync, &rdev->flags); 3806 } else if (rdev->raid_disk >= 3807 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3808 !test_bit(Journal, &rdev->flags)) { 3809 rdev->raid_disk = -1; 3810 clear_bit(In_sync, &rdev->flags); 3811 } 3812 } 3813 3814 return 0; 3815 } 3816 3817 /* Read a fixed-point number. 3818 * Numbers in sysfs attributes should be in "standard" units where 3819 * possible, so time should be in seconds. 3820 * However we internally use a a much smaller unit such as 3821 * milliseconds or jiffies. 3822 * This function takes a decimal number with a possible fractional 3823 * component, and produces an integer which is the result of 3824 * multiplying that number by 10^'scale'. 
3825 * all without any floating-point arithmetic. 3826 */ 3827 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3828 { 3829 unsigned long result = 0; 3830 long decimals = -1; 3831 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3832 if (*cp == '.') 3833 decimals = 0; 3834 else if (decimals < scale) { 3835 unsigned int value; 3836 value = *cp - '0'; 3837 result = result * 10 + value; 3838 if (decimals >= 0) 3839 decimals++; 3840 } 3841 cp++; 3842 } 3843 if (*cp == '\n') 3844 cp++; 3845 if (*cp) 3846 return -EINVAL; 3847 if (decimals < 0) 3848 decimals = 0; 3849 *res = result * int_pow(10, scale - decimals); 3850 return 0; 3851 } 3852 3853 static ssize_t 3854 safe_delay_show(struct mddev *mddev, char *page) 3855 { 3856 int msec = (mddev->safemode_delay*1000)/HZ; 3857 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3858 } 3859 static ssize_t 3860 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3861 { 3862 unsigned long msec; 3863 3864 if (mddev_is_clustered(mddev)) { 3865 pr_warn("md: Safemode is disabled for clustered mode\n"); 3866 return -EINVAL; 3867 } 3868 3869 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3870 return -EINVAL; 3871 if (msec == 0) 3872 mddev->safemode_delay = 0; 3873 else { 3874 unsigned long old_delay = mddev->safemode_delay; 3875 unsigned long new_delay = (msec*HZ)/1000; 3876 3877 if (new_delay == 0) 3878 new_delay = 1; 3879 mddev->safemode_delay = new_delay; 3880 if (new_delay < old_delay || old_delay == 0) 3881 mod_timer(&mddev->safemode_timer, jiffies+1); 3882 } 3883 return len; 3884 } 3885 static struct md_sysfs_entry md_safe_delay = 3886 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3887 3888 static ssize_t 3889 level_show(struct mddev *mddev, char *page) 3890 { 3891 struct md_personality *p; 3892 int ret; 3893 spin_lock(&mddev->lock); 3894 p = mddev->pers; 3895 if (p) 3896 ret = sprintf(page, "%s\n", p->name); 3897 else if (mddev->clevel[0]) 3898 ret 
= sprintf(page, "%s\n", mddev->clevel); 3899 else if (mddev->level != LEVEL_NONE) 3900 ret = sprintf(page, "%d\n", mddev->level); 3901 else 3902 ret = 0; 3903 spin_unlock(&mddev->lock); 3904 return ret; 3905 } 3906 3907 static ssize_t 3908 level_store(struct mddev *mddev, const char *buf, size_t len) 3909 { 3910 char clevel[16]; 3911 ssize_t rv; 3912 size_t slen = len; 3913 struct md_personality *pers, *oldpers; 3914 long level; 3915 void *priv, *oldpriv; 3916 struct md_rdev *rdev; 3917 3918 if (slen == 0 || slen >= sizeof(clevel)) 3919 return -EINVAL; 3920 3921 rv = mddev_lock(mddev); 3922 if (rv) 3923 return rv; 3924 3925 if (mddev->pers == NULL) { 3926 strncpy(mddev->clevel, buf, slen); 3927 if (mddev->clevel[slen-1] == '\n') 3928 slen--; 3929 mddev->clevel[slen] = 0; 3930 mddev->level = LEVEL_NONE; 3931 rv = len; 3932 goto out_unlock; 3933 } 3934 rv = -EROFS; 3935 if (mddev->ro) 3936 goto out_unlock; 3937 3938 /* request to change the personality. Need to ensure: 3939 * - array is not engaged in resync/recovery/reshape 3940 * - old personality can be suspended 3941 * - new personality will access other array. 
3942 */ 3943 3944 rv = -EBUSY; 3945 if (mddev->sync_thread || 3946 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3947 mddev->reshape_position != MaxSector || 3948 mddev->sysfs_active) 3949 goto out_unlock; 3950 3951 rv = -EINVAL; 3952 if (!mddev->pers->quiesce) { 3953 pr_warn("md: %s: %s does not support online personality change\n", 3954 mdname(mddev), mddev->pers->name); 3955 goto out_unlock; 3956 } 3957 3958 /* Now find the new personality */ 3959 strncpy(clevel, buf, slen); 3960 if (clevel[slen-1] == '\n') 3961 slen--; 3962 clevel[slen] = 0; 3963 if (kstrtol(clevel, 10, &level)) 3964 level = LEVEL_NONE; 3965 3966 if (request_module("md-%s", clevel) != 0) 3967 request_module("md-level-%s", clevel); 3968 spin_lock(&pers_lock); 3969 pers = find_pers(level, clevel); 3970 if (!pers || !try_module_get(pers->owner)) { 3971 spin_unlock(&pers_lock); 3972 pr_warn("md: personality %s not loaded\n", clevel); 3973 rv = -EINVAL; 3974 goto out_unlock; 3975 } 3976 spin_unlock(&pers_lock); 3977 3978 if (pers == mddev->pers) { 3979 /* Nothing to do! */ 3980 module_put(pers->owner); 3981 rv = len; 3982 goto out_unlock; 3983 } 3984 if (!pers->takeover) { 3985 module_put(pers->owner); 3986 pr_warn("md: %s: %s does not support personality takeover\n", 3987 mdname(mddev), clevel); 3988 rv = -EINVAL; 3989 goto out_unlock; 3990 } 3991 3992 rdev_for_each(rdev, mddev) 3993 rdev->new_raid_disk = rdev->raid_disk; 3994 3995 /* ->takeover must set new_* and/or delta_disks 3996 * if it succeeds, and may set them when it fails. 
3997 */ 3998 priv = pers->takeover(mddev); 3999 if (IS_ERR(priv)) { 4000 mddev->new_level = mddev->level; 4001 mddev->new_layout = mddev->layout; 4002 mddev->new_chunk_sectors = mddev->chunk_sectors; 4003 mddev->raid_disks -= mddev->delta_disks; 4004 mddev->delta_disks = 0; 4005 mddev->reshape_backwards = 0; 4006 module_put(pers->owner); 4007 pr_warn("md: %s: %s would not accept array\n", 4008 mdname(mddev), clevel); 4009 rv = PTR_ERR(priv); 4010 goto out_unlock; 4011 } 4012 4013 /* Looks like we have a winner */ 4014 mddev_suspend(mddev); 4015 mddev_detach(mddev); 4016 4017 spin_lock(&mddev->lock); 4018 oldpers = mddev->pers; 4019 oldpriv = mddev->private; 4020 mddev->pers = pers; 4021 mddev->private = priv; 4022 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4023 mddev->level = mddev->new_level; 4024 mddev->layout = mddev->new_layout; 4025 mddev->chunk_sectors = mddev->new_chunk_sectors; 4026 mddev->delta_disks = 0; 4027 mddev->reshape_backwards = 0; 4028 mddev->degraded = 0; 4029 spin_unlock(&mddev->lock); 4030 4031 if (oldpers->sync_request == NULL && 4032 mddev->external) { 4033 /* We are converting from a no-redundancy array 4034 * to a redundancy array and metadata is managed 4035 * externally so we need to be sure that writes 4036 * won't block due to a need to transition 4037 * clean->dirty 4038 * until external management is started. 
4039 */ 4040 mddev->in_sync = 0; 4041 mddev->safemode_delay = 0; 4042 mddev->safemode = 0; 4043 } 4044 4045 oldpers->free(mddev, oldpriv); 4046 4047 if (oldpers->sync_request == NULL && 4048 pers->sync_request != NULL) { 4049 /* need to add the md_redundancy_group */ 4050 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4051 pr_warn("md: cannot register extra attributes for %s\n", 4052 mdname(mddev)); 4053 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4054 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4055 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4056 } 4057 if (oldpers->sync_request != NULL && 4058 pers->sync_request == NULL) { 4059 /* need to remove the md_redundancy_group */ 4060 if (mddev->to_remove == NULL) 4061 mddev->to_remove = &md_redundancy_group; 4062 } 4063 4064 module_put(oldpers->owner); 4065 4066 rdev_for_each(rdev, mddev) { 4067 if (rdev->raid_disk < 0) 4068 continue; 4069 if (rdev->new_raid_disk >= mddev->raid_disks) 4070 rdev->new_raid_disk = -1; 4071 if (rdev->new_raid_disk == rdev->raid_disk) 4072 continue; 4073 sysfs_unlink_rdev(mddev, rdev); 4074 } 4075 rdev_for_each(rdev, mddev) { 4076 if (rdev->raid_disk < 0) 4077 continue; 4078 if (rdev->new_raid_disk == rdev->raid_disk) 4079 continue; 4080 rdev->raid_disk = rdev->new_raid_disk; 4081 if (rdev->raid_disk < 0) 4082 clear_bit(In_sync, &rdev->flags); 4083 else { 4084 if (sysfs_link_rdev(mddev, rdev)) 4085 pr_warn("md: cannot register rd%d for %s after level change\n", 4086 rdev->raid_disk, mdname(mddev)); 4087 } 4088 } 4089 4090 if (pers->sync_request == NULL) { 4091 /* this is now an array without redundancy, so 4092 * it must always be in_sync 4093 */ 4094 mddev->in_sync = 1; 4095 del_timer_sync(&mddev->safemode_timer); 4096 } 4097 blk_set_stacking_limits(&mddev->queue->limits); 4098 pers->run(mddev); 4099 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4100 mddev_resume(mddev); 4101 if 
(!mddev->thread) 4102 md_update_sb(mddev, 1); 4103 sysfs_notify_dirent_safe(mddev->sysfs_level); 4104 md_new_event(mddev); 4105 rv = len; 4106 out_unlock: 4107 mddev_unlock(mddev); 4108 return rv; 4109 } 4110 4111 static struct md_sysfs_entry md_level = 4112 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4113 4114 static ssize_t 4115 layout_show(struct mddev *mddev, char *page) 4116 { 4117 /* just a number, not meaningful for all levels */ 4118 if (mddev->reshape_position != MaxSector && 4119 mddev->layout != mddev->new_layout) 4120 return sprintf(page, "%d (%d)\n", 4121 mddev->new_layout, mddev->layout); 4122 return sprintf(page, "%d\n", mddev->layout); 4123 } 4124 4125 static ssize_t 4126 layout_store(struct mddev *mddev, const char *buf, size_t len) 4127 { 4128 unsigned int n; 4129 int err; 4130 4131 err = kstrtouint(buf, 10, &n); 4132 if (err < 0) 4133 return err; 4134 err = mddev_lock(mddev); 4135 if (err) 4136 return err; 4137 4138 if (mddev->pers) { 4139 if (mddev->pers->check_reshape == NULL) 4140 err = -EBUSY; 4141 else if (mddev->ro) 4142 err = -EROFS; 4143 else { 4144 mddev->new_layout = n; 4145 err = mddev->pers->check_reshape(mddev); 4146 if (err) 4147 mddev->new_layout = mddev->layout; 4148 } 4149 } else { 4150 mddev->new_layout = n; 4151 if (mddev->reshape_position == MaxSector) 4152 mddev->layout = n; 4153 } 4154 mddev_unlock(mddev); 4155 return err ?: len; 4156 } 4157 static struct md_sysfs_entry md_layout = 4158 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4159 4160 static ssize_t 4161 raid_disks_show(struct mddev *mddev, char *page) 4162 { 4163 if (mddev->raid_disks == 0) 4164 return 0; 4165 if (mddev->reshape_position != MaxSector && 4166 mddev->delta_disks != 0) 4167 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4168 mddev->raid_disks - mddev->delta_disks); 4169 return sprintf(page, "%d\n", mddev->raid_disks); 4170 } 4171 4172 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4173 4174 static 
ssize_t 4175 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4176 { 4177 unsigned int n; 4178 int err; 4179 4180 err = kstrtouint(buf, 10, &n); 4181 if (err < 0) 4182 return err; 4183 4184 err = mddev_lock(mddev); 4185 if (err) 4186 return err; 4187 if (mddev->pers) 4188 err = update_raid_disks(mddev, n); 4189 else if (mddev->reshape_position != MaxSector) { 4190 struct md_rdev *rdev; 4191 int olddisks = mddev->raid_disks - mddev->delta_disks; 4192 4193 err = -EINVAL; 4194 rdev_for_each(rdev, mddev) { 4195 if (olddisks < n && 4196 rdev->data_offset < rdev->new_data_offset) 4197 goto out_unlock; 4198 if (olddisks > n && 4199 rdev->data_offset > rdev->new_data_offset) 4200 goto out_unlock; 4201 } 4202 err = 0; 4203 mddev->delta_disks = n - olddisks; 4204 mddev->raid_disks = n; 4205 mddev->reshape_backwards = (mddev->delta_disks < 0); 4206 } else 4207 mddev->raid_disks = n; 4208 out_unlock: 4209 mddev_unlock(mddev); 4210 return err ? err : len; 4211 } 4212 static struct md_sysfs_entry md_raid_disks = 4213 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4214 4215 static ssize_t 4216 uuid_show(struct mddev *mddev, char *page) 4217 { 4218 return sprintf(page, "%pU\n", mddev->uuid); 4219 } 4220 static struct md_sysfs_entry md_uuid = 4221 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4222 4223 static ssize_t 4224 chunk_size_show(struct mddev *mddev, char *page) 4225 { 4226 if (mddev->reshape_position != MaxSector && 4227 mddev->chunk_sectors != mddev->new_chunk_sectors) 4228 return sprintf(page, "%d (%d)\n", 4229 mddev->new_chunk_sectors << 9, 4230 mddev->chunk_sectors << 9); 4231 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4232 } 4233 4234 static ssize_t 4235 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4236 { 4237 unsigned long n; 4238 int err; 4239 4240 err = kstrtoul(buf, 10, &n); 4241 if (err < 0) 4242 return err; 4243 4244 err = mddev_lock(mddev); 4245 if (err) 4246 return err; 4247 if 
(mddev->pers) { 4248 if (mddev->pers->check_reshape == NULL) 4249 err = -EBUSY; 4250 else if (mddev->ro) 4251 err = -EROFS; 4252 else { 4253 mddev->new_chunk_sectors = n >> 9; 4254 err = mddev->pers->check_reshape(mddev); 4255 if (err) 4256 mddev->new_chunk_sectors = mddev->chunk_sectors; 4257 } 4258 } else { 4259 mddev->new_chunk_sectors = n >> 9; 4260 if (mddev->reshape_position == MaxSector) 4261 mddev->chunk_sectors = n >> 9; 4262 } 4263 mddev_unlock(mddev); 4264 return err ?: len; 4265 } 4266 static struct md_sysfs_entry md_chunk_size = 4267 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4268 4269 static ssize_t 4270 resync_start_show(struct mddev *mddev, char *page) 4271 { 4272 if (mddev->recovery_cp == MaxSector) 4273 return sprintf(page, "none\n"); 4274 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4275 } 4276 4277 static ssize_t 4278 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4279 { 4280 unsigned long long n; 4281 int err; 4282 4283 if (cmd_match(buf, "none")) 4284 n = MaxSector; 4285 else { 4286 err = kstrtoull(buf, 10, &n); 4287 if (err < 0) 4288 return err; 4289 if (n != (sector_t)n) 4290 return -EINVAL; 4291 } 4292 4293 err = mddev_lock(mddev); 4294 if (err) 4295 return err; 4296 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4297 err = -EBUSY; 4298 4299 if (!err) { 4300 mddev->recovery_cp = n; 4301 if (mddev->pers) 4302 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4303 } 4304 mddev_unlock(mddev); 4305 return err ?: len; 4306 } 4307 static struct md_sysfs_entry md_resync_start = 4308 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4309 resync_start_show, resync_start_store); 4310 4311 /* 4312 * The array state can be: 4313 * 4314 * clear 4315 * No devices, no size, no level 4316 * Equivalent to STOP_ARRAY ioctl 4317 * inactive 4318 * May have some settings, but array is not active 4319 * all IO results in error 4320 * When written, doesn't tear down 
array, but just stops it 4321 * suspended (not supported yet) 4322 * All IO requests will block. The array can be reconfigured. 4323 * Writing this, if accepted, will block until array is quiescent 4324 * readonly 4325 * no resync can happen. no superblocks get written. 4326 * write requests fail 4327 * read-auto 4328 * like readonly, but behaves like 'clean' on a write request. 4329 * 4330 * clean - no pending writes, but otherwise active. 4331 * When written to inactive array, starts without resync 4332 * If a write request arrives then 4333 * if metadata is known, mark 'dirty' and switch to 'active'. 4334 * if not known, block and switch to write-pending 4335 * If written to an active array that has pending writes, then fails. 4336 * active 4337 * fully active: IO and resync can be happening. 4338 * When written to inactive array, starts with resync 4339 * 4340 * write-pending 4341 * clean, but writes are blocked waiting for 'active' to be written. 4342 * 4343 * active-idle 4344 * like active, but no writes have been seen for a while (100msec). 4345 * 4346 * broken 4347 * RAID0/LINEAR-only: same as clean, but array is missing a member. 4348 * It's useful because RAID0/LINEAR mounted-arrays aren't stopped 4349 * when a member is gone, so this state will at least alert the 4350 * user that something is wrong. 
4351 */ 4352 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4353 write_pending, active_idle, broken, bad_word}; 4354 static char *array_states[] = { 4355 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4356 "write-pending", "active-idle", "broken", NULL }; 4357 4358 static int match_word(const char *word, char **list) 4359 { 4360 int n; 4361 for (n=0; list[n]; n++) 4362 if (cmd_match(word, list[n])) 4363 break; 4364 return n; 4365 } 4366 4367 static ssize_t 4368 array_state_show(struct mddev *mddev, char *page) 4369 { 4370 enum array_state st = inactive; 4371 4372 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4373 switch(mddev->ro) { 4374 case 1: 4375 st = readonly; 4376 break; 4377 case 2: 4378 st = read_auto; 4379 break; 4380 case 0: 4381 spin_lock(&mddev->lock); 4382 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4383 st = write_pending; 4384 else if (mddev->in_sync) 4385 st = clean; 4386 else if (mddev->safemode) 4387 st = active_idle; 4388 else 4389 st = active; 4390 spin_unlock(&mddev->lock); 4391 } 4392 4393 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4394 st = broken; 4395 } else { 4396 if (list_empty(&mddev->disks) && 4397 mddev->raid_disks == 0 && 4398 mddev->dev_sectors == 0) 4399 st = clear; 4400 else 4401 st = inactive; 4402 } 4403 return sprintf(page, "%s\n", array_states[st]); 4404 } 4405 4406 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4407 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4408 static int restart_array(struct mddev *mddev); 4409 4410 static ssize_t 4411 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4412 { 4413 int err = 0; 4414 enum array_state st = match_word(buf, array_states); 4415 4416 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 4417 /* don't take reconfig_mutex when toggling between 4418 * clean and active 4419 */ 4420 
spin_lock(&mddev->lock); 4421 if (st == active) { 4422 restart_array(mddev); 4423 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4424 md_wakeup_thread(mddev->thread); 4425 wake_up(&mddev->sb_wait); 4426 } else /* st == clean */ { 4427 restart_array(mddev); 4428 if (!set_in_sync(mddev)) 4429 err = -EBUSY; 4430 } 4431 if (!err) 4432 sysfs_notify_dirent_safe(mddev->sysfs_state); 4433 spin_unlock(&mddev->lock); 4434 return err ?: len; 4435 } 4436 err = mddev_lock(mddev); 4437 if (err) 4438 return err; 4439 err = -EINVAL; 4440 switch(st) { 4441 case bad_word: 4442 break; 4443 case clear: 4444 /* stopping an active array */ 4445 err = do_md_stop(mddev, 0, NULL); 4446 break; 4447 case inactive: 4448 /* stopping an active array */ 4449 if (mddev->pers) 4450 err = do_md_stop(mddev, 2, NULL); 4451 else 4452 err = 0; /* already inactive */ 4453 break; 4454 case suspended: 4455 break; /* not supported yet */ 4456 case readonly: 4457 if (mddev->pers) 4458 err = md_set_readonly(mddev, NULL); 4459 else { 4460 mddev->ro = 1; 4461 set_disk_ro(mddev->gendisk, 1); 4462 err = do_md_run(mddev); 4463 } 4464 break; 4465 case read_auto: 4466 if (mddev->pers) { 4467 if (mddev->ro == 0) 4468 err = md_set_readonly(mddev, NULL); 4469 else if (mddev->ro == 1) 4470 err = restart_array(mddev); 4471 if (err == 0) { 4472 mddev->ro = 2; 4473 set_disk_ro(mddev->gendisk, 0); 4474 } 4475 } else { 4476 mddev->ro = 2; 4477 err = do_md_run(mddev); 4478 } 4479 break; 4480 case clean: 4481 if (mddev->pers) { 4482 err = restart_array(mddev); 4483 if (err) 4484 break; 4485 spin_lock(&mddev->lock); 4486 if (!set_in_sync(mddev)) 4487 err = -EBUSY; 4488 spin_unlock(&mddev->lock); 4489 } else 4490 err = -EINVAL; 4491 break; 4492 case active: 4493 if (mddev->pers) { 4494 err = restart_array(mddev); 4495 if (err) 4496 break; 4497 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4498 wake_up(&mddev->sb_wait); 4499 err = 0; 4500 } else { 4501 mddev->ro = 0; 4502 set_disk_ro(mddev->gendisk, 0); 4503 err = 
do_md_run(mddev); 4504 } 4505 break; 4506 case write_pending: 4507 case active_idle: 4508 case broken: 4509 /* these cannot be set */ 4510 break; 4511 } 4512 4513 if (!err) { 4514 if (mddev->hold_active == UNTIL_IOCTL) 4515 mddev->hold_active = 0; 4516 sysfs_notify_dirent_safe(mddev->sysfs_state); 4517 } 4518 mddev_unlock(mddev); 4519 return err ?: len; 4520 } 4521 static struct md_sysfs_entry md_array_state = 4522 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4523 4524 static ssize_t 4525 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4526 return sprintf(page, "%d\n", 4527 atomic_read(&mddev->max_corr_read_errors)); 4528 } 4529 4530 static ssize_t 4531 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4532 { 4533 unsigned int n; 4534 int rv; 4535 4536 rv = kstrtouint(buf, 10, &n); 4537 if (rv < 0) 4538 return rv; 4539 atomic_set(&mddev->max_corr_read_errors, n); 4540 return len; 4541 } 4542 4543 static struct md_sysfs_entry max_corr_read_errors = 4544 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4545 max_corrected_read_errors_store); 4546 4547 static ssize_t 4548 null_show(struct mddev *mddev, char *page) 4549 { 4550 return -EINVAL; 4551 } 4552 4553 /* need to ensure rdev_delayed_delete() has completed */ 4554 static void flush_rdev_wq(struct mddev *mddev) 4555 { 4556 struct md_rdev *rdev; 4557 4558 rcu_read_lock(); 4559 rdev_for_each_rcu(rdev, mddev) 4560 if (work_pending(&rdev->del_work)) { 4561 flush_workqueue(md_rdev_misc_wq); 4562 break; 4563 } 4564 rcu_read_unlock(); 4565 } 4566 4567 static ssize_t 4568 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4569 { 4570 /* buf must be %d:%d\n? giving major and minor numbers */ 4571 /* The new device is added to the array. 4572 * If the array has a persistent superblock, we read the 4573 * superblock to initialise info and check validity. 
4574 * Otherwise, only checking done is that in bind_rdev_to_array, 4575 * which mainly checks size. 4576 */ 4577 char *e; 4578 int major = simple_strtoul(buf, &e, 10); 4579 int minor; 4580 dev_t dev; 4581 struct md_rdev *rdev; 4582 int err; 4583 4584 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4585 return -EINVAL; 4586 minor = simple_strtoul(e+1, &e, 10); 4587 if (*e && *e != '\n') 4588 return -EINVAL; 4589 dev = MKDEV(major, minor); 4590 if (major != MAJOR(dev) || 4591 minor != MINOR(dev)) 4592 return -EOVERFLOW; 4593 4594 flush_rdev_wq(mddev); 4595 err = mddev_lock(mddev); 4596 if (err) 4597 return err; 4598 if (mddev->persistent) { 4599 rdev = md_import_device(dev, mddev->major_version, 4600 mddev->minor_version); 4601 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4602 struct md_rdev *rdev0 4603 = list_entry(mddev->disks.next, 4604 struct md_rdev, same_set); 4605 err = super_types[mddev->major_version] 4606 .load_super(rdev, rdev0, mddev->minor_version); 4607 if (err < 0) 4608 goto out; 4609 } 4610 } else if (mddev->external) 4611 rdev = md_import_device(dev, -2, -1); 4612 else 4613 rdev = md_import_device(dev, -1, -1); 4614 4615 if (IS_ERR(rdev)) { 4616 mddev_unlock(mddev); 4617 return PTR_ERR(rdev); 4618 } 4619 err = bind_rdev_to_array(rdev, mddev); 4620 out: 4621 if (err) 4622 export_rdev(rdev); 4623 mddev_unlock(mddev); 4624 if (!err) 4625 md_new_event(mddev); 4626 return err ? err : len; 4627 } 4628 4629 static struct md_sysfs_entry md_new_device = 4630 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4631 4632 static ssize_t 4633 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4634 { 4635 char *end; 4636 unsigned long chunk, end_chunk; 4637 int err; 4638 4639 err = mddev_lock(mddev); 4640 if (err) 4641 return err; 4642 if (!mddev->bitmap) 4643 goto out; 4644 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 4645 while (*buf) { 4646 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4647 if (buf == end) break; 4648 if (*end == '-') { /* range */ 4649 buf = end + 1; 4650 end_chunk = simple_strtoul(buf, &end, 0); 4651 if (buf == end) break; 4652 } 4653 if (*end && !isspace(*end)) break; 4654 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4655 buf = skip_spaces(end); 4656 } 4657 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4658 out: 4659 mddev_unlock(mddev); 4660 return len; 4661 } 4662 4663 static struct md_sysfs_entry md_bitmap = 4664 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4665 4666 static ssize_t 4667 size_show(struct mddev *mddev, char *page) 4668 { 4669 return sprintf(page, "%llu\n", 4670 (unsigned long long)mddev->dev_sectors / 2); 4671 } 4672 4673 static int update_size(struct mddev *mddev, sector_t num_sectors); 4674 4675 static ssize_t 4676 size_store(struct mddev *mddev, const char *buf, size_t len) 4677 { 4678 /* If array is inactive, we can reduce the component size, but 4679 * not increase it (except from 0). 4680 * If array is active, we can try an on-line resize 4681 */ 4682 sector_t sectors; 4683 int err = strict_blocks_to_sectors(buf, §ors); 4684 4685 if (err < 0) 4686 return err; 4687 err = mddev_lock(mddev); 4688 if (err) 4689 return err; 4690 if (mddev->pers) { 4691 err = update_size(mddev, sectors); 4692 if (err == 0) 4693 md_update_sb(mddev, 1); 4694 } else { 4695 if (mddev->dev_sectors == 0 || 4696 mddev->dev_sectors > sectors) 4697 mddev->dev_sectors = sectors; 4698 else 4699 err = -ENOSPC; 4700 } 4701 mddev_unlock(mddev); 4702 return err ? err : len; 4703 } 4704 4705 static struct md_sysfs_entry md_size = 4706 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4707 4708 /* Metadata version. 4709 * This is one of 4710 * 'none' for arrays with no metadata (good luck...) 
4711 * 'external' for arrays with externally managed metadata, 4712 * or N.M for internally known formats 4713 */ 4714 static ssize_t 4715 metadata_show(struct mddev *mddev, char *page) 4716 { 4717 if (mddev->persistent) 4718 return sprintf(page, "%d.%d\n", 4719 mddev->major_version, mddev->minor_version); 4720 else if (mddev->external) 4721 return sprintf(page, "external:%s\n", mddev->metadata_type); 4722 else 4723 return sprintf(page, "none\n"); 4724 } 4725 4726 static ssize_t 4727 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4728 { 4729 int major, minor; 4730 char *e; 4731 int err; 4732 /* Changing the details of 'external' metadata is 4733 * always permitted. Otherwise there must be 4734 * no devices attached to the array. 4735 */ 4736 4737 err = mddev_lock(mddev); 4738 if (err) 4739 return err; 4740 err = -EBUSY; 4741 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4742 ; 4743 else if (!list_empty(&mddev->disks)) 4744 goto out_unlock; 4745 4746 err = 0; 4747 if (cmd_match(buf, "none")) { 4748 mddev->persistent = 0; 4749 mddev->external = 0; 4750 mddev->major_version = 0; 4751 mddev->minor_version = 90; 4752 goto out_unlock; 4753 } 4754 if (strncmp(buf, "external:", 9) == 0) { 4755 size_t namelen = len-9; 4756 if (namelen >= sizeof(mddev->metadata_type)) 4757 namelen = sizeof(mddev->metadata_type)-1; 4758 strncpy(mddev->metadata_type, buf+9, namelen); 4759 mddev->metadata_type[namelen] = 0; 4760 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4761 mddev->metadata_type[--namelen] = 0; 4762 mddev->persistent = 0; 4763 mddev->external = 1; 4764 mddev->major_version = 0; 4765 mddev->minor_version = 90; 4766 goto out_unlock; 4767 } 4768 major = simple_strtoul(buf, &e, 10); 4769 err = -EINVAL; 4770 if (e==buf || *e != '.') 4771 goto out_unlock; 4772 buf = e+1; 4773 minor = simple_strtoul(buf, &e, 10); 4774 if (e==buf || (*e && *e != '\n') ) 4775 goto out_unlock; 4776 err = -ENOENT; 4777 if (major >= ARRAY_SIZE(super_types) || 
super_types[major].name == NULL) 4778 goto out_unlock; 4779 mddev->major_version = major; 4780 mddev->minor_version = minor; 4781 mddev->persistent = 1; 4782 mddev->external = 0; 4783 err = 0; 4784 out_unlock: 4785 mddev_unlock(mddev); 4786 return err ?: len; 4787 } 4788 4789 static struct md_sysfs_entry md_metadata = 4790 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4791 4792 static ssize_t 4793 action_show(struct mddev *mddev, char *page) 4794 { 4795 char *type = "idle"; 4796 unsigned long recovery = mddev->recovery; 4797 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4798 type = "frozen"; 4799 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4800 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4801 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4802 type = "reshape"; 4803 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4804 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4805 type = "resync"; 4806 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4807 type = "check"; 4808 else 4809 type = "repair"; 4810 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4811 type = "recover"; 4812 else if (mddev->reshape_position != MaxSector) 4813 type = "reshape"; 4814 } 4815 return sprintf(page, "%s\n", type); 4816 } 4817 4818 static ssize_t 4819 action_store(struct mddev *mddev, const char *page, size_t len) 4820 { 4821 if (!mddev->pers || !mddev->pers->sync_request) 4822 return -EINVAL; 4823 4824 4825 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4826 if (cmd_match(page, "frozen")) 4827 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4828 else 4829 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4830 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4831 mddev_lock(mddev) == 0) { 4832 if (work_pending(&mddev->del_work)) 4833 flush_workqueue(md_misc_wq); 4834 if (mddev->sync_thread) { 4835 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4836 md_reap_sync_thread(mddev); 4837 } 4838 mddev_unlock(mddev); 4839 
} 4840 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4841 return -EBUSY; 4842 else if (cmd_match(page, "resync")) 4843 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4844 else if (cmd_match(page, "recover")) { 4845 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4846 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4847 } else if (cmd_match(page, "reshape")) { 4848 int err; 4849 if (mddev->pers->start_reshape == NULL) 4850 return -EINVAL; 4851 err = mddev_lock(mddev); 4852 if (!err) { 4853 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4854 err = -EBUSY; 4855 else { 4856 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4857 err = mddev->pers->start_reshape(mddev); 4858 } 4859 mddev_unlock(mddev); 4860 } 4861 if (err) 4862 return err; 4863 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4864 } else { 4865 if (cmd_match(page, "check")) 4866 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4867 else if (!cmd_match(page, "repair")) 4868 return -EINVAL; 4869 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4870 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4871 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4872 } 4873 if (mddev->ro == 2) { 4874 /* A write to sync_action is enough to justify 4875 * canceling read-auto mode 4876 */ 4877 mddev->ro = 0; 4878 md_wakeup_thread(mddev->sync_thread); 4879 } 4880 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4881 md_wakeup_thread(mddev->thread); 4882 sysfs_notify_dirent_safe(mddev->sysfs_action); 4883 return len; 4884 } 4885 4886 static struct md_sysfs_entry md_scan_mode = 4887 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4888 4889 static ssize_t 4890 last_sync_action_show(struct mddev *mddev, char *page) 4891 { 4892 return sprintf(page, "%s\n", mddev->last_sync_action); 4893 } 4894 4895 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4896 4897 static ssize_t 4898 mismatch_cnt_show(struct mddev *mddev, char *page) 4899 { 4900 return sprintf(page, 
"%llu\n", 4901 (unsigned long long) 4902 atomic64_read(&mddev->resync_mismatches)); 4903 } 4904 4905 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4906 4907 static ssize_t 4908 sync_min_show(struct mddev *mddev, char *page) 4909 { 4910 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4911 mddev->sync_speed_min ? "local": "system"); 4912 } 4913 4914 static ssize_t 4915 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4916 { 4917 unsigned int min; 4918 int rv; 4919 4920 if (strncmp(buf, "system", 6)==0) { 4921 min = 0; 4922 } else { 4923 rv = kstrtouint(buf, 10, &min); 4924 if (rv < 0) 4925 return rv; 4926 if (min == 0) 4927 return -EINVAL; 4928 } 4929 mddev->sync_speed_min = min; 4930 return len; 4931 } 4932 4933 static struct md_sysfs_entry md_sync_min = 4934 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4935 4936 static ssize_t 4937 sync_max_show(struct mddev *mddev, char *page) 4938 { 4939 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4940 mddev->sync_speed_max ? 
"local": "system"); 4941 } 4942 4943 static ssize_t 4944 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4945 { 4946 unsigned int max; 4947 int rv; 4948 4949 if (strncmp(buf, "system", 6)==0) { 4950 max = 0; 4951 } else { 4952 rv = kstrtouint(buf, 10, &max); 4953 if (rv < 0) 4954 return rv; 4955 if (max == 0) 4956 return -EINVAL; 4957 } 4958 mddev->sync_speed_max = max; 4959 return len; 4960 } 4961 4962 static struct md_sysfs_entry md_sync_max = 4963 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4964 4965 static ssize_t 4966 degraded_show(struct mddev *mddev, char *page) 4967 { 4968 return sprintf(page, "%d\n", mddev->degraded); 4969 } 4970 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4971 4972 static ssize_t 4973 sync_force_parallel_show(struct mddev *mddev, char *page) 4974 { 4975 return sprintf(page, "%d\n", mddev->parallel_resync); 4976 } 4977 4978 static ssize_t 4979 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4980 { 4981 long n; 4982 4983 if (kstrtol(buf, 10, &n)) 4984 return -EINVAL; 4985 4986 if (n != 0 && n != 1) 4987 return -EINVAL; 4988 4989 mddev->parallel_resync = n; 4990 4991 if (mddev->sync_thread) 4992 wake_up(&resync_wait); 4993 4994 return len; 4995 } 4996 4997 /* force parallel resync, even with shared block devices */ 4998 static struct md_sysfs_entry md_sync_force_parallel = 4999 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5000 sync_force_parallel_show, sync_force_parallel_store); 5001 5002 static ssize_t 5003 sync_speed_show(struct mddev *mddev, char *page) 5004 { 5005 unsigned long resync, dt, db; 5006 if (mddev->curr_resync == 0) 5007 return sprintf(page, "none\n"); 5008 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5009 dt = (jiffies - mddev->resync_mark) / HZ; 5010 if (!dt) dt++; 5011 db = resync - mddev->resync_mark_cnt; 5012 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5013 } 5014 5015 static struct md_sysfs_entry 
md_sync_speed = __ATTR_RO(sync_speed); 5016 5017 static ssize_t 5018 sync_completed_show(struct mddev *mddev, char *page) 5019 { 5020 unsigned long long max_sectors, resync; 5021 5022 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5023 return sprintf(page, "none\n"); 5024 5025 if (mddev->curr_resync == 1 || 5026 mddev->curr_resync == 2) 5027 return sprintf(page, "delayed\n"); 5028 5029 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5030 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5031 max_sectors = mddev->resync_max_sectors; 5032 else 5033 max_sectors = mddev->dev_sectors; 5034 5035 resync = mddev->curr_resync_completed; 5036 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5037 } 5038 5039 static struct md_sysfs_entry md_sync_completed = 5040 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5041 5042 static ssize_t 5043 min_sync_show(struct mddev *mddev, char *page) 5044 { 5045 return sprintf(page, "%llu\n", 5046 (unsigned long long)mddev->resync_min); 5047 } 5048 static ssize_t 5049 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5050 { 5051 unsigned long long min; 5052 int err; 5053 5054 if (kstrtoull(buf, 10, &min)) 5055 return -EINVAL; 5056 5057 spin_lock(&mddev->lock); 5058 err = -EINVAL; 5059 if (min > mddev->resync_max) 5060 goto out_unlock; 5061 5062 err = -EBUSY; 5063 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5064 goto out_unlock; 5065 5066 /* Round down to multiple of 4K for safety */ 5067 mddev->resync_min = round_down(min, 8); 5068 err = 0; 5069 5070 out_unlock: 5071 spin_unlock(&mddev->lock); 5072 return err ?: len; 5073 } 5074 5075 static struct md_sysfs_entry md_min_sync = 5076 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5077 5078 static ssize_t 5079 max_sync_show(struct mddev *mddev, char *page) 5080 { 5081 if (mddev->resync_max == MaxSector) 5082 return sprintf(page, "max\n"); 5083 else 5084 return sprintf(page, "%llu\n", 5085 (unsigned long 
long)mddev->resync_max); 5086 } 5087 static ssize_t 5088 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5089 { 5090 int err; 5091 spin_lock(&mddev->lock); 5092 if (strncmp(buf, "max", 3) == 0) 5093 mddev->resync_max = MaxSector; 5094 else { 5095 unsigned long long max; 5096 int chunk; 5097 5098 err = -EINVAL; 5099 if (kstrtoull(buf, 10, &max)) 5100 goto out_unlock; 5101 if (max < mddev->resync_min) 5102 goto out_unlock; 5103 5104 err = -EBUSY; 5105 if (max < mddev->resync_max && 5106 mddev->ro == 0 && 5107 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5108 goto out_unlock; 5109 5110 /* Must be a multiple of chunk_size */ 5111 chunk = mddev->chunk_sectors; 5112 if (chunk) { 5113 sector_t temp = max; 5114 5115 err = -EINVAL; 5116 if (sector_div(temp, chunk)) 5117 goto out_unlock; 5118 } 5119 mddev->resync_max = max; 5120 } 5121 wake_up(&mddev->recovery_wait); 5122 err = 0; 5123 out_unlock: 5124 spin_unlock(&mddev->lock); 5125 return err ?: len; 5126 } 5127 5128 static struct md_sysfs_entry md_max_sync = 5129 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5130 5131 static ssize_t 5132 suspend_lo_show(struct mddev *mddev, char *page) 5133 { 5134 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 5135 } 5136 5137 static ssize_t 5138 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5139 { 5140 unsigned long long new; 5141 int err; 5142 5143 err = kstrtoull(buf, 10, &new); 5144 if (err < 0) 5145 return err; 5146 if (new != (sector_t)new) 5147 return -EINVAL; 5148 5149 err = mddev_lock(mddev); 5150 if (err) 5151 return err; 5152 err = -EINVAL; 5153 if (mddev->pers == NULL || 5154 mddev->pers->quiesce == NULL) 5155 goto unlock; 5156 mddev_suspend(mddev); 5157 mddev->suspend_lo = new; 5158 mddev_resume(mddev); 5159 5160 err = 0; 5161 unlock: 5162 mddev_unlock(mddev); 5163 return err ?: len; 5164 } 5165 static struct md_sysfs_entry md_suspend_lo = 5166 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, 
suspend_lo_show, suspend_lo_store); 5167 5168 static ssize_t 5169 suspend_hi_show(struct mddev *mddev, char *page) 5170 { 5171 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 5172 } 5173 5174 static ssize_t 5175 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5176 { 5177 unsigned long long new; 5178 int err; 5179 5180 err = kstrtoull(buf, 10, &new); 5181 if (err < 0) 5182 return err; 5183 if (new != (sector_t)new) 5184 return -EINVAL; 5185 5186 err = mddev_lock(mddev); 5187 if (err) 5188 return err; 5189 err = -EINVAL; 5190 if (mddev->pers == NULL) 5191 goto unlock; 5192 5193 mddev_suspend(mddev); 5194 mddev->suspend_hi = new; 5195 mddev_resume(mddev); 5196 5197 err = 0; 5198 unlock: 5199 mddev_unlock(mddev); 5200 return err ?: len; 5201 } 5202 static struct md_sysfs_entry md_suspend_hi = 5203 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5204 5205 static ssize_t 5206 reshape_position_show(struct mddev *mddev, char *page) 5207 { 5208 if (mddev->reshape_position != MaxSector) 5209 return sprintf(page, "%llu\n", 5210 (unsigned long long)mddev->reshape_position); 5211 strcpy(page, "none\n"); 5212 return 5; 5213 } 5214 5215 static ssize_t 5216 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5217 { 5218 struct md_rdev *rdev; 5219 unsigned long long new; 5220 int err; 5221 5222 err = kstrtoull(buf, 10, &new); 5223 if (err < 0) 5224 return err; 5225 if (new != (sector_t)new) 5226 return -EINVAL; 5227 err = mddev_lock(mddev); 5228 if (err) 5229 return err; 5230 err = -EBUSY; 5231 if (mddev->pers) 5232 goto unlock; 5233 mddev->reshape_position = new; 5234 mddev->delta_disks = 0; 5235 mddev->reshape_backwards = 0; 5236 mddev->new_level = mddev->level; 5237 mddev->new_layout = mddev->layout; 5238 mddev->new_chunk_sectors = mddev->chunk_sectors; 5239 rdev_for_each(rdev, mddev) 5240 rdev->new_data_offset = rdev->data_offset; 5241 err = 0; 5242 unlock: 5243 mddev_unlock(mddev); 5244 
return err ?: len; 5245 } 5246 5247 static struct md_sysfs_entry md_reshape_position = 5248 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5249 reshape_position_store); 5250 5251 static ssize_t 5252 reshape_direction_show(struct mddev *mddev, char *page) 5253 { 5254 return sprintf(page, "%s\n", 5255 mddev->reshape_backwards ? "backwards" : "forwards"); 5256 } 5257 5258 static ssize_t 5259 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5260 { 5261 int backwards = 0; 5262 int err; 5263 5264 if (cmd_match(buf, "forwards")) 5265 backwards = 0; 5266 else if (cmd_match(buf, "backwards")) 5267 backwards = 1; 5268 else 5269 return -EINVAL; 5270 if (mddev->reshape_backwards == backwards) 5271 return len; 5272 5273 err = mddev_lock(mddev); 5274 if (err) 5275 return err; 5276 /* check if we are allowed to change */ 5277 if (mddev->delta_disks) 5278 err = -EBUSY; 5279 else if (mddev->persistent && 5280 mddev->major_version == 0) 5281 err = -EINVAL; 5282 else 5283 mddev->reshape_backwards = backwards; 5284 mddev_unlock(mddev); 5285 return err ?: len; 5286 } 5287 5288 static struct md_sysfs_entry md_reshape_direction = 5289 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5290 reshape_direction_store); 5291 5292 static ssize_t 5293 array_size_show(struct mddev *mddev, char *page) 5294 { 5295 if (mddev->external_size) 5296 return sprintf(page, "%llu\n", 5297 (unsigned long long)mddev->array_sectors/2); 5298 else 5299 return sprintf(page, "default\n"); 5300 } 5301 5302 static ssize_t 5303 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5304 { 5305 sector_t sectors; 5306 int err; 5307 5308 err = mddev_lock(mddev); 5309 if (err) 5310 return err; 5311 5312 /* cluster raid doesn't support change array_sectors */ 5313 if (mddev_is_clustered(mddev)) { 5314 mddev_unlock(mddev); 5315 return -EINVAL; 5316 } 5317 5318 if (strncmp(buf, "default", 7) == 0) { 5319 if (mddev->pers) 5320 sectors = 
mddev->pers->size(mddev, 0, 0); 5321 else 5322 sectors = mddev->array_sectors; 5323 5324 mddev->external_size = 0; 5325 } else { 5326 if (strict_blocks_to_sectors(buf, §ors) < 0) 5327 err = -EINVAL; 5328 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5329 err = -E2BIG; 5330 else 5331 mddev->external_size = 1; 5332 } 5333 5334 if (!err) { 5335 mddev->array_sectors = sectors; 5336 if (mddev->pers) 5337 set_capacity_and_notify(mddev->gendisk, 5338 mddev->array_sectors); 5339 } 5340 mddev_unlock(mddev); 5341 return err ?: len; 5342 } 5343 5344 static struct md_sysfs_entry md_array_size = 5345 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5346 array_size_store); 5347 5348 static ssize_t 5349 consistency_policy_show(struct mddev *mddev, char *page) 5350 { 5351 int ret; 5352 5353 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5354 ret = sprintf(page, "journal\n"); 5355 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5356 ret = sprintf(page, "ppl\n"); 5357 } else if (mddev->bitmap) { 5358 ret = sprintf(page, "bitmap\n"); 5359 } else if (mddev->pers) { 5360 if (mddev->pers->sync_request) 5361 ret = sprintf(page, "resync\n"); 5362 else 5363 ret = sprintf(page, "none\n"); 5364 } else { 5365 ret = sprintf(page, "unknown\n"); 5366 } 5367 5368 return ret; 5369 } 5370 5371 static ssize_t 5372 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5373 { 5374 int err = 0; 5375 5376 if (mddev->pers) { 5377 if (mddev->pers->change_consistency_policy) 5378 err = mddev->pers->change_consistency_policy(mddev, buf); 5379 else 5380 err = -EBUSY; 5381 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5382 set_bit(MD_HAS_PPL, &mddev->flags); 5383 } else { 5384 err = -EINVAL; 5385 } 5386 5387 return err ? 
err : len; 5388 } 5389 5390 static struct md_sysfs_entry md_consistency_policy = 5391 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5392 consistency_policy_store); 5393 5394 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5395 { 5396 return sprintf(page, "%d\n", mddev->fail_last_dev); 5397 } 5398 5399 /* 5400 * Setting fail_last_dev to true to allow last device to be forcibly removed 5401 * from RAID1/RAID10. 5402 */ 5403 static ssize_t 5404 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5405 { 5406 int ret; 5407 bool value; 5408 5409 ret = kstrtobool(buf, &value); 5410 if (ret) 5411 return ret; 5412 5413 if (value != mddev->fail_last_dev) 5414 mddev->fail_last_dev = value; 5415 5416 return len; 5417 } 5418 static struct md_sysfs_entry md_fail_last_dev = 5419 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5420 fail_last_dev_store); 5421 5422 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5423 { 5424 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5425 return sprintf(page, "n/a\n"); 5426 else 5427 return sprintf(page, "%d\n", mddev->serialize_policy); 5428 } 5429 5430 /* 5431 * Setting serialize_policy to true to enforce write IO is not reordered 5432 * for raid1. 
5433 */ 5434 static ssize_t 5435 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5436 { 5437 int err; 5438 bool value; 5439 5440 err = kstrtobool(buf, &value); 5441 if (err) 5442 return err; 5443 5444 if (value == mddev->serialize_policy) 5445 return len; 5446 5447 err = mddev_lock(mddev); 5448 if (err) 5449 return err; 5450 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5451 pr_err("md: serialize_policy is only effective for raid1\n"); 5452 err = -EINVAL; 5453 goto unlock; 5454 } 5455 5456 mddev_suspend(mddev); 5457 if (value) 5458 mddev_create_serial_pool(mddev, NULL, true); 5459 else 5460 mddev_destroy_serial_pool(mddev, NULL, true); 5461 mddev->serialize_policy = value; 5462 mddev_resume(mddev); 5463 unlock: 5464 mddev_unlock(mddev); 5465 return err ?: len; 5466 } 5467 5468 static struct md_sysfs_entry md_serialize_policy = 5469 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5470 serialize_policy_store); 5471 5472 5473 static struct attribute *md_default_attrs[] = { 5474 &md_level.attr, 5475 &md_layout.attr, 5476 &md_raid_disks.attr, 5477 &md_uuid.attr, 5478 &md_chunk_size.attr, 5479 &md_size.attr, 5480 &md_resync_start.attr, 5481 &md_metadata.attr, 5482 &md_new_device.attr, 5483 &md_safe_delay.attr, 5484 &md_array_state.attr, 5485 &md_reshape_position.attr, 5486 &md_reshape_direction.attr, 5487 &md_array_size.attr, 5488 &max_corr_read_errors.attr, 5489 &md_consistency_policy.attr, 5490 &md_fail_last_dev.attr, 5491 &md_serialize_policy.attr, 5492 NULL, 5493 }; 5494 5495 static struct attribute *md_redundancy_attrs[] = { 5496 &md_scan_mode.attr, 5497 &md_last_scan_mode.attr, 5498 &md_mismatches.attr, 5499 &md_sync_min.attr, 5500 &md_sync_max.attr, 5501 &md_sync_speed.attr, 5502 &md_sync_force_parallel.attr, 5503 &md_sync_completed.attr, 5504 &md_min_sync.attr, 5505 &md_max_sync.attr, 5506 &md_suspend_lo.attr, 5507 &md_suspend_hi.attr, 5508 &md_bitmap.attr, 5509 &md_degraded.attr, 5510 NULL, 5511 }; 5512 
static const struct attribute_group md_redundancy_group = { 5513 .name = NULL, 5514 .attrs = md_redundancy_attrs, 5515 }; 5516 5517 static ssize_t 5518 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5519 { 5520 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5521 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5522 ssize_t rv; 5523 5524 if (!entry->show) 5525 return -EIO; 5526 spin_lock(&all_mddevs_lock); 5527 if (list_empty(&mddev->all_mddevs)) { 5528 spin_unlock(&all_mddevs_lock); 5529 return -EBUSY; 5530 } 5531 mddev_get(mddev); 5532 spin_unlock(&all_mddevs_lock); 5533 5534 rv = entry->show(mddev, page); 5535 mddev_put(mddev); 5536 return rv; 5537 } 5538 5539 static ssize_t 5540 md_attr_store(struct kobject *kobj, struct attribute *attr, 5541 const char *page, size_t length) 5542 { 5543 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5544 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5545 ssize_t rv; 5546 5547 if (!entry->store) 5548 return -EIO; 5549 if (!capable(CAP_SYS_ADMIN)) 5550 return -EACCES; 5551 spin_lock(&all_mddevs_lock); 5552 if (list_empty(&mddev->all_mddevs)) { 5553 spin_unlock(&all_mddevs_lock); 5554 return -EBUSY; 5555 } 5556 mddev_get(mddev); 5557 spin_unlock(&all_mddevs_lock); 5558 rv = entry->store(mddev, page, length); 5559 mddev_put(mddev); 5560 return rv; 5561 } 5562 5563 static void md_free(struct kobject *ko) 5564 { 5565 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5566 5567 if (mddev->sysfs_state) 5568 sysfs_put(mddev->sysfs_state); 5569 if (mddev->sysfs_level) 5570 sysfs_put(mddev->sysfs_level); 5571 5572 if (mddev->gendisk) { 5573 del_gendisk(mddev->gendisk); 5574 blk_cleanup_disk(mddev->gendisk); 5575 } 5576 percpu_ref_exit(&mddev->writes_pending); 5577 5578 bioset_exit(&mddev->bio_set); 5579 bioset_exit(&mddev->sync_set); 5580 if (mddev->level != 1 && mddev->level != 10) 5581 
bioset_exit(&mddev->io_acct_set); 5582 kfree(mddev); 5583 } 5584 5585 static const struct sysfs_ops md_sysfs_ops = { 5586 .show = md_attr_show, 5587 .store = md_attr_store, 5588 }; 5589 static struct kobj_type md_ktype = { 5590 .release = md_free, 5591 .sysfs_ops = &md_sysfs_ops, 5592 .default_attrs = md_default_attrs, 5593 }; 5594 5595 int mdp_major = 0; 5596 5597 static void mddev_delayed_delete(struct work_struct *ws) 5598 { 5599 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5600 5601 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 5602 kobject_del(&mddev->kobj); 5603 kobject_put(&mddev->kobj); 5604 } 5605 5606 static void no_op(struct percpu_ref *r) {} 5607 5608 int mddev_init_writes_pending(struct mddev *mddev) 5609 { 5610 if (mddev->writes_pending.percpu_count_ptr) 5611 return 0; 5612 if (percpu_ref_init(&mddev->writes_pending, no_op, 5613 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) 5614 return -ENOMEM; 5615 /* We want to start with the refcount at zero */ 5616 percpu_ref_put(&mddev->writes_pending); 5617 return 0; 5618 } 5619 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5620 5621 static int md_alloc(dev_t dev, char *name) 5622 { 5623 /* 5624 * If dev is zero, name is the name of a device to allocate with 5625 * an arbitrary minor number. It will be "md_???" 5626 * If dev is non-zero it must be a device number with a MAJOR of 5627 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5628 * the device is being created by opening a node in /dev. 5629 * If "name" is not NULL, the device is being created by 5630 * writing to /sys/module/md_mod/parameters/new_array. 5631 */ 5632 static DEFINE_MUTEX(disks_mutex); 5633 struct mddev *mddev; 5634 struct gendisk *disk; 5635 int partitioned; 5636 int shift; 5637 int unit; 5638 int error ; 5639 5640 /* 5641 * Wait for any previous instance of this device to be completely 5642 * removed (mddev_delayed_delete). 
5643 */ 5644 flush_workqueue(md_misc_wq); 5645 5646 mutex_lock(&disks_mutex); 5647 mddev = mddev_alloc(dev); 5648 if (IS_ERR(mddev)) { 5649 mutex_unlock(&disks_mutex); 5650 return PTR_ERR(mddev); 5651 } 5652 5653 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5654 shift = partitioned ? MdpMinorShift : 0; 5655 unit = MINOR(mddev->unit) >> shift; 5656 5657 if (name && !dev) { 5658 /* Need to ensure that 'name' is not a duplicate. 5659 */ 5660 struct mddev *mddev2; 5661 spin_lock(&all_mddevs_lock); 5662 5663 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5664 if (mddev2->gendisk && 5665 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5666 spin_unlock(&all_mddevs_lock); 5667 error = -EEXIST; 5668 goto abort; 5669 } 5670 spin_unlock(&all_mddevs_lock); 5671 } 5672 if (name && dev) 5673 /* 5674 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5675 */ 5676 mddev->hold_active = UNTIL_STOP; 5677 5678 error = -ENOMEM; 5679 disk = blk_alloc_disk(NUMA_NO_NODE); 5680 if (!disk) 5681 goto abort; 5682 5683 disk->major = MAJOR(mddev->unit); 5684 disk->first_minor = unit << shift; 5685 disk->minors = 1 << shift; 5686 if (name) 5687 strcpy(disk->disk_name, name); 5688 else if (partitioned) 5689 sprintf(disk->disk_name, "md_d%d", unit); 5690 else 5691 sprintf(disk->disk_name, "md%d", unit); 5692 disk->fops = &md_fops; 5693 disk->private_data = mddev; 5694 5695 mddev->queue = disk->queue; 5696 blk_set_stacking_limits(&mddev->queue->limits); 5697 blk_queue_write_cache(mddev->queue, true, true); 5698 /* Allow extended partitions. This makes the 5699 * 'mdp' device redundant, but we can't really 5700 * remove it now. 
5701 */ 5702 disk->flags |= GENHD_FL_EXT_DEVT; 5703 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5704 mddev->gendisk = disk; 5705 add_disk(disk); 5706 5707 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5708 if (error) { 5709 /* This isn't possible, but as kobject_init_and_add is marked 5710 * __must_check, we must do something with the result 5711 */ 5712 pr_debug("md: cannot register %s/md - name in use\n", 5713 disk->disk_name); 5714 error = 0; 5715 } 5716 if (mddev->kobj.sd && 5717 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5718 pr_debug("pointless warning\n"); 5719 abort: 5720 mutex_unlock(&disks_mutex); 5721 if (!error && mddev->kobj.sd) { 5722 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5723 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5724 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5725 } 5726 mddev_put(mddev); 5727 return error; 5728 } 5729 5730 static void md_probe(dev_t dev) 5731 { 5732 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5733 return; 5734 if (create_on_open) 5735 md_alloc(dev, NULL); 5736 } 5737 5738 static int add_named_array(const char *val, const struct kernel_param *kp) 5739 { 5740 /* 5741 * val must be "md_*" or "mdNNN". 5742 * For "md_*" we allocate an array with a large free minor number, and 5743 * set the name to val. val must not already be an active name. 5744 * For "mdNNN" we allocate an array with the minor number NNN 5745 * which must not already be in use. 
5746 */ 5747 int len = strlen(val); 5748 char buf[DISK_NAME_LEN]; 5749 unsigned long devnum; 5750 5751 while (len && val[len-1] == '\n') 5752 len--; 5753 if (len >= DISK_NAME_LEN) 5754 return -E2BIG; 5755 strlcpy(buf, val, len+1); 5756 if (strncmp(buf, "md_", 3) == 0) 5757 return md_alloc(0, buf); 5758 if (strncmp(buf, "md", 2) == 0 && 5759 isdigit(buf[2]) && 5760 kstrtoul(buf+2, 10, &devnum) == 0 && 5761 devnum <= MINORMASK) 5762 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); 5763 5764 return -EINVAL; 5765 } 5766 5767 static void md_safemode_timeout(struct timer_list *t) 5768 { 5769 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5770 5771 mddev->safemode = 1; 5772 if (mddev->external) 5773 sysfs_notify_dirent_safe(mddev->sysfs_state); 5774 5775 md_wakeup_thread(mddev->thread); 5776 } 5777 5778 static int start_dirty_degraded; 5779 5780 int md_run(struct mddev *mddev) 5781 { 5782 int err; 5783 struct md_rdev *rdev; 5784 struct md_personality *pers; 5785 5786 if (list_empty(&mddev->disks)) 5787 /* cannot run an array with no devices.. */ 5788 return -EINVAL; 5789 5790 if (mddev->pers) 5791 return -EBUSY; 5792 /* Cannot run until previous stop completes properly */ 5793 if (mddev->sysfs_active) 5794 return -EBUSY; 5795 5796 /* 5797 * Analyze all RAID superblock(s) 5798 */ 5799 if (!mddev->raid_disks) { 5800 if (!mddev->persistent) 5801 return -EINVAL; 5802 err = analyze_sbs(mddev); 5803 if (err) 5804 return -EINVAL; 5805 } 5806 5807 if (mddev->level != LEVEL_NONE) 5808 request_module("md-level-%d", mddev->level); 5809 else if (mddev->clevel[0]) 5810 request_module("md-%s", mddev->clevel); 5811 5812 /* 5813 * Drop all container device buffers, from now on 5814 * the only valid external interface is through the md 5815 * device. 
5816 */ 5817 mddev->has_superblocks = false; 5818 rdev_for_each(rdev, mddev) { 5819 if (test_bit(Faulty, &rdev->flags)) 5820 continue; 5821 sync_blockdev(rdev->bdev); 5822 invalidate_bdev(rdev->bdev); 5823 if (mddev->ro != 1 && rdev_read_only(rdev)) { 5824 mddev->ro = 1; 5825 if (mddev->gendisk) 5826 set_disk_ro(mddev->gendisk, 1); 5827 } 5828 5829 if (rdev->sb_page) 5830 mddev->has_superblocks = true; 5831 5832 /* perform some consistency tests on the device. 5833 * We don't want the data to overlap the metadata, 5834 * Internal Bitmap issues have been handled elsewhere. 5835 */ 5836 if (rdev->meta_bdev) { 5837 /* Nothing to check */; 5838 } else if (rdev->data_offset < rdev->sb_start) { 5839 if (mddev->dev_sectors && 5840 rdev->data_offset + mddev->dev_sectors 5841 > rdev->sb_start) { 5842 pr_warn("md: %s: data overlaps metadata\n", 5843 mdname(mddev)); 5844 return -EINVAL; 5845 } 5846 } else { 5847 if (rdev->sb_start + rdev->sb_size/512 5848 > rdev->data_offset) { 5849 pr_warn("md: %s: metadata overlaps data\n", 5850 mdname(mddev)); 5851 return -EINVAL; 5852 } 5853 } 5854 sysfs_notify_dirent_safe(rdev->sysfs_state); 5855 } 5856 5857 if (!bioset_initialized(&mddev->bio_set)) { 5858 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5859 if (err) 5860 return err; 5861 } 5862 if (!bioset_initialized(&mddev->sync_set)) { 5863 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5864 if (err) 5865 goto exit_bio_set; 5866 } 5867 if (mddev->level != 1 && mddev->level != 10 && 5868 !bioset_initialized(&mddev->io_acct_set)) { 5869 err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, 5870 offsetof(struct md_io_acct, bio_clone), 0); 5871 if (err) 5872 goto exit_sync_set; 5873 } 5874 5875 spin_lock(&pers_lock); 5876 pers = find_pers(mddev->level, mddev->clevel); 5877 if (!pers || !try_module_get(pers->owner)) { 5878 spin_unlock(&pers_lock); 5879 if (mddev->level != LEVEL_NONE) 5880 pr_warn("md: personality for level %d is not 
loaded!\n", 5881 mddev->level); 5882 else 5883 pr_warn("md: personality for level %s is not loaded!\n", 5884 mddev->clevel); 5885 err = -EINVAL; 5886 goto abort; 5887 } 5888 spin_unlock(&pers_lock); 5889 if (mddev->level != pers->level) { 5890 mddev->level = pers->level; 5891 mddev->new_level = pers->level; 5892 } 5893 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5894 5895 if (mddev->reshape_position != MaxSector && 5896 pers->start_reshape == NULL) { 5897 /* This personality cannot handle reshaping... */ 5898 module_put(pers->owner); 5899 err = -EINVAL; 5900 goto abort; 5901 } 5902 5903 if (pers->sync_request) { 5904 /* Warn if this is a potentially silly 5905 * configuration. 5906 */ 5907 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5908 struct md_rdev *rdev2; 5909 int warned = 0; 5910 5911 rdev_for_each(rdev, mddev) 5912 rdev_for_each(rdev2, mddev) { 5913 if (rdev < rdev2 && 5914 rdev->bdev->bd_disk == 5915 rdev2->bdev->bd_disk) { 5916 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", 5917 mdname(mddev), 5918 bdevname(rdev->bdev,b), 5919 bdevname(rdev2->bdev,b2)); 5920 warned = 1; 5921 } 5922 } 5923 5924 if (warned) 5925 pr_warn("True protection against single-disk failure might be compromised.\n"); 5926 } 5927 5928 mddev->recovery = 0; 5929 /* may be over-ridden by personality */ 5930 mddev->resync_max_sectors = mddev->dev_sectors; 5931 5932 mddev->ok_start_degraded = start_dirty_degraded; 5933 5934 if (start_readonly && mddev->ro == 0) 5935 mddev->ro = 2; /* read-only, but switch on first write */ 5936 5937 err = pers->run(mddev); 5938 if (err) 5939 pr_warn("md: pers->run() failed ...\n"); 5940 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5941 WARN_ONCE(!mddev->external_size, 5942 "%s: default size too small, but 'external_size' not in effect?\n", 5943 __func__); 5944 pr_warn("md: invalid array_size %llu > default size %llu\n", 5945 (unsigned long long)mddev->array_sectors / 2, 5946 (unsigned long 
long)pers->size(mddev, 0, 0) / 2); 5947 err = -EINVAL; 5948 } 5949 if (err == 0 && pers->sync_request && 5950 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5951 struct bitmap *bitmap; 5952 5953 bitmap = md_bitmap_create(mddev, -1); 5954 if (IS_ERR(bitmap)) { 5955 err = PTR_ERR(bitmap); 5956 pr_warn("%s: failed to create bitmap (%d)\n", 5957 mdname(mddev), err); 5958 } else 5959 mddev->bitmap = bitmap; 5960 5961 } 5962 if (err) 5963 goto bitmap_abort; 5964 5965 if (mddev->bitmap_info.max_write_behind > 0) { 5966 bool create_pool = false; 5967 5968 rdev_for_each(rdev, mddev) { 5969 if (test_bit(WriteMostly, &rdev->flags) && 5970 rdev_init_serial(rdev)) 5971 create_pool = true; 5972 } 5973 if (create_pool && mddev->serial_info_pool == NULL) { 5974 mddev->serial_info_pool = 5975 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 5976 sizeof(struct serial_info)); 5977 if (!mddev->serial_info_pool) { 5978 err = -ENOMEM; 5979 goto bitmap_abort; 5980 } 5981 } 5982 } 5983 5984 if (mddev->queue) { 5985 bool nonrot = true; 5986 5987 rdev_for_each(rdev, mddev) { 5988 if (rdev->raid_disk >= 0 && 5989 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 5990 nonrot = false; 5991 break; 5992 } 5993 } 5994 if (mddev->degraded) 5995 nonrot = false; 5996 if (nonrot) 5997 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 5998 else 5999 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6000 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6001 } 6002 if (pers->sync_request) { 6003 if (mddev->kobj.sd && 6004 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6005 pr_warn("md: cannot register extra attributes for %s\n", 6006 mdname(mddev)); 6007 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6008 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6009 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6010 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 6011 mddev->ro = 
0; 6012 6013 atomic_set(&mddev->max_corr_read_errors, 6014 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6015 mddev->safemode = 0; 6016 if (mddev_is_clustered(mddev)) 6017 mddev->safemode_delay = 0; 6018 else 6019 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6020 mddev->in_sync = 1; 6021 smp_wmb(); 6022 spin_lock(&mddev->lock); 6023 mddev->pers = pers; 6024 spin_unlock(&mddev->lock); 6025 rdev_for_each(rdev, mddev) 6026 if (rdev->raid_disk >= 0) 6027 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6028 6029 if (mddev->degraded && !mddev->ro) 6030 /* This ensures that recovering status is reported immediately 6031 * via sysfs - until a lack of spares is confirmed. 6032 */ 6033 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6034 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6035 6036 if (mddev->sb_flags) 6037 md_update_sb(mddev, 0); 6038 6039 md_new_event(mddev); 6040 return 0; 6041 6042 bitmap_abort: 6043 mddev_detach(mddev); 6044 if (mddev->private) 6045 pers->free(mddev, mddev->private); 6046 mddev->private = NULL; 6047 module_put(pers->owner); 6048 md_bitmap_destroy(mddev); 6049 abort: 6050 if (mddev->level != 1 && mddev->level != 10) 6051 bioset_exit(&mddev->io_acct_set); 6052 exit_sync_set: 6053 bioset_exit(&mddev->sync_set); 6054 exit_bio_set: 6055 bioset_exit(&mddev->bio_set); 6056 return err; 6057 } 6058 EXPORT_SYMBOL_GPL(md_run); 6059 6060 int do_md_run(struct mddev *mddev) 6061 { 6062 int err; 6063 6064 set_bit(MD_NOT_READY, &mddev->flags); 6065 err = md_run(mddev); 6066 if (err) 6067 goto out; 6068 err = md_bitmap_load(mddev); 6069 if (err) { 6070 md_bitmap_destroy(mddev); 6071 goto out; 6072 } 6073 6074 if (mddev_is_clustered(mddev)) 6075 md_allow_write(mddev); 6076 6077 /* run start up tasks that require md_thread */ 6078 md_start(mddev); 6079 6080 md_wakeup_thread(mddev->thread); 6081 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6082 6083 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6084 
clear_bit(MD_NOT_READY, &mddev->flags); 6085 mddev->changed = 1; 6086 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6087 sysfs_notify_dirent_safe(mddev->sysfs_state); 6088 sysfs_notify_dirent_safe(mddev->sysfs_action); 6089 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6090 out: 6091 clear_bit(MD_NOT_READY, &mddev->flags); 6092 return err; 6093 } 6094 6095 int md_start(struct mddev *mddev) 6096 { 6097 int ret = 0; 6098 6099 if (mddev->pers->start) { 6100 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6101 md_wakeup_thread(mddev->thread); 6102 ret = mddev->pers->start(mddev); 6103 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6104 md_wakeup_thread(mddev->sync_thread); 6105 } 6106 return ret; 6107 } 6108 EXPORT_SYMBOL_GPL(md_start); 6109 6110 static int restart_array(struct mddev *mddev) 6111 { 6112 struct gendisk *disk = mddev->gendisk; 6113 struct md_rdev *rdev; 6114 bool has_journal = false; 6115 bool has_readonly = false; 6116 6117 /* Complain if it has no devices */ 6118 if (list_empty(&mddev->disks)) 6119 return -ENXIO; 6120 if (!mddev->pers) 6121 return -EINVAL; 6122 if (!mddev->ro) 6123 return -EBUSY; 6124 6125 rcu_read_lock(); 6126 rdev_for_each_rcu(rdev, mddev) { 6127 if (test_bit(Journal, &rdev->flags) && 6128 !test_bit(Faulty, &rdev->flags)) 6129 has_journal = true; 6130 if (rdev_read_only(rdev)) 6131 has_readonly = true; 6132 } 6133 rcu_read_unlock(); 6134 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6135 /* Don't restart rw with journal missing/faulty */ 6136 return -EINVAL; 6137 if (has_readonly) 6138 return -EROFS; 6139 6140 mddev->safemode = 0; 6141 mddev->ro = 0; 6142 set_disk_ro(disk, 0); 6143 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6144 /* Kick recovery or resync if necessary */ 6145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6146 md_wakeup_thread(mddev->thread); 6147 md_wakeup_thread(mddev->sync_thread); 6148 sysfs_notify_dirent_safe(mddev->sysfs_state); 6149 return 0; 6150 } 6151 
6152 static void md_clean(struct mddev *mddev) 6153 { 6154 mddev->array_sectors = 0; 6155 mddev->external_size = 0; 6156 mddev->dev_sectors = 0; 6157 mddev->raid_disks = 0; 6158 mddev->recovery_cp = 0; 6159 mddev->resync_min = 0; 6160 mddev->resync_max = MaxSector; 6161 mddev->reshape_position = MaxSector; 6162 mddev->external = 0; 6163 mddev->persistent = 0; 6164 mddev->level = LEVEL_NONE; 6165 mddev->clevel[0] = 0; 6166 mddev->flags = 0; 6167 mddev->sb_flags = 0; 6168 mddev->ro = 0; 6169 mddev->metadata_type[0] = 0; 6170 mddev->chunk_sectors = 0; 6171 mddev->ctime = mddev->utime = 0; 6172 mddev->layout = 0; 6173 mddev->max_disks = 0; 6174 mddev->events = 0; 6175 mddev->can_decrease_events = 0; 6176 mddev->delta_disks = 0; 6177 mddev->reshape_backwards = 0; 6178 mddev->new_level = LEVEL_NONE; 6179 mddev->new_layout = 0; 6180 mddev->new_chunk_sectors = 0; 6181 mddev->curr_resync = 0; 6182 atomic64_set(&mddev->resync_mismatches, 0); 6183 mddev->suspend_lo = mddev->suspend_hi = 0; 6184 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6185 mddev->recovery = 0; 6186 mddev->in_sync = 0; 6187 mddev->changed = 0; 6188 mddev->degraded = 0; 6189 mddev->safemode = 0; 6190 mddev->private = NULL; 6191 mddev->cluster_info = NULL; 6192 mddev->bitmap_info.offset = 0; 6193 mddev->bitmap_info.default_offset = 0; 6194 mddev->bitmap_info.default_space = 0; 6195 mddev->bitmap_info.chunksize = 0; 6196 mddev->bitmap_info.daemon_sleep = 0; 6197 mddev->bitmap_info.max_write_behind = 0; 6198 mddev->bitmap_info.nodes = 0; 6199 } 6200 6201 static void __md_stop_writes(struct mddev *mddev) 6202 { 6203 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6204 if (work_pending(&mddev->del_work)) 6205 flush_workqueue(md_misc_wq); 6206 if (mddev->sync_thread) { 6207 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6208 md_reap_sync_thread(mddev); 6209 } 6210 6211 del_timer_sync(&mddev->safemode_timer); 6212 6213 if (mddev->pers && mddev->pers->quiesce) { 6214 mddev->pers->quiesce(mddev, 1); 6215 
mddev->pers->quiesce(mddev, 0); 6216 } 6217 md_bitmap_flush(mddev); 6218 6219 if (mddev->ro == 0 && 6220 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6221 mddev->sb_flags)) { 6222 /* mark array as shutdown cleanly */ 6223 if (!mddev_is_clustered(mddev)) 6224 mddev->in_sync = 1; 6225 md_update_sb(mddev, 1); 6226 } 6227 /* disable policy to guarantee rdevs free resources for serialization */ 6228 mddev->serialize_policy = 0; 6229 mddev_destroy_serial_pool(mddev, NULL, true); 6230 } 6231 6232 void md_stop_writes(struct mddev *mddev) 6233 { 6234 mddev_lock_nointr(mddev); 6235 __md_stop_writes(mddev); 6236 mddev_unlock(mddev); 6237 } 6238 EXPORT_SYMBOL_GPL(md_stop_writes); 6239 6240 static void mddev_detach(struct mddev *mddev) 6241 { 6242 md_bitmap_wait_behind_writes(mddev); 6243 if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { 6244 mddev->pers->quiesce(mddev, 1); 6245 mddev->pers->quiesce(mddev, 0); 6246 } 6247 md_unregister_thread(&mddev->thread); 6248 if (mddev->queue) 6249 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6250 } 6251 6252 static void __md_stop(struct mddev *mddev) 6253 { 6254 struct md_personality *pers = mddev->pers; 6255 md_bitmap_destroy(mddev); 6256 mddev_detach(mddev); 6257 /* Ensure ->event_work is done */ 6258 if (mddev->event_work.func) 6259 flush_workqueue(md_misc_wq); 6260 spin_lock(&mddev->lock); 6261 mddev->pers = NULL; 6262 spin_unlock(&mddev->lock); 6263 pers->free(mddev, mddev->private); 6264 mddev->private = NULL; 6265 if (pers->sync_request && mddev->to_remove == NULL) 6266 mddev->to_remove = &md_redundancy_group; 6267 module_put(pers->owner); 6268 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6269 } 6270 6271 void md_stop(struct mddev *mddev) 6272 { 6273 /* stop the array and free an attached data structures. 
6274 * This is called from dm-raid 6275 */ 6276 __md_stop(mddev); 6277 bioset_exit(&mddev->bio_set); 6278 bioset_exit(&mddev->sync_set); 6279 if (mddev->level != 1 && mddev->level != 10) 6280 bioset_exit(&mddev->io_acct_set); 6281 } 6282 6283 EXPORT_SYMBOL_GPL(md_stop); 6284 6285 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6286 { 6287 int err = 0; 6288 int did_freeze = 0; 6289 6290 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6291 did_freeze = 1; 6292 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6293 md_wakeup_thread(mddev->thread); 6294 } 6295 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6296 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6297 if (mddev->sync_thread) 6298 /* Thread might be blocked waiting for metadata update 6299 * which will now never happen */ 6300 wake_up_process(mddev->sync_thread->tsk); 6301 6302 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6303 return -EBUSY; 6304 mddev_unlock(mddev); 6305 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 6306 &mddev->recovery)); 6307 wait_event(mddev->sb_wait, 6308 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6309 mddev_lock_nointr(mddev); 6310 6311 mutex_lock(&mddev->open_mutex); 6312 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6313 mddev->sync_thread || 6314 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6315 pr_warn("md: %s still in use.\n",mdname(mddev)); 6316 if (did_freeze) { 6317 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6318 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6319 md_wakeup_thread(mddev->thread); 6320 } 6321 err = -EBUSY; 6322 goto out; 6323 } 6324 if (mddev->pers) { 6325 __md_stop_writes(mddev); 6326 6327 err = -ENXIO; 6328 if (mddev->ro==1) 6329 goto out; 6330 mddev->ro = 1; 6331 set_disk_ro(mddev->gendisk, 1); 6332 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6333 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6334 md_wakeup_thread(mddev->thread); 6335 
sysfs_notify_dirent_safe(mddev->sysfs_state); 6336 err = 0; 6337 } 6338 out: 6339 mutex_unlock(&mddev->open_mutex); 6340 return err; 6341 } 6342 6343 /* mode: 6344 * 0 - completely stop and dis-assemble array 6345 * 2 - stop but do not disassemble array 6346 */ 6347 static int do_md_stop(struct mddev *mddev, int mode, 6348 struct block_device *bdev) 6349 { 6350 struct gendisk *disk = mddev->gendisk; 6351 struct md_rdev *rdev; 6352 int did_freeze = 0; 6353 6354 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6355 did_freeze = 1; 6356 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6357 md_wakeup_thread(mddev->thread); 6358 } 6359 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6360 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6361 if (mddev->sync_thread) 6362 /* Thread might be blocked waiting for metadata update 6363 * which will now never happen */ 6364 wake_up_process(mddev->sync_thread->tsk); 6365 6366 mddev_unlock(mddev); 6367 wait_event(resync_wait, (mddev->sync_thread == NULL && 6368 !test_bit(MD_RECOVERY_RUNNING, 6369 &mddev->recovery))); 6370 mddev_lock_nointr(mddev); 6371 6372 mutex_lock(&mddev->open_mutex); 6373 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6374 mddev->sysfs_active || 6375 mddev->sync_thread || 6376 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6377 pr_warn("md: %s still in use.\n",mdname(mddev)); 6378 mutex_unlock(&mddev->open_mutex); 6379 if (did_freeze) { 6380 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6381 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6382 md_wakeup_thread(mddev->thread); 6383 } 6384 return -EBUSY; 6385 } 6386 if (mddev->pers) { 6387 if (mddev->ro) 6388 set_disk_ro(disk, 0); 6389 6390 __md_stop_writes(mddev); 6391 __md_stop(mddev); 6392 6393 /* tell userspace to handle 'inactive' */ 6394 sysfs_notify_dirent_safe(mddev->sysfs_state); 6395 6396 rdev_for_each(rdev, mddev) 6397 if (rdev->raid_disk >= 0) 6398 sysfs_unlink_rdev(mddev, rdev); 6399 6400 set_capacity_and_notify(disk, 0); 
6401 mutex_unlock(&mddev->open_mutex); 6402 mddev->changed = 1; 6403 6404 if (mddev->ro) 6405 mddev->ro = 0; 6406 } else 6407 mutex_unlock(&mddev->open_mutex); 6408 /* 6409 * Free resources if final stop 6410 */ 6411 if (mode == 0) { 6412 pr_info("md: %s stopped.\n", mdname(mddev)); 6413 6414 if (mddev->bitmap_info.file) { 6415 struct file *f = mddev->bitmap_info.file; 6416 spin_lock(&mddev->lock); 6417 mddev->bitmap_info.file = NULL; 6418 spin_unlock(&mddev->lock); 6419 fput(f); 6420 } 6421 mddev->bitmap_info.offset = 0; 6422 6423 export_array(mddev); 6424 6425 md_clean(mddev); 6426 if (mddev->hold_active == UNTIL_STOP) 6427 mddev->hold_active = 0; 6428 } 6429 md_new_event(mddev); 6430 sysfs_notify_dirent_safe(mddev->sysfs_state); 6431 return 0; 6432 } 6433 6434 #ifndef MODULE 6435 static void autorun_array(struct mddev *mddev) 6436 { 6437 struct md_rdev *rdev; 6438 int err; 6439 6440 if (list_empty(&mddev->disks)) 6441 return; 6442 6443 pr_info("md: running: "); 6444 6445 rdev_for_each(rdev, mddev) { 6446 char b[BDEVNAME_SIZE]; 6447 pr_cont("<%s>", bdevname(rdev->bdev,b)); 6448 } 6449 pr_cont("\n"); 6450 6451 err = do_md_run(mddev); 6452 if (err) { 6453 pr_warn("md: do_md_run() returned %d\n", err); 6454 do_md_stop(mddev, 0, NULL); 6455 } 6456 } 6457 6458 /* 6459 * lets try to run arrays based on all disks that have arrived 6460 * until now. (those are in pending_raid_disks) 6461 * 6462 * the method: pick the first pending disk, collect all disks with 6463 * the same UUID, remove all from the pending list and put them into 6464 * the 'same_array' list. Then order this list based on superblock 6465 * update time (freshest comes first), kick out 'old' disks and 6466 * compare superblocks. If everything's fine then run it. 
*
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
	struct md_rdev *rdev0, *rdev, *tmp;
	struct mddev *mddev;
	char b[BDEVNAME_SIZE];

	pr_info("md: autorun ...\n");
	/* Each pass takes the first pending device and gathers its peers. */
	while (!list_empty(&pending_raid_disks)) {
		int unit;
		dev_t dev;
		LIST_HEAD(candidates);
		rdev0 = list_entry(pending_raid_disks.next,
					 struct md_rdev, same_set);

		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
		INIT_LIST_HEAD(&candidates);
		/*
		 * Move every pending device that super_90_load() accepts
		 * against rdev0 onto the candidates list.
		 */
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				pr_debug("md: adding %s ...\n",
					 bdevname(rdev->bdev,b));
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		/*
		 * If the minor did not survive the round trip through dev_t
		 * the preferred_minor stored on the device is unusable.
		 */
		if (rdev0->preferred_minor != unit) {
			pr_warn("md: unit number in %s is bad: %d\n",
				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
			break;
		}

		md_probe(dev);
		mddev = mddev_find(dev);
		if (!mddev)
			break;

		if (mddev_lock(mddev))
			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			pr_warn("md: %s already running, cannot run %s\n",
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
			pr_debug("md: created %s\n", mdname(mddev));
			mddev->persistent = 1;
			/* Devices that fail to bind are released again. */
			rdev_for_each_list(rdev, tmp, &candidates) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
		rdev_for_each_list(rdev, tmp, &candidates) {
			list_del_init(&rdev->same_set);
			export_rdev(rdev);
		}
		mddev_put(mddev);
	}
	pr_info("md: ... autorun DONE.\n");
}
#endif /* !MODULE */

/*
 * RAID_VERSION ioctl: copy the driver's compile-time version triple
 * to user space.  Returns 0, or -EFAULT if the copy fails.
 */
static int get_version(void __user *arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

/*
 * GET_ARRAY_INFO ioctl: fill an mdu_array_info_t from @mddev and copy it
 * to user space.  Device counts are gathered by walking the rdev list
 * under rcu_read_lock().  Returns 0, or -EFAULT if the copy fails.
 */
static int get_array_info(struct mddev *mddev, void __user *arg)
{
	mdu_array_info_t info;
	int nr,working,insync,failed,spare;
	struct md_rdev *rdev;

	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		nr++;
		if (test_bit(Faulty, &rdev->flags))
			failed++;
		else {
			working++;
			if (test_bit(In_sync, &rdev->flags))
				insync++;
			else if (test_bit(Journal, &rdev->flags))
				/* TODO: add journal count to md_u.h */
				;
			else
				spare++;
		}
	}
	rcu_read_unlock();

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
	/* ctime/utime are clamped into the range [0, U32_MAX] */
	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	info.level         = mddev->level;
	info.size          = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
		info.size = -1;
	info.nr_disks      = nr;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->md_minor;
	info.not_persistent= !mddev->persistent;

	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	info.state         = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
	if (mddev->bitmap && mddev->bitmap_info.offset)
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
	info.active_disks  = insync;
	info.working_disks = working;
	info.failed_disks  = failed;
	info.spare_disks   = spare;

	info.layout        = mddev->layout;
	/* chunk_sectors << 9: sectors converted to bytes */
	info.chunk_size    = mddev->chunk_sectors << 9;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

/*
 * GET_BITMAP_FILE ioctl: copy an mdu_bitmap_file_t to user space.
 * The buffer is zero-allocated, so when no bitmap file is set the
 * pathname copied out is empty.  Returns 0, -ENOMEM, -EFAULT, or the
 * error reported by file_path().
 */
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
	char *ptr;
	int err;

	file = kzalloc(sizeof(*file), GFP_NOIO);
	if (!file)
		return -ENOMEM;

	err = 0;
	spin_lock(&mddev->lock);
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			/*
			 * ptr is used as a position inside file->pathname;
			 * shift the remainder of the buffer down to the start.
			 */
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
	spin_unlock(&mddev->lock);

	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
		err = -EFAULT;

	kfree(file);
	return err;
}

/*
 * GET_DISK_INFO ioctl: user space supplies info.number; the matching
 * rdev is looked up under rcu_read_lock() and its device numbers, slot
 * and state bits are written back.  When no rdev matches, the entry is
 * reported with MD_DISK_REMOVED set.  Returns 0 or -EFAULT.
 */
static int get_disk_info(struct mddev *mddev, void __user * arg)
{
	mdu_disk_info_t info;
	struct md_rdev *rdev;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
		if (test_bit(Faulty, &rdev->flags))
			info.state |= (1<<MD_DISK_FAULTY);
		else if (test_bit(In_sync, &rdev->flags)) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
		if (test_bit(Journal, &rdev->flags))
			info.state |= (1<<MD_DISK_JOURNAL);
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
	rcu_read_unlock();

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

/*
 * Add the device described by @info to @mddev.  Three situations are
 * handled, in this order:
 *  - mddev->raid_disks == 0: the device is imported with the array's
 *    superblock version and, if other devices are already present,
 *    checked against the first of them with load_super();
 *  - mddev->pers is set: the device is imported and added through
 *    add_bound_rdev() (with extra cluster handshaking when clustered);
 *  - otherwise: only allowed for major_version == 0; the rdev is set up
 *    from the fields in @info.
 * Returns 0 on success or a negative errno.
 */
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
		return -EINVAL;
	}

	/* major/minor must survive the round trip through dev_t */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				pr_warn("md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * md_add_new_disk can be used once the array is assembled
	 * to add "hot spares". They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC) &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				set_bit(In_sync, &rdev->flags);
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		     rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
			/* a journal is also refused while a bitmap is attached */
			if (has_journal || mddev->bitmap) {
				export_rdev(rdev);
				return -EBUSY;
			}
			set_bit(Journal, &rdev->flags);
		}
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				set_bit(Candidate, &rdev->flags);
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = md_cluster_ops->add_new_disk(mddev, rdev);
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);

		if (err)
			export_rdev(rdev);

		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
						err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
				/* bind failed: cancel the cluster-wide add */
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
			err = add_bound_rdev(rdev);

		return err;
	}

	/* otherwise, md_add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
		return -EINVAL;
	}

	/* Entries flagged MD_DISK_FAULTY are accepted but not imported. */
	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			pr_warn("md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);

		if (!mddev->persistent) {
			pr_debug("md: nonpersistent superblock ...\n");
			/* device size in bytes divided by 512 */
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}

/*
 * HOT_REMOVE_DISK ioctl: remove device @dev from a running array.
 * Returns -ENODEV if the array has no personality, -ENXIO if @dev is
 * not a member, -EBUSY if the device still occupies a slot after
 * remove_and_add_spares() or the cluster layer refuses, else 0.
 */
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	/* a device without a slot can be kicked straight away */
	if (rdev->raid_disk < 0)
		goto kick_rdev;

	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

	if (rdev->raid_disk >= 0)
		goto busy;

kick_rdev:
	if (mddev_is_clustered(mddev)) {
		if (md_cluster_ops->remove_disk(mddev, rdev))
			goto busy;
	}

	md_kick_rdev_from_array(rdev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	/* without an md thread the superblock is updated right here */
	if (mddev->thread)
		md_wakeup_thread(mddev->thread);
	else
		md_update_sb(mddev, 1);
	md_new_event(mddev);

	return 0;
busy:
	pr_debug("md: cannot remove active disk %s from %s ...\n",
		 bdevname(rdev->bdev,b), mdname(mddev));
	return -EBUSY;
}

/*
 * HOT_ADD_DISK ioctl: import @dev and bind it to a running array with
 * raid_disk == -1, then request recovery.  Only permitted for
 * major_version == 0 arrays whose personality provides hot_add_disk.
 * Returns 0 or a negative errno.
 */
static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		pr_warn("%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		/* the specific import error is logged; callers get -EINVAL */
		pr_warn("md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}

	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

/*
 * SET_BITMAP_FILE ioctl: @fd >= 0 attaches the file behind that
 * descriptor as the bitmap file; @fd < 0 removes the current bitmap.
 * On a running array the bitmap is created/loaded or destroyed with
 * the array suspended.  Returns 0 or a negative errno.
 */
static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err = 0;

	if (mddev->pers) {
		if (!mddev->pers->quiesce || !mddev->thread)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */
		f = fget(fd);

		if (f == NULL) {
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			pr_warn("%s: error: bitmap file must open for write\n",
				mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			/* drop the reference taken by fget() */
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = md_bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
			if (err) {
				md_bitmap_destroy(mddev);
				/* fall into the removal path below */
				fd = -1;
			}
			mddev_resume(mddev);
		} else if (fd < 0) {
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		}
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			/* clear the pointer under mddev->lock, then drop the file */
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * md_set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent,layout,chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime         = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime         = ktime_get_real_seconds();

	mddev->level         = info->level;
	mddev->clevel[0]     = 0;
	/* get_array_info() reports dev_sectors / 2 as size; this is the inverse */
	mddev->dev_sectors   = 2 * (sector_t)info->size;
	mddev->raid_disks    = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent    = ! info->not_persistent;
	mddev->external	     = 0;

	mddev->layout        = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	/* chunk_size >> 9: bytes converted to sectors */
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

/*
 * Record @array_sectors as the array size.  The caller must hold
 * mddev->reconfig_mutex.  Nothing is changed when mddev->external_size
 * is set.
 */
void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

/*
 * Change the per-device size used by the array to @num_sectors via the
 * personality's resize method.  @num_sectors == 0 selects the smallest
 * rdev->sectors among the member devices.  Returns 0 or a negative
 * errno (-EINVAL, -EBUSY, -EROFS, -ENOSPC, or the resize result).
 */
static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->sync_thread)
		return -EBUSY;
	if (mddev->ro)
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		/* in "fit" mode, shrink the target to the smallest device */
		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv) {
		if (mddev_is_clustered(mddev))
			md_cluster_ops->update_size(mddev, old_dev_sectors);
		else if (mddev->queue) {
			set_capacity_and_notify(mddev->gendisk,
						mddev->array_sectors);
		}
	}
	return rv;
}

/*
 * Request a change of the number of raid disks to @raid_disks by
 * setting delta_disks/reshape_backwards and calling the personality's
 * check_reshape method.  Both fields are reset if check_reshape
 * returns a negative value.  Returns that result or a negative errno.
 */
static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (mddev->ro)
		return -EROFS;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

	/*
	 * Reject the change if any device's data_offset/new_data_offset
	 * ordering is inconsistent with growing vs. shrinking.
	 */
	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
 * fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state,ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
	    mddev->persistent	 != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			/* on failure, revert new_layout to the current layout */
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = md_bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
				module_put(md_cluster_mod);
				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
			}
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
err:
	/* error exits skip the md_update_sb() call above */
	return rv;
}

/*
 * SET_DISK_FAULTY ioctl: call md_error() on the member device @dev.
 * Returns -ENODEV if the array has no personality or @dev is not
 * found, -EBUSY if the device is not marked Faulty afterwards, else 0.
 */
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = md_find_rdev_rcu(mddev, dev);
	if (!rdev)
		err =  -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here : there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2 heads
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	/* 2 heads * 4 sectors = 8 sectors per cylinder */
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}

/*
 * Return true if @cmd is one of the ioctl commands md_ioctl() handles;
 * md_ioctl() answers -ENOTTY for everything else.
 */
static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
	default:
		return false;
	}
}

/*
 * ioctl entry point for md block devices.
 *
 * Order of processing:
 *  1. unknown commands get -ENOTTY; everything except RAID_VERSION,
 *     GET_ARRAY_INFO and GET_DISK_INFO requires CAP_SYS_ADMIN;
 *  2. RAID_VERSION is answered without touching any array;
 *  3. GET_ARRAY_INFO, GET_DISK_INFO, SET_DISK_FAULTY and GET_BITMAP_FILE
 *     run without taking the mddev lock;
 *  4. the remaining commands run with mddev_lock() held.  STOP_ARRAY and
 *     STOP_ARRAY_RO first set MD_CLOSING under open_mutex; the flag is
 *     cleared again on the way out.
 * Returns 0 or a negative errno.
 */
static int md_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	bool did_set_md_closing = false;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto out;
	default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto out;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto out;

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto out;

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto out;

	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto out;

	}

	if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
		flush_rdev_wq(mddev);

	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->recovery),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto out;
		}
		/* remember to clear MD_CLOSING again at "out" */
		did_set_md_closing = true;
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
	err = mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		/* a NULL argument is treated as an all-zero info block */
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
			goto unlock;
		}
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
				pr_warn("md: couldn't update array info. %d\n", err);
				goto unlock;
			}
			goto unlock;
		}
		if (!list_empty(&mddev->disks)) {
			pr_warn("md: array %s already has disks!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		if (mddev->raid_disks) {
			pr_warn("md: array %s already initialised!\n", mdname(mddev));
			err = -EBUSY;
			goto unlock;
		}
		err = md_set_array_info(mddev, &info);
		if (err) {
			pr_warn("md: couldn't set array info. %d\n", err);
			goto unlock;
		}
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
		goto unlock;

	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
		goto unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = md_add_new_disk(mddev, &info);
			goto unlock;
		}
		break;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 */
	if (mddev->ro && mddev->pers) {
		/* ro == 2 is switched to read-write here; any other
		 * non-zero ro value gets -EROFS.
		 */
		if (mddev->ro == 2) {
			mddev->ro = 0;
			sysfs_notify_dirent_safe(mddev->sysfs_state);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
				mddev_lock_nointr(mddev);
			}
		} else {
			err = -EROFS;
			goto unlock;
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = md_add_new_disk(mddev, &info);
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto unlock;

	default:
		err = -EINVAL;
		goto unlock;
	}

unlock:
	/* hold_active == UNTIL_IOCTL is dropped unless the result is -EINVAL */
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
	mddev_unlock(mddev);
out:
	if(did_set_md_closing)
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
#ifdef CONFIG_COMPAT
/*
 * compat ioctl entry point: commands whose argument is an integer are
 * passed through unchanged, all others have the argument converted
 * with compat_ptr() before being handed to md_ioctl().
 */
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

/*
 * .set_read_only block device operation.  Runs under mddev_lock().
 * Returns -ENODEV for an array with neither raid_disks nor external
 * metadata.  When read-only is being cleared on an array with
 * ro == 1 and a personality, the array is restarted and ro set to 2.
 */
static int md_set_read_only(struct block_device *bdev, bool ro)
{
	struct mddev *mddev = bdev->bd_disk->private_data;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	if (!mddev->raid_disks && !mddev->external) {
		err = -ENODEV;
		goto out_unlock;
	}

	/*
	 * Transitioning to read-auto need only happen for arrays that call
	 * md_write_start and which are not ready for writes yet.
	 */
	if (!ro && mddev->ro == 1 && mddev->pers) {
		err = restart_array(mddev);
		if (err)
			goto out_unlock;
		mddev->ro = 2;
	}

out_unlock:
	mddev_unlock(mddev);
	return err;
}

/*
 * .open block device operation.  On success the openers count is
 * incremented and the mddev obtained from mddev_find() is kept
 * (md_release() calls mddev_put()); on every failure path it is
 * released with mddev_put() before returning.
 */
static int md_open(struct block_device *bdev, fmode_t mode)
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
	struct mddev *mddev = mddev_find(bdev->bd_dev);
	int err;

	if (!mddev)
		return -ENODEV;

	if (mddev->gendisk != bdev->bd_disk) {
		/* we are racing with mddev_put which is discarding this
		 * bd_disk.
		 */
		mddev_put(mddev);
		/* Wait until bdev->bd_disk is definitely gone */
		if (work_pending(&mddev->del_work))
			flush_workqueue(md_misc_wq);
		return -EBUSY;
	}
	BUG_ON(mddev != bdev->bd_disk->private_data);

	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
		goto out;

	/* MD_CLOSING is set by md_ioctl() for STOP_ARRAY/STOP_ARRAY_RO */
	if (test_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		err = -ENODEV;
		goto out;
	}

	err = 0;
	atomic_inc(&mddev->openers);
	mutex_unlock(&mddev->open_mutex);

	bdev_check_media_change(bdev);
 out:
	if (err)
		mddev_put(mddev);
	return err;
}

/*
 * .release block device operation: decrement the openers count and
 * call mddev_put() on the array.
 */
static void md_release(struct gendisk *disk, fmode_t mode)
{
	struct mddev *mddev = disk->private_data;

	BUG_ON(!mddev);
	atomic_dec(&mddev->openers);
	mddev_put(mddev);
}

/*
 * .check_events block device operation: report
 * DISK_EVENT_MEDIA_CHANGE once if mddev->changed was set, and clear
 * the flag.
 */
static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
{
	struct mddev *mddev = disk->private_data;
	unsigned int ret = 0;

	if (mddev->changed)
		ret = DISK_EVENT_MEDIA_CHANGE;
	mddev->changed = 0;
	return ret;
}

/* Block device operations table for md devices. */
const struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.submit_bio	= md_submit_bio,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
	.getgeo		= md_getgeo,
	.check_events	= md_check_events,
	.set_read_only	= md_set_read_only,
};

/*
 * Main loop of an md kernel thread (see md_register_thread()).  It
 * sleeps until THREAD_WAKEUP is set, the timeout expires, or a
 * stop/park request arrives, then calls thread->run() unless the
 * thread has been asked to stop.
 */
static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

	allow_signal(SIGKILL);
	while (!kthread_should_stop()) {

		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
			 || kthread_should_stop() || kthread_should_park(),
			 thread->timeout);

		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (kthread_should_park())
			kthread_parkme();
		if (!kthread_should_stop())
			thread->run(thread);
	}

	return 0;
}

/*
 * Set THREAD_WAKEUP on @thread and wake its wait queue.  A NULL
 * @thread is accepted and ignored.
 */
void md_wakeup_thread(struct md_thread *thread)
{
	if (thread) {
		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
	}
}
EXPORT_SYMBOL(md_wakeup_thread);

/*
 * Allocate an md_thread and start a kernel thread running md_thread()
 * for it.  The task is named "<mdname>_<name>".  @run is the function
 * invoked on each wakeup.  Returns the new md_thread, or NULL if the
 * allocation or the thread creation fails.
 */
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
		struct mddev *mddev, const char *name)
{
	struct md_thread *thread;

	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
	if (!thread)
		return NULL;

	init_waitqueue_head(&thread->wqueue);

	thread->run = run;
	thread->mddev = mddev;
	/* no periodic wakeups unless a caller sets a shorter timeout */
	thread->timeout = MAX_SCHEDULE_TIMEOUT;
	thread->tsk = kthread_run(md_thread, thread,
				  "%s_%s",
				  mdname(thread->mddev),
				  name);
	if (IS_ERR(thread->tsk)) {
		kfree(thread);
		return NULL;
	}
	return thread;
}
EXPORT_SYMBOL(md_register_thread);

void md_unregister_thread(struct md_thread **threadp)
{
	struct md_thread *thread = *threadp;
	if (!thread)
		return;
	pr_debug("interrupting MD-thread pid 
%d\n", task_pid_nr(thread->tsk)); 7952 /* Locking ensures that mddev_unlock does not wake_up a 7953 * non-existent thread 7954 */ 7955 spin_lock(&pers_lock); 7956 *threadp = NULL; 7957 spin_unlock(&pers_lock); 7958 7959 kthread_stop(thread->tsk); 7960 kfree(thread); 7961 } 7962 EXPORT_SYMBOL(md_unregister_thread); 7963 7964 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7965 { 7966 if (!rdev || test_bit(Faulty, &rdev->flags)) 7967 return; 7968 7969 if (!mddev->pers || !mddev->pers->error_handler) 7970 return; 7971 mddev->pers->error_handler(mddev,rdev); 7972 if (mddev->degraded) 7973 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7974 sysfs_notify_dirent_safe(rdev->sysfs_state); 7975 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7976 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7977 md_wakeup_thread(mddev->thread); 7978 if (mddev->event_work.func) 7979 queue_work(md_misc_wq, &mddev->event_work); 7980 md_new_event(mddev); 7981 } 7982 EXPORT_SYMBOL(md_error); 7983 7984 /* seq_file implementation /proc/mdstat */ 7985 7986 static void status_unused(struct seq_file *seq) 7987 { 7988 int i = 0; 7989 struct md_rdev *rdev; 7990 7991 seq_printf(seq, "unused devices: "); 7992 7993 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7994 char b[BDEVNAME_SIZE]; 7995 i++; 7996 seq_printf(seq, "%s ", 7997 bdevname(rdev->bdev,b)); 7998 } 7999 if (!i) 8000 seq_printf(seq, "<none>"); 8001 8002 seq_printf(seq, "\n"); 8003 } 8004 8005 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8006 { 8007 sector_t max_sectors, resync, res; 8008 unsigned long dt, db = 0; 8009 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8010 int scale, recovery_active; 8011 unsigned int per_milli; 8012 8013 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8014 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8015 max_sectors = mddev->resync_max_sectors; 8016 else 8017 max_sectors = mddev->dev_sectors; 8018 8019 resync = mddev->curr_resync; 8020 if (resync <= 3) { 8021 if 
(test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8022 /* Still cleaning up */ 8023 resync = max_sectors; 8024 } else if (resync > max_sectors) 8025 resync = max_sectors; 8026 else 8027 resync -= atomic_read(&mddev->recovery_active); 8028 8029 if (resync == 0) { 8030 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8031 struct md_rdev *rdev; 8032 8033 rdev_for_each(rdev, mddev) 8034 if (rdev->raid_disk >= 0 && 8035 !test_bit(Faulty, &rdev->flags) && 8036 rdev->recovery_offset != MaxSector && 8037 rdev->recovery_offset) { 8038 seq_printf(seq, "\trecover=REMOTE"); 8039 return 1; 8040 } 8041 if (mddev->reshape_position != MaxSector) 8042 seq_printf(seq, "\treshape=REMOTE"); 8043 else 8044 seq_printf(seq, "\tresync=REMOTE"); 8045 return 1; 8046 } 8047 if (mddev->recovery_cp < MaxSector) { 8048 seq_printf(seq, "\tresync=PENDING"); 8049 return 1; 8050 } 8051 return 0; 8052 } 8053 if (resync < 3) { 8054 seq_printf(seq, "\tresync=DELAYED"); 8055 return 1; 8056 } 8057 8058 WARN_ON(max_sectors == 0); 8059 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8060 * in a sector_t, and (max_sectors>>scale) will fit in a 8061 * u32, as those are the requirements for sector_div. 8062 * Thus 'scale' must be at least 10 8063 */ 8064 scale = 10; 8065 if (sizeof(sector_t) > sizeof(unsigned long)) { 8066 while ( max_sectors/2 > (1ULL<<(scale+32))) 8067 scale++; 8068 } 8069 res = (resync>>scale)*1000; 8070 sector_div(res, (u32)((max_sectors>>scale)+1)); 8071 8072 per_milli = res; 8073 { 8074 int i, x = per_milli/50, y = 20-x; 8075 seq_printf(seq, "["); 8076 for (i = 0; i < x; i++) 8077 seq_printf(seq, "="); 8078 seq_printf(seq, ">"); 8079 for (i = 0; i < y; i++) 8080 seq_printf(seq, "."); 8081 seq_printf(seq, "] "); 8082 } 8083 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8084 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8085 "reshape" : 8086 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8087 "check" : 8088 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8089 "resync" : "recovery"))), 8090 per_milli/10, per_milli % 10, 8091 (unsigned long long) resync/2, 8092 (unsigned long long) max_sectors/2); 8093 8094 /* 8095 * dt: time from mark until now 8096 * db: blocks written from mark until now 8097 * rt: remaining time 8098 * 8099 * rt is a sector_t, which is always 64bit now. We are keeping 8100 * the original algorithm, but it is not really necessary. 8101 * 8102 * Original algorithm: 8103 * So we divide before multiply in case it is 32bit and close 8104 * to the limit. 8105 * We scale the divisor (db) by 32 to avoid losing precision 8106 * near the end of resync when the number of remaining sectors 8107 * is close to 'db'. 8108 * We then divide rt by 32 after multiplying by db to compensate. 8109 * The '+1' avoids division by zero if db is very small. 8110 */ 8111 dt = ((jiffies - mddev->resync_mark) / HZ); 8112 if (!dt) dt++; 8113 8114 curr_mark_cnt = mddev->curr_mark_cnt; 8115 recovery_active = atomic_read(&mddev->recovery_active); 8116 resync_mark_cnt = mddev->resync_mark_cnt; 8117 8118 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8119 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8120 8121 rt = max_sectors - resync; /* number of remaining sectors */ 8122 rt = div64_u64(rt, db/32+1); 8123 rt *= dt; 8124 rt >>= 5; 8125 8126 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8127 ((unsigned long)rt % 60)/6); 8128 8129 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8130 return 1; 8131 } 8132 8133 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8134 { 8135 struct list_head *tmp; 8136 loff_t l = *pos; 8137 struct mddev *mddev; 8138 8139 if (l == 0x10000) { 8140 ++*pos; 8141 return (void *)2; 8142 } 8143 if (l > 0x10000) 8144 return NULL; 8145 if (!l--) 8146 /* header */ 8147 return (void*)1; 8148 8149 spin_lock(&all_mddevs_lock); 8150 list_for_each(tmp,&all_mddevs) 8151 if (!l--) { 8152 mddev = list_entry(tmp, struct mddev, all_mddevs); 8153 mddev_get(mddev); 8154 
spin_unlock(&all_mddevs_lock); 8155 return mddev; 8156 } 8157 spin_unlock(&all_mddevs_lock); 8158 if (!l--) 8159 return (void*)2;/* tail */ 8160 return NULL; 8161 } 8162 8163 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8164 { 8165 struct list_head *tmp; 8166 struct mddev *next_mddev, *mddev = v; 8167 8168 ++*pos; 8169 if (v == (void*)2) 8170 return NULL; 8171 8172 spin_lock(&all_mddevs_lock); 8173 if (v == (void*)1) 8174 tmp = all_mddevs.next; 8175 else 8176 tmp = mddev->all_mddevs.next; 8177 if (tmp != &all_mddevs) 8178 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 8179 else { 8180 next_mddev = (void*)2; 8181 *pos = 0x10000; 8182 } 8183 spin_unlock(&all_mddevs_lock); 8184 8185 if (v != (void*)1) 8186 mddev_put(mddev); 8187 return next_mddev; 8188 8189 } 8190 8191 static void md_seq_stop(struct seq_file *seq, void *v) 8192 { 8193 struct mddev *mddev = v; 8194 8195 if (mddev && v != (void*)1 && v != (void*)2) 8196 mddev_put(mddev); 8197 } 8198 8199 static int md_seq_show(struct seq_file *seq, void *v) 8200 { 8201 struct mddev *mddev = v; 8202 sector_t sectors; 8203 struct md_rdev *rdev; 8204 8205 if (v == (void*)1) { 8206 struct md_personality *pers; 8207 seq_printf(seq, "Personalities : "); 8208 spin_lock(&pers_lock); 8209 list_for_each_entry(pers, &pers_list, list) 8210 seq_printf(seq, "[%s] ", pers->name); 8211 8212 spin_unlock(&pers_lock); 8213 seq_printf(seq, "\n"); 8214 seq->poll_event = atomic_read(&md_event_count); 8215 return 0; 8216 } 8217 if (v == (void*)2) { 8218 status_unused(seq); 8219 return 0; 8220 } 8221 8222 spin_lock(&mddev->lock); 8223 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8224 seq_printf(seq, "%s : %sactive", mdname(mddev), 8225 mddev->pers ? 
"" : "in"); 8226 if (mddev->pers) { 8227 if (mddev->ro==1) 8228 seq_printf(seq, " (read-only)"); 8229 if (mddev->ro==2) 8230 seq_printf(seq, " (auto-read-only)"); 8231 seq_printf(seq, " %s", mddev->pers->name); 8232 } 8233 8234 sectors = 0; 8235 rcu_read_lock(); 8236 rdev_for_each_rcu(rdev, mddev) { 8237 char b[BDEVNAME_SIZE]; 8238 seq_printf(seq, " %s[%d]", 8239 bdevname(rdev->bdev,b), rdev->desc_nr); 8240 if (test_bit(WriteMostly, &rdev->flags)) 8241 seq_printf(seq, "(W)"); 8242 if (test_bit(Journal, &rdev->flags)) 8243 seq_printf(seq, "(J)"); 8244 if (test_bit(Faulty, &rdev->flags)) { 8245 seq_printf(seq, "(F)"); 8246 continue; 8247 } 8248 if (rdev->raid_disk < 0) 8249 seq_printf(seq, "(S)"); /* spare */ 8250 if (test_bit(Replacement, &rdev->flags)) 8251 seq_printf(seq, "(R)"); 8252 sectors += rdev->sectors; 8253 } 8254 rcu_read_unlock(); 8255 8256 if (!list_empty(&mddev->disks)) { 8257 if (mddev->pers) 8258 seq_printf(seq, "\n %llu blocks", 8259 (unsigned long long) 8260 mddev->array_sectors / 2); 8261 else 8262 seq_printf(seq, "\n %llu blocks", 8263 (unsigned long long)sectors / 2); 8264 } 8265 if (mddev->persistent) { 8266 if (mddev->major_version != 0 || 8267 mddev->minor_version != 90) { 8268 seq_printf(seq," super %d.%d", 8269 mddev->major_version, 8270 mddev->minor_version); 8271 } 8272 } else if (mddev->external) 8273 seq_printf(seq, " super external:%s", 8274 mddev->metadata_type); 8275 else 8276 seq_printf(seq, " super non-persistent"); 8277 8278 if (mddev->pers) { 8279 mddev->pers->status(seq, mddev); 8280 seq_printf(seq, "\n "); 8281 if (mddev->pers->sync_request) { 8282 if (status_resync(seq, mddev)) 8283 seq_printf(seq, "\n "); 8284 } 8285 } else 8286 seq_printf(seq, "\n "); 8287 8288 md_bitmap_status(seq, mddev->bitmap); 8289 8290 seq_printf(seq, "\n"); 8291 } 8292 spin_unlock(&mddev->lock); 8293 8294 return 0; 8295 } 8296 8297 static const struct seq_operations md_seq_ops = { 8298 .start = md_seq_start, 8299 .next = md_seq_next, 8300 .stop = 
md_seq_stop, 8301 .show = md_seq_show, 8302 }; 8303 8304 static int md_seq_open(struct inode *inode, struct file *file) 8305 { 8306 struct seq_file *seq; 8307 int error; 8308 8309 error = seq_open(file, &md_seq_ops); 8310 if (error) 8311 return error; 8312 8313 seq = file->private_data; 8314 seq->poll_event = atomic_read(&md_event_count); 8315 return error; 8316 } 8317 8318 static int md_unloading; 8319 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8320 { 8321 struct seq_file *seq = filp->private_data; 8322 __poll_t mask; 8323 8324 if (md_unloading) 8325 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8326 poll_wait(filp, &md_event_waiters, wait); 8327 8328 /* always allow read */ 8329 mask = EPOLLIN | EPOLLRDNORM; 8330 8331 if (seq->poll_event != atomic_read(&md_event_count)) 8332 mask |= EPOLLERR | EPOLLPRI; 8333 return mask; 8334 } 8335 8336 static const struct proc_ops mdstat_proc_ops = { 8337 .proc_open = md_seq_open, 8338 .proc_read = seq_read, 8339 .proc_lseek = seq_lseek, 8340 .proc_release = seq_release, 8341 .proc_poll = mdstat_poll, 8342 }; 8343 8344 int register_md_personality(struct md_personality *p) 8345 { 8346 pr_debug("md: %s personality registered for level %d\n", 8347 p->name, p->level); 8348 spin_lock(&pers_lock); 8349 list_add_tail(&p->list, &pers_list); 8350 spin_unlock(&pers_lock); 8351 return 0; 8352 } 8353 EXPORT_SYMBOL(register_md_personality); 8354 8355 int unregister_md_personality(struct md_personality *p) 8356 { 8357 pr_debug("md: %s personality unregistered\n", p->name); 8358 spin_lock(&pers_lock); 8359 list_del_init(&p->list); 8360 spin_unlock(&pers_lock); 8361 return 0; 8362 } 8363 EXPORT_SYMBOL(unregister_md_personality); 8364 8365 int register_md_cluster_operations(struct md_cluster_operations *ops, 8366 struct module *module) 8367 { 8368 int ret = 0; 8369 spin_lock(&pers_lock); 8370 if (md_cluster_ops != NULL) 8371 ret = -EALREADY; 8372 else { 8373 md_cluster_ops = ops; 8374 md_cluster_mod = module; 8375 } 8376 
spin_unlock(&pers_lock); 8377 return ret; 8378 } 8379 EXPORT_SYMBOL(register_md_cluster_operations); 8380 8381 int unregister_md_cluster_operations(void) 8382 { 8383 spin_lock(&pers_lock); 8384 md_cluster_ops = NULL; 8385 spin_unlock(&pers_lock); 8386 return 0; 8387 } 8388 EXPORT_SYMBOL(unregister_md_cluster_operations); 8389 8390 int md_setup_cluster(struct mddev *mddev, int nodes) 8391 { 8392 int ret; 8393 if (!md_cluster_ops) 8394 request_module("md-cluster"); 8395 spin_lock(&pers_lock); 8396 /* ensure module won't be unloaded */ 8397 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8398 pr_warn("can't find md-cluster module or get it's reference.\n"); 8399 spin_unlock(&pers_lock); 8400 return -ENOENT; 8401 } 8402 spin_unlock(&pers_lock); 8403 8404 ret = md_cluster_ops->join(mddev, nodes); 8405 if (!ret) 8406 mddev->safemode_delay = 0; 8407 return ret; 8408 } 8409 8410 void md_cluster_stop(struct mddev *mddev) 8411 { 8412 if (!md_cluster_ops) 8413 return; 8414 md_cluster_ops->leave(mddev); 8415 module_put(md_cluster_mod); 8416 } 8417 8418 static int is_mddev_idle(struct mddev *mddev, int init) 8419 { 8420 struct md_rdev *rdev; 8421 int idle; 8422 int curr_events; 8423 8424 idle = 1; 8425 rcu_read_lock(); 8426 rdev_for_each_rcu(rdev, mddev) { 8427 struct gendisk *disk = rdev->bdev->bd_disk; 8428 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8429 atomic_read(&disk->sync_io); 8430 /* sync IO will cause sync_io to increase before the disk_stats 8431 * as sync_io is counted when a request starts, and 8432 * disk_stats is counted when it completes. 8433 * So resync activity will cause curr_events to be smaller than 8434 * when there was no such activity. 8435 * non-sync IO will cause disk_stat to increase without 8436 * increasing sync_io so curr_events will (eventually) 8437 * be larger than it was before. 
Once it becomes 8438 * substantially larger, the test below will cause 8439 * the array to appear non-idle, and resync will slow 8440 * down. 8441 * If there is a lot of outstanding resync activity when 8442 * we set last_event to curr_events, then all that activity 8443 * completing might cause the array to appear non-idle 8444 * and resync will be slowed down even though there might 8445 * not have been non-resync activity. This will only 8446 * happen once though. 'last_events' will soon reflect 8447 * the state where there is little or no outstanding 8448 * resync requests, and further resync activity will 8449 * always make curr_events less than last_events. 8450 * 8451 */ 8452 if (init || curr_events - rdev->last_events > 64) { 8453 rdev->last_events = curr_events; 8454 idle = 0; 8455 } 8456 } 8457 rcu_read_unlock(); 8458 return idle; 8459 } 8460 8461 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8462 { 8463 /* another "blocks" (512byte) blocks have been synced */ 8464 atomic_sub(blocks, &mddev->recovery_active); 8465 wake_up(&mddev->recovery_wait); 8466 if (!ok) { 8467 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8468 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8469 md_wakeup_thread(mddev->thread); 8470 // stop recovery, signal do_sync .... 8471 } 8472 } 8473 EXPORT_SYMBOL(md_done_sync); 8474 8475 /* md_write_start(mddev, bi) 8476 * If we need to update some array metadata (e.g. 'active' flag 8477 * in superblock) before writing, schedule a superblock update 8478 * and wait for it to complete. 8479 * A return value of 'false' means that the write wasn't recorded 8480 * and cannot proceed as the array is being suspend. 
8481 */ 8482 bool md_write_start(struct mddev *mddev, struct bio *bi) 8483 { 8484 int did_change = 0; 8485 8486 if (bio_data_dir(bi) != WRITE) 8487 return true; 8488 8489 BUG_ON(mddev->ro == 1); 8490 if (mddev->ro == 2) { 8491 /* need to switch to read/write */ 8492 mddev->ro = 0; 8493 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8494 md_wakeup_thread(mddev->thread); 8495 md_wakeup_thread(mddev->sync_thread); 8496 did_change = 1; 8497 } 8498 rcu_read_lock(); 8499 percpu_ref_get(&mddev->writes_pending); 8500 smp_mb(); /* Match smp_mb in set_in_sync() */ 8501 if (mddev->safemode == 1) 8502 mddev->safemode = 0; 8503 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8504 if (mddev->in_sync || mddev->sync_checkers) { 8505 spin_lock(&mddev->lock); 8506 if (mddev->in_sync) { 8507 mddev->in_sync = 0; 8508 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8509 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8510 md_wakeup_thread(mddev->thread); 8511 did_change = 1; 8512 } 8513 spin_unlock(&mddev->lock); 8514 } 8515 rcu_read_unlock(); 8516 if (did_change) 8517 sysfs_notify_dirent_safe(mddev->sysfs_state); 8518 if (!mddev->has_superblocks) 8519 return true; 8520 wait_event(mddev->sb_wait, 8521 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || 8522 mddev->suspended); 8523 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8524 percpu_ref_put(&mddev->writes_pending); 8525 return false; 8526 } 8527 return true; 8528 } 8529 EXPORT_SYMBOL(md_write_start); 8530 8531 /* md_write_inc can only be called when md_write_start() has 8532 * already been called at least once of the current request. 8533 * It increments the counter and is useful when a single request 8534 * is split into several parts. Each part causes an increment and 8535 * so needs a matching md_write_end(). 8536 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8537 * a spinlocked region. 
8538 */ 8539 void md_write_inc(struct mddev *mddev, struct bio *bi) 8540 { 8541 if (bio_data_dir(bi) != WRITE) 8542 return; 8543 WARN_ON_ONCE(mddev->in_sync || mddev->ro); 8544 percpu_ref_get(&mddev->writes_pending); 8545 } 8546 EXPORT_SYMBOL(md_write_inc); 8547 8548 void md_write_end(struct mddev *mddev) 8549 { 8550 percpu_ref_put(&mddev->writes_pending); 8551 8552 if (mddev->safemode == 2) 8553 md_wakeup_thread(mddev->thread); 8554 else if (mddev->safemode_delay) 8555 /* The roundup() ensures this only performs locking once 8556 * every ->safemode_delay jiffies 8557 */ 8558 mod_timer(&mddev->safemode_timer, 8559 roundup(jiffies, mddev->safemode_delay) + 8560 mddev->safemode_delay); 8561 } 8562 8563 EXPORT_SYMBOL(md_write_end); 8564 8565 /* This is used by raid0 and raid10 */ 8566 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8567 struct bio *bio, sector_t start, sector_t size) 8568 { 8569 struct bio *discard_bio = NULL; 8570 8571 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0, 8572 &discard_bio) || !discard_bio) 8573 return; 8574 8575 bio_chain(discard_bio, bio); 8576 bio_clone_blkg_association(discard_bio, bio); 8577 if (mddev->gendisk) 8578 trace_block_bio_remap(discard_bio, 8579 disk_devt(mddev->gendisk), 8580 bio->bi_iter.bi_sector); 8581 submit_bio_noacct(discard_bio); 8582 } 8583 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8584 8585 static void md_end_io_acct(struct bio *bio) 8586 { 8587 struct md_io_acct *md_io_acct = bio->bi_private; 8588 struct bio *orig_bio = md_io_acct->orig_bio; 8589 8590 orig_bio->bi_status = bio->bi_status; 8591 8592 bio_end_io_acct(orig_bio, md_io_acct->start_time); 8593 bio_put(bio); 8594 bio_endio(orig_bio); 8595 } 8596 8597 /* 8598 * Used by personalities that don't already clone the bio and thus can't 8599 * easily add the timestamp to their extended bio structure. 
8600 */ 8601 void md_account_bio(struct mddev *mddev, struct bio **bio) 8602 { 8603 struct md_io_acct *md_io_acct; 8604 struct bio *clone; 8605 8606 if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue)) 8607 return; 8608 8609 clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set); 8610 md_io_acct = container_of(clone, struct md_io_acct, bio_clone); 8611 md_io_acct->orig_bio = *bio; 8612 md_io_acct->start_time = bio_start_io_acct(*bio); 8613 8614 clone->bi_end_io = md_end_io_acct; 8615 clone->bi_private = md_io_acct; 8616 *bio = clone; 8617 } 8618 EXPORT_SYMBOL_GPL(md_account_bio); 8619 8620 /* md_allow_write(mddev) 8621 * Calling this ensures that the array is marked 'active' so that writes 8622 * may proceed without blocking. It is important to call this before 8623 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8624 * Must be called with mddev_lock held. 8625 */ 8626 void md_allow_write(struct mddev *mddev) 8627 { 8628 if (!mddev->pers) 8629 return; 8630 if (mddev->ro) 8631 return; 8632 if (!mddev->pers->sync_request) 8633 return; 8634 8635 spin_lock(&mddev->lock); 8636 if (mddev->in_sync) { 8637 mddev->in_sync = 0; 8638 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8639 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8640 if (mddev->safemode_delay && 8641 mddev->safemode == 0) 8642 mddev->safemode = 1; 8643 spin_unlock(&mddev->lock); 8644 md_update_sb(mddev, 0); 8645 sysfs_notify_dirent_safe(mddev->sysfs_state); 8646 /* wait for the dirty state to be recorded in the metadata */ 8647 wait_event(mddev->sb_wait, 8648 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8649 } else 8650 spin_unlock(&mddev->lock); 8651 } 8652 EXPORT_SYMBOL_GPL(md_allow_write); 8653 8654 #define SYNC_MARKS 10 8655 #define SYNC_MARK_STEP (3*HZ) 8656 #define UPDATE_FREQUENCY (5*60*HZ) 8657 void md_do_sync(struct md_thread *thread) 8658 { 8659 struct mddev *mddev = thread->mddev; 8660 struct mddev *mddev2; 8661 unsigned int currspeed = 0, window; 8662 
sector_t max_sectors,j, io_sectors, recovery_done; 8663 unsigned long mark[SYNC_MARKS]; 8664 unsigned long update_time; 8665 sector_t mark_cnt[SYNC_MARKS]; 8666 int last_mark,m; 8667 struct list_head *tmp; 8668 sector_t last_check; 8669 int skipped = 0; 8670 struct md_rdev *rdev; 8671 char *desc, *action = NULL; 8672 struct blk_plug plug; 8673 int ret; 8674 8675 /* just incase thread restarts... */ 8676 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8677 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8678 return; 8679 if (mddev->ro) {/* never try to sync a read-only array */ 8680 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8681 return; 8682 } 8683 8684 if (mddev_is_clustered(mddev)) { 8685 ret = md_cluster_ops->resync_start(mddev); 8686 if (ret) 8687 goto skip; 8688 8689 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8690 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8691 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8692 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8693 && ((unsigned long long)mddev->curr_resync_completed 8694 < (unsigned long long)mddev->resync_max_sectors)) 8695 goto skip; 8696 } 8697 8698 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8699 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8700 desc = "data-check"; 8701 action = "check"; 8702 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8703 desc = "requested-resync"; 8704 action = "repair"; 8705 } else 8706 desc = "resync"; 8707 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8708 desc = "reshape"; 8709 else 8710 desc = "recovery"; 8711 8712 mddev->last_sync_action = action ?: desc; 8713 8714 /* we overload curr_resync somewhat here. 
8715 * 0 == not engaged in resync at all 8716 * 2 == checking that there is no conflict with another sync 8717 * 1 == like 2, but have yielded to allow conflicting resync to 8718 * commence 8719 * other == active in resync - this many blocks 8720 * 8721 * Before starting a resync we must have set curr_resync to 8722 * 2, and then checked that every "conflicting" array has curr_resync 8723 * less than ours. When we find one that is the same or higher 8724 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8725 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8726 * This will mean we have to start checking from the beginning again. 8727 * 8728 */ 8729 8730 do { 8731 int mddev2_minor = -1; 8732 mddev->curr_resync = 2; 8733 8734 try_again: 8735 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8736 goto skip; 8737 for_each_mddev(mddev2, tmp) { 8738 if (mddev2 == mddev) 8739 continue; 8740 if (!mddev->parallel_resync 8741 && mddev2->curr_resync 8742 && match_mddev_units(mddev, mddev2)) { 8743 DEFINE_WAIT(wq); 8744 if (mddev < mddev2 && mddev->curr_resync == 2) { 8745 /* arbitrarily yield */ 8746 mddev->curr_resync = 1; 8747 wake_up(&resync_wait); 8748 } 8749 if (mddev > mddev2 && mddev->curr_resync == 1) 8750 /* no need to wait here, we can wait the next 8751 * time 'round when curr_resync == 2 8752 */ 8753 continue; 8754 /* We need to wait 'interruptible' so as not to 8755 * contribute to the load average, and not to 8756 * be caught by 'softlockup' 8757 */ 8758 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8759 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8760 mddev2->curr_resync >= mddev->curr_resync) { 8761 if (mddev2_minor != mddev2->md_minor) { 8762 mddev2_minor = mddev2->md_minor; 8763 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8764 desc, mdname(mddev), 8765 mdname(mddev2)); 8766 } 8767 mddev_put(mddev2); 8768 if (signal_pending(current)) 8769 
flush_signals(current); 8770 schedule(); 8771 finish_wait(&resync_wait, &wq); 8772 goto try_again; 8773 } 8774 finish_wait(&resync_wait, &wq); 8775 } 8776 } 8777 } while (mddev->curr_resync < 2); 8778 8779 j = 0; 8780 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8781 /* resync follows the size requested by the personality, 8782 * which defaults to physical size, but can be virtual size 8783 */ 8784 max_sectors = mddev->resync_max_sectors; 8785 atomic64_set(&mddev->resync_mismatches, 0); 8786 /* we don't use the checkpoint if there's a bitmap */ 8787 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8788 j = mddev->resync_min; 8789 else if (!mddev->bitmap) 8790 j = mddev->recovery_cp; 8791 8792 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8793 max_sectors = mddev->resync_max_sectors; 8794 /* 8795 * If the original node aborts reshaping then we continue the 8796 * reshaping, so set j again to avoid restart reshape from the 8797 * first beginning 8798 */ 8799 if (mddev_is_clustered(mddev) && 8800 mddev->reshape_position != MaxSector) 8801 j = mddev->reshape_position; 8802 } else { 8803 /* recovery follows the physical size of devices */ 8804 max_sectors = mddev->dev_sectors; 8805 j = MaxSector; 8806 rcu_read_lock(); 8807 rdev_for_each_rcu(rdev, mddev) 8808 if (rdev->raid_disk >= 0 && 8809 !test_bit(Journal, &rdev->flags) && 8810 !test_bit(Faulty, &rdev->flags) && 8811 !test_bit(In_sync, &rdev->flags) && 8812 rdev->recovery_offset < j) 8813 j = rdev->recovery_offset; 8814 rcu_read_unlock(); 8815 8816 /* If there is a bitmap, we need to make sure all 8817 * writes that started before we added a spare 8818 * complete before we start doing a recovery. 8819 * Otherwise the write might complete and (via 8820 * bitmap_endwrite) set a bit in the bitmap after the 8821 * recovery has checked that bit and skipped that 8822 * region. 
8823 */ 8824 if (mddev->bitmap) { 8825 mddev->pers->quiesce(mddev, 1); 8826 mddev->pers->quiesce(mddev, 0); 8827 } 8828 } 8829 8830 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8831 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8832 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8833 speed_max(mddev), desc); 8834 8835 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8836 8837 io_sectors = 0; 8838 for (m = 0; m < SYNC_MARKS; m++) { 8839 mark[m] = jiffies; 8840 mark_cnt[m] = io_sectors; 8841 } 8842 last_mark = 0; 8843 mddev->resync_mark = mark[last_mark]; 8844 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8845 8846 /* 8847 * Tune reconstruction: 8848 */ 8849 window = 32 * (PAGE_SIZE / 512); 8850 pr_debug("md: using %dk window, over a total of %lluk.\n", 8851 window/2, (unsigned long long)max_sectors/2); 8852 8853 atomic_set(&mddev->recovery_active, 0); 8854 last_check = 0; 8855 8856 if (j>2) { 8857 pr_debug("md: resuming %s of %s from checkpoint.\n", 8858 desc, mdname(mddev)); 8859 mddev->curr_resync = j; 8860 } else 8861 mddev->curr_resync = 3; /* no longer delayed */ 8862 mddev->curr_resync_completed = j; 8863 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8864 md_new_event(mddev); 8865 update_time = jiffies; 8866 8867 blk_start_plug(&plug); 8868 while (j < max_sectors) { 8869 sector_t sectors; 8870 8871 skipped = 0; 8872 8873 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8874 ((mddev->curr_resync > mddev->curr_resync_completed && 8875 (mddev->curr_resync - mddev->curr_resync_completed) 8876 > (max_sectors >> 4)) || 8877 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8878 (j - mddev->curr_resync_completed)*2 8879 >= mddev->resync_max - mddev->curr_resync_completed || 8880 mddev->curr_resync_completed > mddev->resync_max 8881 )) { 8882 /* time to update curr_resync_completed */ 8883 wait_event(mddev->recovery_wait, 8884 
atomic_read(&mddev->recovery_active) == 0); 8885 mddev->curr_resync_completed = j; 8886 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8887 j > mddev->recovery_cp) 8888 mddev->recovery_cp = j; 8889 update_time = jiffies; 8890 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8891 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8892 } 8893 8894 while (j >= mddev->resync_max && 8895 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8896 /* As this condition is controlled by user-space, 8897 * we can block indefinitely, so use '_interruptible' 8898 * to avoid triggering warnings. 8899 */ 8900 flush_signals(current); /* just in case */ 8901 wait_event_interruptible(mddev->recovery_wait, 8902 mddev->resync_max > j 8903 || test_bit(MD_RECOVERY_INTR, 8904 &mddev->recovery)); 8905 } 8906 8907 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8908 break; 8909 8910 sectors = mddev->pers->sync_request(mddev, j, &skipped); 8911 if (sectors == 0) { 8912 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8913 break; 8914 } 8915 8916 if (!skipped) { /* actual IO requested */ 8917 io_sectors += sectors; 8918 atomic_add(sectors, &mddev->recovery_active); 8919 } 8920 8921 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8922 break; 8923 8924 j += sectors; 8925 if (j > max_sectors) 8926 /* when skipping, extra large numbers can be returned. 
*/ 8927 j = max_sectors; 8928 if (j > 2) 8929 mddev->curr_resync = j; 8930 mddev->curr_mark_cnt = io_sectors; 8931 if (last_check == 0) 8932 /* this is the earliest that rebuild will be 8933 * visible in /proc/mdstat 8934 */ 8935 md_new_event(mddev); 8936 8937 if (last_check + window > io_sectors || j == max_sectors) 8938 continue; 8939 8940 last_check = io_sectors; 8941 repeat: 8942 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 8943 /* step marks */ 8944 int next = (last_mark+1) % SYNC_MARKS; 8945 8946 mddev->resync_mark = mark[next]; 8947 mddev->resync_mark_cnt = mark_cnt[next]; 8948 mark[next] = jiffies; 8949 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 8950 last_mark = next; 8951 } 8952 8953 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8954 break; 8955 8956 /* 8957 * this loop exits only if either when we are slower than 8958 * the 'hard' speed limit, or the system was IO-idle for 8959 * a jiffy. 8960 * the system might be non-idle CPU-wise, but we only care 8961 * about not overloading the IO subsystem. (things like an 8962 * e2fsck being done on the RAID array should execute fast) 8963 */ 8964 cond_resched(); 8965 8966 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 8967 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 8968 /((jiffies-mddev->resync_mark)/HZ +1) +1; 8969 8970 if (currspeed > speed_min(mddev)) { 8971 if (currspeed > speed_max(mddev)) { 8972 msleep(500); 8973 goto repeat; 8974 } 8975 if (!is_mddev_idle(mddev, 0)) { 8976 /* 8977 * Give other IO more of a chance. 8978 * The faster the devices, the less we wait. 8979 */ 8980 wait_event(mddev->recovery_wait, 8981 !atomic_read(&mddev->recovery_active)); 8982 } 8983 } 8984 } 8985 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 8986 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8987 ? 
"interrupted" : "done"); 8988 /* 8989 * this also signals 'finished resyncing' to md_stop 8990 */ 8991 blk_finish_plug(&plug); 8992 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 8993 8994 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8995 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8996 mddev->curr_resync > 3) { 8997 mddev->curr_resync_completed = mddev->curr_resync; 8998 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8999 } 9000 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9001 9002 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9003 mddev->curr_resync > 3) { 9004 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9005 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9006 if (mddev->curr_resync >= mddev->recovery_cp) { 9007 pr_debug("md: checkpointing %s of %s.\n", 9008 desc, mdname(mddev)); 9009 if (test_bit(MD_RECOVERY_ERROR, 9010 &mddev->recovery)) 9011 mddev->recovery_cp = 9012 mddev->curr_resync_completed; 9013 else 9014 mddev->recovery_cp = 9015 mddev->curr_resync; 9016 } 9017 } else 9018 mddev->recovery_cp = MaxSector; 9019 } else { 9020 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9021 mddev->curr_resync = MaxSector; 9022 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9023 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9024 rcu_read_lock(); 9025 rdev_for_each_rcu(rdev, mddev) 9026 if (rdev->raid_disk >= 0 && 9027 mddev->delta_disks >= 0 && 9028 !test_bit(Journal, &rdev->flags) && 9029 !test_bit(Faulty, &rdev->flags) && 9030 !test_bit(In_sync, &rdev->flags) && 9031 rdev->recovery_offset < mddev->curr_resync) 9032 rdev->recovery_offset = mddev->curr_resync; 9033 rcu_read_unlock(); 9034 } 9035 } 9036 } 9037 skip: 9038 /* set CHANGE_PENDING here since maybe another update is needed, 9039 * so other nodes are informed. 
It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev))
			set_capacity_and_notify(mddev->gendisk,
						mddev->array_sectors);
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

/*
 * Detach devices that can be removed from their slot and then try to
 * hot-add devices that do not currently occupy a slot.
 *
 * @this: when non-NULL only this rdev is considered.  In that case the
 *        function returns 0 without doing anything while
 *        MD_RECOVERY_RUNNING is set, and the add phase is skipped if the
 *        device was just removed.
 *
 * Returns the number of devices counted as spares: devices that already
 * hold a slot and are neither In_sync, Journal nor Faulty, plus every
 * non-journal device that ->hot_add_disk() accepted.  Candidate devices
 * are never counted or added.
 */
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	/* Pass 1: mark the faulty, idle devices that will be removed. */
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	/*
	 * Pass 2: ask the personality to drop every marked device, and also
	 * every idle device in a slot that is neither In_sync nor a journal.
	 * A successful removal remembers the old slot in saved_raid_disk.
	 */
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
		    atomic_read(&rdev->nr_pending)==0)) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->saved_raid_disk = rdev->raid_disk;
				rdev->raid_disk = -1;
				removed++;
			}
		}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	if (this && removed)
		goto no_add;

	/* Pass 3: count existing spares and try to add slot-less devices. */
	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			/*
			 * On a read-only array only a device that has a saved
			 * slot and does not have Bitmap_sync set may be added.
			 */
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
			/* failure here is OK */
			sysfs_link_rdev(mddev, rdev);
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}

/*
 * Work handler for mddev->del_work (queued by md_check_recovery()):
 * registers the "resync" thread that runs md_do_sync().
 *
 * If the thread cannot be created, the SYNC/RESHAPE/REQUESTED/CHECK/RUNNING
 * recovery bits are cleared and waiters on resync_wait are woken; otherwise
 * the new thread is woken.  In both cases sysfs_action is notified and a new
 * md event is raised.
 */
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	/* Cheap lockless pre-check: bail out unless something needs doing. */
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	/* Only a trylock: if the lock is contended nothing is done this time. */
	if (mddev_trylock(mddev)) {
		int spares = 0;
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev, *tmp;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each_safe(rdev, tmp, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		if (try_set_sync && !mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		/* Pick the action: reshape, then recovery, then resync. */
		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				md_bitmap_write_all(mddev->bitmap);
			}
			/* The sync thread is created from the work item. */
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);

/*
 * Unregister the sync thread and fold the outcome of the finished
 * resync/recovery/reshape back into the array state: activate spares on
 * success, let the personality finish a reshape, write the superblock,
 * clear the per-run recovery bits and set MD_RECOVERY_NEEDED again so the
 * array is re-examined.
 */
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

/*
 * Notify sysfs about the rdev state, wait (for at most 5000 ms) until
 * neither Blocked nor BlockedBadBlocks is set on @rdev, then drop one
 * pending reference with rdev_dec_pending().
 */
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes.
*/ 9472 struct md_rdev *rdev; 9473 9474 rdev_for_each(rdev, mddev) { 9475 if (rdev->data_offset > rdev->new_data_offset) 9476 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9477 else 9478 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9479 rdev->data_offset = rdev->new_data_offset; 9480 } 9481 } 9482 EXPORT_SYMBOL(md_finish_reshape); 9483 9484 /* Bad block management */ 9485 9486 /* Returns 1 on success, 0 on failure */ 9487 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9488 int is_new) 9489 { 9490 struct mddev *mddev = rdev->mddev; 9491 int rv; 9492 if (is_new) 9493 s += rdev->new_data_offset; 9494 else 9495 s += rdev->data_offset; 9496 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9497 if (rv == 0) { 9498 /* Make sure they get written out promptly */ 9499 if (test_bit(ExternalBbl, &rdev->flags)) 9500 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9501 sysfs_notify_dirent_safe(rdev->sysfs_state); 9502 set_mask_bits(&mddev->sb_flags, 0, 9503 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9504 md_wakeup_thread(rdev->mddev->thread); 9505 return 1; 9506 } else 9507 return 0; 9508 } 9509 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9510 9511 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9512 int is_new) 9513 { 9514 int rv; 9515 if (is_new) 9516 s += rdev->new_data_offset; 9517 else 9518 s += rdev->data_offset; 9519 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9520 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9521 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9522 return rv; 9523 } 9524 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9525 9526 static int md_notify_reboot(struct notifier_block *this, 9527 unsigned long code, void *x) 9528 { 9529 struct list_head *tmp; 9530 struct mddev *mddev; 9531 int need_delay = 0; 9532 9533 for_each_mddev(mddev, tmp) { 9534 if (mddev_trylock(mddev)) { 9535 if (mddev->pers) 9536 __md_stop_writes(mddev); 9537 if (mddev->persistent) 
9538 mddev->safemode = 2; 9539 mddev_unlock(mddev); 9540 } 9541 need_delay = 1; 9542 } 9543 /* 9544 * certain more exotic SCSI devices are known to be 9545 * volatile wrt too early system reboots. While the 9546 * right place to handle this issue is the given 9547 * driver, we do want to have a safe RAID driver ... 9548 */ 9549 if (need_delay) 9550 mdelay(1000*1); 9551 9552 return NOTIFY_DONE; 9553 } 9554 9555 static struct notifier_block md_notifier = { 9556 .notifier_call = md_notify_reboot, 9557 .next = NULL, 9558 .priority = INT_MAX, /* before any real devices */ 9559 }; 9560 9561 static void md_geninit(void) 9562 { 9563 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9564 9565 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9566 } 9567 9568 static int __init md_init(void) 9569 { 9570 int ret = -ENOMEM; 9571 9572 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9573 if (!md_wq) 9574 goto err_wq; 9575 9576 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9577 if (!md_misc_wq) 9578 goto err_misc_wq; 9579 9580 md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); 9581 if (!md_rdev_misc_wq) 9582 goto err_rdev_misc_wq; 9583 9584 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9585 if (ret < 0) 9586 goto err_md; 9587 9588 ret = __register_blkdev(0, "mdp", md_probe); 9589 if (ret < 0) 9590 goto err_mdp; 9591 mdp_major = ret; 9592 9593 register_reboot_notifier(&md_notifier); 9594 raid_table_header = register_sysctl_table(raid_root_table); 9595 9596 md_geninit(); 9597 return 0; 9598 9599 err_mdp: 9600 unregister_blkdev(MD_MAJOR, "md"); 9601 err_md: 9602 destroy_workqueue(md_rdev_misc_wq); 9603 err_rdev_misc_wq: 9604 destroy_workqueue(md_misc_wq); 9605 err_misc_wq: 9606 destroy_workqueue(md_wq); 9607 err_wq: 9608 return ret; 9609 } 9610 9611 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9612 { 9613 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9614 struct md_rdev *rdev2, *tmp; 9615 int role, ret; 
9616 char b[BDEVNAME_SIZE]; 9617 9618 /* 9619 * If size is changed in another node then we need to 9620 * do resize as well. 9621 */ 9622 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9623 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9624 if (ret) 9625 pr_info("md-cluster: resize failed\n"); 9626 else 9627 md_bitmap_update_sb(mddev->bitmap); 9628 } 9629 9630 /* Check for change of roles in the active devices */ 9631 rdev_for_each_safe(rdev2, tmp, mddev) { 9632 if (test_bit(Faulty, &rdev2->flags)) 9633 continue; 9634 9635 /* Check if the roles changed */ 9636 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9637 9638 if (test_bit(Candidate, &rdev2->flags)) { 9639 if (role == 0xfffe) { 9640 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); 9641 md_kick_rdev_from_array(rdev2); 9642 continue; 9643 } 9644 else 9645 clear_bit(Candidate, &rdev2->flags); 9646 } 9647 9648 if (role != rdev2->raid_disk) { 9649 /* 9650 * got activated except reshape is happening. 9651 */ 9652 if (rdev2->raid_disk == -1 && role != 0xffff && 9653 !(le32_to_cpu(sb->feature_map) & 9654 MD_FEATURE_RESHAPE_ACTIVE)) { 9655 rdev2->saved_raid_disk = role; 9656 ret = remove_and_add_spares(mddev, rdev2); 9657 pr_info("Activated spare: %s\n", 9658 bdevname(rdev2->bdev,b)); 9659 /* wakeup mddev->thread here, so array could 9660 * perform resync with the new activated disk */ 9661 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9662 md_wakeup_thread(mddev->thread); 9663 } 9664 /* device faulty 9665 * We just want to do the minimum to mark the disk 9666 * as faulty. The recovery is performed by the 9667 * one who initiated the error. 
9668 */ 9669 if ((role == 0xfffe) || (role == 0xfffd)) { 9670 md_error(mddev, rdev2); 9671 clear_bit(Blocked, &rdev2->flags); 9672 } 9673 } 9674 } 9675 9676 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { 9677 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9678 if (ret) 9679 pr_warn("md: updating array disks failed. %d\n", ret); 9680 } 9681 9682 /* 9683 * Since mddev->delta_disks has already updated in update_raid_disks, 9684 * so it is time to check reshape. 9685 */ 9686 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9687 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9688 /* 9689 * reshape is happening in the remote node, we need to 9690 * update reshape_position and call start_reshape. 9691 */ 9692 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 9693 if (mddev->pers->update_reshape_pos) 9694 mddev->pers->update_reshape_pos(mddev); 9695 if (mddev->pers->start_reshape) 9696 mddev->pers->start_reshape(mddev); 9697 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9698 mddev->reshape_position != MaxSector && 9699 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9700 /* reshape is just done in another node. */ 9701 mddev->reshape_position = MaxSector; 9702 if (mddev->pers->update_reshape_pos) 9703 mddev->pers->update_reshape_pos(mddev); 9704 } 9705 9706 /* Finally set the event to be up to date */ 9707 mddev->events = le64_to_cpu(sb->events); 9708 } 9709 9710 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9711 { 9712 int err; 9713 struct page *swapout = rdev->sb_page; 9714 struct mdp_superblock_1 *sb; 9715 9716 /* Store the sb page of the rdev in the swapout temporary 9717 * variable in case we err in the future 9718 */ 9719 rdev->sb_page = NULL; 9720 err = alloc_disk_sb(rdev); 9721 if (err == 0) { 9722 ClearPageUptodate(rdev->sb_page); 9723 rdev->sb_loaded = 0; 9724 err = super_types[mddev->major_version]. 
9725 load_super(rdev, NULL, mddev->minor_version); 9726 } 9727 if (err < 0) { 9728 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 9729 __func__, __LINE__, rdev->desc_nr, err); 9730 if (rdev->sb_page) 9731 put_page(rdev->sb_page); 9732 rdev->sb_page = swapout; 9733 rdev->sb_loaded = 1; 9734 return err; 9735 } 9736 9737 sb = page_address(rdev->sb_page); 9738 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 9739 * is not set 9740 */ 9741 9742 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 9743 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 9744 9745 /* The other node finished recovery, call spare_active to set 9746 * device In_sync and mddev->degraded 9747 */ 9748 if (rdev->recovery_offset == MaxSector && 9749 !test_bit(In_sync, &rdev->flags) && 9750 mddev->pers->spare_active(mddev)) 9751 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9752 9753 put_page(swapout); 9754 return 0; 9755 } 9756 9757 void md_reload_sb(struct mddev *mddev, int nr) 9758 { 9759 struct md_rdev *rdev; 9760 int err; 9761 9762 /* Find the rdev */ 9763 rdev_for_each_rcu(rdev, mddev) { 9764 if (rdev->desc_nr == nr) 9765 break; 9766 } 9767 9768 if (!rdev || rdev->desc_nr != nr) { 9769 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 9770 return; 9771 } 9772 9773 err = read_rdev(mddev, rdev); 9774 if (err < 0) 9775 return; 9776 9777 check_sb_changes(mddev, rdev); 9778 9779 /* Read all rdev's to update recovery_offset */ 9780 rdev_for_each_rcu(rdev, mddev) { 9781 if (!test_bit(Faulty, &rdev->flags)) 9782 read_rdev(mddev, rdev); 9783 } 9784 } 9785 EXPORT_SYMBOL(md_reload_sb); 9786 9787 #ifndef MODULE 9788 9789 /* 9790 * Searches all registered partitions for autorun RAID arrays 9791 * at boot time. 
9792 */ 9793 9794 static DEFINE_MUTEX(detected_devices_mutex); 9795 static LIST_HEAD(all_detected_devices); 9796 struct detected_devices_node { 9797 struct list_head list; 9798 dev_t dev; 9799 }; 9800 9801 void md_autodetect_dev(dev_t dev) 9802 { 9803 struct detected_devices_node *node_detected_dev; 9804 9805 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 9806 if (node_detected_dev) { 9807 node_detected_dev->dev = dev; 9808 mutex_lock(&detected_devices_mutex); 9809 list_add_tail(&node_detected_dev->list, &all_detected_devices); 9810 mutex_unlock(&detected_devices_mutex); 9811 } 9812 } 9813 9814 void md_autostart_arrays(int part) 9815 { 9816 struct md_rdev *rdev; 9817 struct detected_devices_node *node_detected_dev; 9818 dev_t dev; 9819 int i_scanned, i_passed; 9820 9821 i_scanned = 0; 9822 i_passed = 0; 9823 9824 pr_info("md: Autodetecting RAID arrays.\n"); 9825 9826 mutex_lock(&detected_devices_mutex); 9827 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 9828 i_scanned++; 9829 node_detected_dev = list_entry(all_detected_devices.next, 9830 struct detected_devices_node, list); 9831 list_del(&node_detected_dev->list); 9832 dev = node_detected_dev->dev; 9833 kfree(node_detected_dev); 9834 mutex_unlock(&detected_devices_mutex); 9835 rdev = md_import_device(dev,0, 90); 9836 mutex_lock(&detected_devices_mutex); 9837 if (IS_ERR(rdev)) 9838 continue; 9839 9840 if (test_bit(Faulty, &rdev->flags)) 9841 continue; 9842 9843 set_bit(AutoDetected, &rdev->flags); 9844 list_add(&rdev->same_set, &pending_raid_disks); 9845 i_passed++; 9846 } 9847 mutex_unlock(&detected_devices_mutex); 9848 9849 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 9850 9851 autorun_devices(part); 9852 } 9853 9854 #endif /* !MODULE */ 9855 9856 static __exit void md_exit(void) 9857 { 9858 struct mddev *mddev; 9859 struct list_head *tmp; 9860 int delay = 1; 9861 9862 unregister_blkdev(MD_MAJOR,"md"); 9863 unregister_blkdev(mdp_major, "mdp"); 
9864 unregister_reboot_notifier(&md_notifier); 9865 unregister_sysctl_table(raid_table_header); 9866 9867 /* We cannot unload the modules while some process is 9868 * waiting for us in select() or poll() - wake them up 9869 */ 9870 md_unloading = 1; 9871 while (waitqueue_active(&md_event_waiters)) { 9872 /* not safe to leave yet */ 9873 wake_up(&md_event_waiters); 9874 msleep(delay); 9875 delay += delay; 9876 } 9877 remove_proc_entry("mdstat", NULL); 9878 9879 for_each_mddev(mddev, tmp) { 9880 export_array(mddev); 9881 mddev->ctime = 0; 9882 mddev->hold_active = 0; 9883 /* 9884 * for_each_mddev() will call mddev_put() at the end of each 9885 * iteration. As the mddev is now fully clear, this will 9886 * schedule the mddev for destruction by a workqueue, and the 9887 * destroy_workqueue() below will wait for that to complete. 9888 */ 9889 } 9890 destroy_workqueue(md_rdev_misc_wq); 9891 destroy_workqueue(md_misc_wq); 9892 destroy_workqueue(md_wq); 9893 } 9894 9895 subsys_initcall(md_init); 9896 module_exit(md_exit) 9897 9898 static int get_ro(char *buffer, const struct kernel_param *kp) 9899 { 9900 return sprintf(buffer, "%d\n", start_readonly); 9901 } 9902 static int set_ro(const char *val, const struct kernel_param *kp) 9903 { 9904 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 9905 } 9906 9907 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9908 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9909 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9910 module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 9911 9912 MODULE_LICENSE("GPL"); 9913 MODULE_DESCRIPTION("MD RAID framework"); 9914 MODULE_ALIAS("md"); 9915 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9916