// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
	or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
	adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
	or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/
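/*
 * Illustrative sketch of the severity guidance above. The messages and
 * call sites below are hypothetical examples, not actual md log lines:
 *
 *	pr_crit("md: %s: array is broken, failing further writes\n", ...);
 *	pr_err("md: %s: metadata write failed\n", ...);
 *	pr_warn("md: %s: device has incompatible metadata, not adding\n", ...);
 *	pr_info("md: %s: resync started\n", ...);
 *	pr_debug("md: %s: handling read request\n", ...);
 */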
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serialization if it meets both conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}
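/*
 * A rough worked example for the serialization setup above, assuming
 * 4K pages and a 4-byte atomic_t: serial_nums is
 * 1 << (PAGE_SHIFT - ilog2(sizeof(atomic_t))) = 1 << (12 - 2) = 1024
 * buckets per rdev, matching the BARRIER_BUCKETS_NR bucket count that
 * raid1/raid10 use for barrier hashing. A hedged usage sketch, not a
 * real call site:
 *
 *	if (rdev_need_serial(rdev))
 *		mddev_create_serial_pool(mddev, rdev, false);
 */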
/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			      bool is_suspend)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!is_suspend)
		mddev_suspend(mddev);

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		goto abort;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}

abort:
	if (!is_suspend)
		mddev_resume(mddev);
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			       bool is_suspend)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		if (!is_suspend)
			mddev_suspend(mddev);
		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
		if (!is_suspend)
			mddev_resume(mddev);
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
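/*
 * The speed_limit_min/speed_limit_max sysctls registered above are
 * consulted by the resync/recovery loop roughly as follows (a
 * simplified, hedged sketch of the throttling pattern, not the exact
 * md_do_sync() code):
 *
 *	if (currspeed > speed_min(mddev)) {
 *		if (currspeed > speed_max(mddev) ||
 *		    !is_mddev_idle(mddev, 0)) {
 *			msleep(500);
 *			goto repeat;
 *		}
 *	}
 */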
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
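/*
 * The event counter above is what lets user space wait for "something
 * happened" on any array: a monitor reads /proc/mdstat once and then
 * poll()s it, waking up when md_new_event() bumps md_event_count. A
 * hedged user-space sketch (the exact revents bits to watch and the
 * need to re-read from offset 0 are assumptions here):
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	char buf[4096];
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *
 *	read(fd, buf, sizeof(buf));
 *	while (poll(&pfd, 1, -1) > 0) {
 *		lseek(fd, 0, SEEK_SET);
 *		read(fd, buf, sizeof(buf));	// state changed, re-read
 *	}
 */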
/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
			lockdep_is_held(&mddev->reconfig_mutex));

	WARN_ON_ONCE(thread && current == thread->tsk);
	if (mddev->suspended++)
		return;
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	percpu_ref_kill(&mddev->active_io);

	if (mddev->pers && mddev->pers->prepare_suspend)
		mddev->pers->prepare_suspend(mddev);

	wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);
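/*
 * A hedged usage sketch for the pair above (illustrative only, not a
 * real call site): suspend/resume nest via mddev->suspended and both
 * ends are expected to run under reconfig_mutex, so a reconfiguration
 * path looks roughly like:
 *
 *	mddev_suspend(mddev);
 *	// ... no new I/O can enter ->make_request() here ...
 *	mddev_resume(mddev);
 */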
/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending))
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			struct bio *bi;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_bioset(rdev->bdev, 0,
					      REQ_OP_WRITE | REQ_PREFLUSH,
					      GFP_NOIO, &mddev->bio_set);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid
	 * a deadlock: other bios that have passed the md_handle_request
	 * suspend check could be waiting for this flush_bio, while the
	 * md_handle_request call below could be waiting for those bios
	 * because of the suspend check
	 */
	spin_lock_irq(&mddev->lock);
	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	spin_unlock_irq(&mddev->lock);
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;

		/*
		 * make_request() will never return an error here, it only
		 * returns an error in raid5_make_request() called by dm-raid.
		 * Since dm always splits data and flush operations into
		 * two separate bios, the io size of a flush submitted by dm
		 * is always 0, so make_request() will not be called here.
		 */
		if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
			bio_io_error(bio);
	}

	/* The pair is percpu_ref_get() from md_flush_request() */
	percpu_ref_put(&mddev->active_io);
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 * being finished in another context.  Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* flush requests wait until ongoing flush completes,
	 * hence coalescing all the pending requests.
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		/*
		 * Grab a reference to make sure mddev_suspend() will wait for
		 * this flush to be done.
		 *
		 * md_flush_request() is called under md_handle_request() and
		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
		 * won't pass, percpu_ref_tryget_live() can't be used because
		 * percpu_ref_kill() can be called by mddev_suspend()
		 * concurrently.
		 */
		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
		percpu_ref_get(&mddev->active_io);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
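/*
 * A worked example of the coalescing above (boottime values are
 * illustrative): suppose the previous flush was started at t=100, so
 * mddev->prev_flush_start == 100. A REQ_PREFLUSH bio whose req_start is
 * 90 finds ktime_after(90, 100) false: the flush started at t=100
 * already covers everything that had completed before t=90, so the bio
 * is finished (or merely has REQ_PREFLUSH stripped) without issuing new
 * flushes. A bio with req_start == 110 instead becomes mddev->flush_bio
 * and schedules submit_flushes() to send a PREFLUSH to every in-sync
 * rdev.
 */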
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void __mddev_put(struct mddev *mddev)
{
	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
	    mddev->ctime || mddev->hold_active)
		return;

	/* Array is not configured at all, and not held active, so destroy it */
	set_bit(MD_DELETED, &mddev->flags);

	/*
	 * Call queue_work inside the spinlock so that flush_workqueue() after
	 * mddev_find will succeed in waiting for the work to be done.
	 */
	queue_work(md_misc_wq, &mddev->del_work);
}

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;

	__mddev_put(mddev);
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->sync_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;

	INIT_WORK(&mddev->sync_work, md_start_sync);
	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = 0;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;		/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}
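/*
 * A worked example for the unit/minor mapping used by mddev_alloc()
 * below, assuming MdpMinorShift is 6 (i.e. 64 minors per partitionable
 * "mdp" array): the low MdpMinorShift bits of an mdp minor address the
 * partitions, so unit &= ~((1 << MdpMinorShift) - 1) rounds a partition
 * node down to its whole-device node, and
 * md_minor = MINOR(unit) >> MdpMinorShift maps, for example, minor 128
 * to array number 2.
 */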
static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);
	mddev_init(new);

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_free_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_free_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;
out_free_new:
	spin_unlock(&all_mddevs_lock);
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So we set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
799 */ 800 const struct attribute_group *to_remove = mddev->to_remove; 801 mddev->to_remove = NULL; 802 mddev->sysfs_active = 1; 803 mutex_unlock(&mddev->reconfig_mutex); 804 805 if (mddev->kobj.sd) { 806 if (to_remove != &md_redundancy_group) 807 sysfs_remove_group(&mddev->kobj, to_remove); 808 if (mddev->pers == NULL || 809 mddev->pers->sync_request == NULL) { 810 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 811 if (mddev->sysfs_action) 812 sysfs_put(mddev->sysfs_action); 813 if (mddev->sysfs_completed) 814 sysfs_put(mddev->sysfs_completed); 815 if (mddev->sysfs_degraded) 816 sysfs_put(mddev->sysfs_degraded); 817 mddev->sysfs_action = NULL; 818 mddev->sysfs_completed = NULL; 819 mddev->sysfs_degraded = NULL; 820 } 821 } 822 mddev->sysfs_active = 0; 823 } else 824 mutex_unlock(&mddev->reconfig_mutex); 825 826 md_wakeup_thread(mddev->thread); 827 wake_up(&mddev->sb_wait); 828 829 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 830 list_del_init(&rdev->same_set); 831 kobject_del(&rdev->kobj); 832 export_rdev(rdev, mddev); 833 } 834 } 835 EXPORT_SYMBOL_GPL(mddev_unlock); 836 md_find_rdev_nr_rcu(struct mddev * mddev,int nr)837 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 838 { 839 struct md_rdev *rdev; 840 841 rdev_for_each_rcu(rdev, mddev) 842 if (rdev->desc_nr == nr) 843 return rdev; 844 845 return NULL; 846 } 847 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 848 find_rdev(struct mddev * mddev,dev_t dev)849 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 850 { 851 struct md_rdev *rdev; 852 853 rdev_for_each(rdev, mddev) 854 if (rdev->bdev->bd_dev == dev) 855 return rdev; 856 857 return NULL; 858 } 859 md_find_rdev_rcu(struct mddev * mddev,dev_t dev)860 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 861 { 862 struct md_rdev *rdev; 863 864 rdev_for_each_rcu(rdev, mddev) 865 if (rdev->bdev->bd_dev == dev) 866 return rdev; 867 868 return NULL; 869 } 870 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 871 find_pers(int level,char * clevel)872 static struct md_personality *find_pers(int level, char *clevel) 873 { 874 struct md_personality *pers; 875 list_for_each_entry(pers, &pers_list, list) { 876 if (level != LEVEL_NONE && pers->level == level) 877 return pers; 878 if (strcmp(pers->name, clevel)==0) 879 return pers; 880 } 881 return NULL; 882 } 883 884 /* return the offset of the super block in 512byte sectors */ calc_dev_sboffset(struct md_rdev * rdev)885 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 886 { 887 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 888 } 889 alloc_disk_sb(struct md_rdev * rdev)890 static int alloc_disk_sb(struct md_rdev *rdev) 891 { 892 rdev->sb_page = alloc_page(GFP_KERNEL); 893 if (!rdev->sb_page) 894 return -ENOMEM; 895 return 0; 896 } 897 md_rdev_clear(struct md_rdev * rdev)898 void md_rdev_clear(struct md_rdev *rdev) 899 { 900 if (rdev->sb_page) { 901 put_page(rdev->sb_page); 902 rdev->sb_loaded = 0; 903 rdev->sb_page = NULL; 904 rdev->sb_start = 0; 905 rdev->sectors = 0; 906 } 907 if (rdev->bb_page) { 908 put_page(rdev->bb_page); 909 rdev->bb_page = NULL; 910 } 911 badblocks_exit(&rdev->badblocks); 912 } 913 EXPORT_SYMBOL_GPL(md_rdev_clear); 914 super_written(struct bio * bio)915 static void super_written(struct bio *bio) 916 { 917 struct md_rdev *rdev = bio->bi_private; 918 struct mddev *mddev = rdev->mddev; 919 920 if (bio->bi_status) { 921 pr_err("md: %s gets error=%d\n", __func__, 922 blk_status_to_errno(bio->bi_status)); 923 md_error(mddev, rdev); 924 if (!test_bit(Faulty, 
&rdev->flags) 925 && (bio->bi_opf & MD_FAILFAST)) { 926 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 927 set_bit(LastDev, &rdev->flags); 928 } 929 } else 930 clear_bit(LastDev, &rdev->flags); 931 932 bio_put(bio); 933 934 rdev_dec_pending(rdev, mddev); 935 936 if (atomic_dec_and_test(&mddev->pending_writes)) 937 wake_up(&mddev->sb_wait); 938 } 939 md_super_write(struct mddev * mddev,struct md_rdev * rdev,sector_t sector,int size,struct page * page)940 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 941 sector_t sector, int size, struct page *page) 942 { 943 /* write first size bytes of page to sector of rdev 944 * Increment mddev->pending_writes before returning 945 * and decrement it on completion, waking up sb_wait 946 * if zero is reached. 947 * If an error occurred, call md_error 948 */ 949 struct bio *bio; 950 951 if (!page) 952 return; 953 954 if (test_bit(Faulty, &rdev->flags)) 955 return; 956 957 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 958 1, 959 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 960 | REQ_PREFLUSH | REQ_FUA, 961 GFP_NOIO, &mddev->sync_set); 962 963 atomic_inc(&rdev->nr_pending); 964 965 bio->bi_iter.bi_sector = sector; 966 __bio_add_page(bio, page, size, 0); 967 bio->bi_private = rdev; 968 bio->bi_end_io = super_written; 969 970 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 971 test_bit(FailFast, &rdev->flags) && 972 !test_bit(LastDev, &rdev->flags)) 973 bio->bi_opf |= MD_FAILFAST; 974 975 atomic_inc(&mddev->pending_writes); 976 submit_bio(bio); 977 } 978 md_super_wait(struct mddev * mddev)979 int md_super_wait(struct mddev *mddev) 980 { 981 /* wait for all superblock writes that were scheduled to complete */ 982 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 983 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 984 return -EAGAIN; 985 return 0; 986 } 987 sync_page_io(struct md_rdev * rdev,sector_t sector,int size,struct page * page,blk_opf_t opf,bool metadata_op)988 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 989 struct page *page, blk_opf_t opf, bool metadata_op) 990 { 991 struct bio bio; 992 struct bio_vec bvec; 993 994 if (metadata_op && rdev->meta_bdev) 995 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 996 else 997 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 998 999 if (metadata_op) 1000 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1001 else if (rdev->mddev->reshape_position != MaxSector && 1002 (rdev->mddev->reshape_backwards == 1003 (sector >= rdev->mddev->reshape_position))) 1004 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1005 else 1006 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1007 __bio_add_page(&bio, page, size, 0); 1008 1009 submit_bio_wait(&bio); 1010 1011 return !bio.bi_status; 1012 } 1013 EXPORT_SYMBOL_GPL(sync_page_io); 1014 read_disk_sb(struct md_rdev * rdev,int size)1015 static int read_disk_sb(struct md_rdev *rdev, int size) 1016 { 1017 if (rdev->sb_loaded) 1018 return 0; 1019 1020 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1021 goto fail; 1022 rdev->sb_loaded = 1; 1023 return 0; 1024 1025 fail: 1026 pr_err("md: disabled device %pg, could not read superblock.\n", 1027 rdev->bdev); 1028 return -EINVAL; 1029 } 1030 md_uuid_equal(mdp_super_t * sb1,mdp_super_t * sb2)1031 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1032 { 1033 return sb1->set_uuid0 == sb2->set_uuid0 && 1034 sb1->set_uuid1 == sb2->set_uuid1 && 1035 sb1->set_uuid2 == sb2->set_uuid2 && 1036 sb1->set_uuid3 == 
sb2->set_uuid3; 1037 } 1038 md_sb_equal(mdp_super_t * sb1,mdp_super_t * sb2)1039 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1040 { 1041 int ret; 1042 mdp_super_t *tmp1, *tmp2; 1043 1044 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1045 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1046 1047 if (!tmp1 || !tmp2) { 1048 ret = 0; 1049 goto abort; 1050 } 1051 1052 *tmp1 = *sb1; 1053 *tmp2 = *sb2; 1054 1055 /* 1056 * nr_disks is not constant 1057 */ 1058 tmp1->nr_disks = 0; 1059 tmp2->nr_disks = 0; 1060 1061 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1062 abort: 1063 kfree(tmp1); 1064 kfree(tmp2); 1065 return ret; 1066 } 1067 md_csum_fold(u32 csum)1068 static u32 md_csum_fold(u32 csum) 1069 { 1070 csum = (csum & 0xffff) + (csum >> 16); 1071 return (csum & 0xffff) + (csum >> 16); 1072 } 1073 calc_sb_csum(mdp_super_t * sb)1074 static unsigned int calc_sb_csum(mdp_super_t *sb) 1075 { 1076 u64 newcsum = 0; 1077 u32 *sb32 = (u32*)sb; 1078 int i; 1079 unsigned int disk_csum, csum; 1080 1081 disk_csum = sb->sb_csum; 1082 sb->sb_csum = 0; 1083 1084 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1085 newcsum += sb32[i]; 1086 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1087 1088 #ifdef CONFIG_ALPHA 1089 /* This used to use csum_partial, which was wrong for several 1090 * reasons including that different results are returned on 1091 * different architectures. It isn't critical that we get exactly 1092 * the same return value as before (we always csum_fold before 1093 * testing, and that removes any differences). However as we 1094 * know that csum_partial always returned a 16bit value on 1095 * alphas, do a fold to maximise conformity to previous behaviour. 1096 */ 1097 sb->sb_csum = md_csum_fold(disk_csum); 1098 #else 1099 sb->sb_csum = disk_csum; 1100 #endif 1101 return csum; 1102 } 1103 1104 /* 1105 * Handle superblock details. 1106 * We want to be able to handle multiple superblock formats 1107 * so we have a common interface to them all, and an array of 1108 * different handlers. 1109 * We rely on user-space to write the initial superblock, and support 1110 * reading and updating of superblocks. 1111 * Interface methods are: 1112 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1113 * loads and validates a superblock on dev. 1114 * if refdev != NULL, compare superblocks on both devices 1115 * Return: 1116 * 0 - dev has a superblock that is compatible with refdev 1117 * 1 - dev has a superblock that is compatible and newer than refdev 1118 * so dev should be used as the refdev in future 1119 * -EINVAL superblock incompatible or invalid 1120 * -othererror e.g. -EIO 1121 * 1122 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1123 * Verify that dev is acceptable into mddev. 1124 * The first time, mddev->raid_disks will be 0, and data from 1125 * dev should be merged in. Subsequent calls check that dev 1126 * is new enough. Return 0 or -EINVAL 1127 * 1128 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1129 * Update the superblock for rdev with data in mddev 1130 * This does not write to disc. 
1131 * 1132 */ 1133 1134 struct super_type { 1135 char *name; 1136 struct module *owner; 1137 int (*load_super)(struct md_rdev *rdev, 1138 struct md_rdev *refdev, 1139 int minor_version); 1140 int (*validate_super)(struct mddev *mddev, 1141 struct md_rdev *freshest, 1142 struct md_rdev *rdev); 1143 void (*sync_super)(struct mddev *mddev, 1144 struct md_rdev *rdev); 1145 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1146 sector_t num_sectors); 1147 int (*allow_new_offset)(struct md_rdev *rdev, 1148 unsigned long long new_offset); 1149 }; 1150 1151 /* 1152 * Check that the given mddev has no bitmap. 1153 * 1154 * This function is called from the run method of all personalities that do not 1155 * support bitmaps. It prints an error message and returns non-zero if mddev 1156 * has a bitmap. Otherwise, it returns 0. 1157 * 1158 */ md_check_no_bitmap(struct mddev * mddev)1159 int md_check_no_bitmap(struct mddev *mddev) 1160 { 1161 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1162 return 0; 1163 pr_warn("%s: bitmaps are not supported for %s\n", 1164 mdname(mddev), mddev->pers->name); 1165 return 1; 1166 } 1167 EXPORT_SYMBOL(md_check_no_bitmap); 1168 1169 /* 1170 * load_super for 0.90.0 1171 */ super_90_load(struct md_rdev * rdev,struct md_rdev * refdev,int minor_version)1172 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1173 { 1174 mdp_super_t *sb; 1175 int ret; 1176 bool spare_disk = true; 1177 1178 /* 1179 * Calculate the position of the superblock (512byte sectors), 1180 * it's at the end of the disk. 1181 * 1182 * It also happens to be a multiple of 4Kb. 1183 */ 1184 rdev->sb_start = calc_dev_sboffset(rdev); 1185 1186 ret = read_disk_sb(rdev, MD_SB_BYTES); 1187 if (ret) 1188 return ret; 1189 1190 ret = -EINVAL; 1191 1192 sb = page_address(rdev->sb_page); 1193 1194 if (sb->md_magic != MD_SB_MAGIC) { 1195 pr_warn("md: invalid raid superblock magic on %pg\n", 1196 rdev->bdev); 1197 goto abort; 1198 } 1199 1200 if (sb->major_version != 0 || 1201 sb->minor_version < 90 || 1202 sb->minor_version > 91) { 1203 pr_warn("Bad version number %d.%d on %pg\n", 1204 sb->major_version, sb->minor_version, rdev->bdev); 1205 goto abort; 1206 } 1207 1208 if (sb->raid_disks <= 0) 1209 goto abort; 1210 1211 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1212 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1213 goto abort; 1214 } 1215 1216 rdev->preferred_minor = sb->md_minor; 1217 rdev->data_offset = 0; 1218 rdev->new_data_offset = 0; 1219 rdev->sb_size = MD_SB_BYTES; 1220 rdev->badblocks.shift = -1; 1221 1222 if (sb->level == LEVEL_MULTIPATH) 1223 rdev->desc_nr = -1; 1224 else 1225 rdev->desc_nr = sb->this_disk.number; 1226 1227 /* not spare disk, or LEVEL_MULTIPATH */ 1228 if (sb->level == LEVEL_MULTIPATH || 1229 (rdev->desc_nr >= 0 && 1230 rdev->desc_nr < MD_SB_DISKS && 1231 sb->disks[rdev->desc_nr].state & 1232 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) 1233 spare_disk = false; 1234 1235 if (!refdev) { 1236 if (!spare_disk) 1237 ret = 1; 1238 else 1239 ret = 0; 1240 } else { 1241 __u64 ev1, ev2; 1242 mdp_super_t *refsb = page_address(refdev->sb_page); 1243 if (!md_uuid_equal(refsb, sb)) { 1244 pr_warn("md: %pg has different UUID to %pg\n", 1245 rdev->bdev, refdev->bdev); 1246 goto abort; 1247 } 1248 if (!md_sb_equal(refsb, sb)) { 1249 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1250 rdev->bdev, refdev->bdev); 1251 goto abort; 1252 } 1253 ev1 = md_event(sb); 1254 ev2 = 
md_event(refsb); 1255 1256 if (!spare_disk && ev1 > ev2) 1257 ret = 1; 1258 else 1259 ret = 0; 1260 } 1261 rdev->sectors = rdev->sb_start; 1262 /* Limit to 4TB as metadata cannot record more than that. 1263 * (not needed for Linear and RAID0 as metadata doesn't 1264 * record this size) 1265 */ 1266 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1267 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1268 1269 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1270 /* "this cannot possibly happen" ... */ 1271 ret = -EINVAL; 1272 1273 abort: 1274 return ret; 1275 } 1276 1277 /* 1278 * validate_super for 0.90.0 1279 * note: we are not using "freshest" for 0.9 superblock 1280 */ super_90_validate(struct mddev * mddev,struct md_rdev * freshest,struct md_rdev * rdev)1281 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1282 { 1283 mdp_disk_t *desc; 1284 mdp_super_t *sb = page_address(rdev->sb_page); 1285 __u64 ev1 = md_event(sb); 1286 1287 rdev->raid_disk = -1; 1288 clear_bit(Faulty, &rdev->flags); 1289 clear_bit(In_sync, &rdev->flags); 1290 clear_bit(Bitmap_sync, &rdev->flags); 1291 clear_bit(WriteMostly, &rdev->flags); 1292 1293 if (mddev->raid_disks == 0) { 1294 mddev->major_version = 0; 1295 mddev->minor_version = sb->minor_version; 1296 mddev->patch_version = sb->patch_version; 1297 mddev->external = 0; 1298 mddev->chunk_sectors = sb->chunk_size >> 9; 1299 mddev->ctime = sb->ctime; 1300 mddev->utime = sb->utime; 1301 mddev->level = sb->level; 1302 mddev->clevel[0] = 0; 1303 mddev->layout = sb->layout; 1304 mddev->raid_disks = sb->raid_disks; 1305 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1306 mddev->events = ev1; 1307 mddev->bitmap_info.offset = 0; 1308 mddev->bitmap_info.space = 0; 1309 /* bitmap can use 60 K after the 4K superblocks */ 1310 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1311 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1312 mddev->reshape_backwards = 0; 1313 1314 if (mddev->minor_version >= 91) { 1315 mddev->reshape_position = sb->reshape_position; 1316 mddev->delta_disks = sb->delta_disks; 1317 mddev->new_level = sb->new_level; 1318 mddev->new_layout = sb->new_layout; 1319 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1320 if (mddev->delta_disks < 0) 1321 mddev->reshape_backwards = 1; 1322 } else { 1323 mddev->reshape_position = MaxSector; 1324 mddev->delta_disks = 0; 1325 mddev->new_level = mddev->level; 1326 mddev->new_layout = mddev->layout; 1327 mddev->new_chunk_sectors = mddev->chunk_sectors; 1328 } 1329 if (mddev->level == 0) 1330 mddev->layout = -1; 1331 1332 if (sb->state & (1<<MD_SB_CLEAN)) 1333 mddev->recovery_cp = MaxSector; 1334 else { 1335 if (sb->events_hi == sb->cp_events_hi && 1336 sb->events_lo == sb->cp_events_lo) { 1337 mddev->recovery_cp = sb->recovery_cp; 1338 } else 1339 mddev->recovery_cp = 0; 1340 } 1341 1342 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1343 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1344 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1345 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1346 1347 mddev->max_disks = MD_SB_DISKS; 1348 1349 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1350 mddev->bitmap_info.file == NULL) { 1351 mddev->bitmap_info.offset = 1352 mddev->bitmap_info.default_offset; 1353 mddev->bitmap_info.space = 1354 mddev->bitmap_info.default_space; 1355 } 1356 1357 } else if (mddev->pers == NULL) { 1358 /* Insist on good event counter while assembling, except 1359 * for spares (which don't need an event count) */ 1360 ++ev1; 1361 if 
(sb->disks[rdev->desc_nr].state & ( 1362 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1363 if (ev1 < mddev->events) 1364 return -EINVAL; 1365 } else if (mddev->bitmap) { 1366 /* if adding to array with a bitmap, then we can accept an 1367 * older device ... but not too old. 1368 */ 1369 if (ev1 < mddev->bitmap->events_cleared) 1370 return 0; 1371 if (ev1 < mddev->events) 1372 set_bit(Bitmap_sync, &rdev->flags); 1373 } else { 1374 if (ev1 < mddev->events) 1375 /* just a hot-add of a new device, leave raid_disk at -1 */ 1376 return 0; 1377 } 1378 1379 if (mddev->level != LEVEL_MULTIPATH) { 1380 desc = sb->disks + rdev->desc_nr; 1381 1382 if (desc->state & (1<<MD_DISK_FAULTY)) 1383 set_bit(Faulty, &rdev->flags); 1384 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1385 desc->raid_disk < mddev->raid_disks */) { 1386 set_bit(In_sync, &rdev->flags); 1387 rdev->raid_disk = desc->raid_disk; 1388 rdev->saved_raid_disk = desc->raid_disk; 1389 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1390 /* active but not in sync implies recovery up to 1391 * reshape position. We don't know exactly where 1392 * that is, so set to zero for now */ 1393 if (mddev->minor_version >= 91) { 1394 rdev->recovery_offset = 0; 1395 rdev->raid_disk = desc->raid_disk; 1396 } 1397 } 1398 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1399 set_bit(WriteMostly, &rdev->flags); 1400 if (desc->state & (1<<MD_DISK_FAILFAST)) 1401 set_bit(FailFast, &rdev->flags); 1402 } else /* MULTIPATH are always insync */ 1403 set_bit(In_sync, &rdev->flags); 1404 return 0; 1405 } 1406 1407 /* 1408 * sync_super for 0.90.0 1409 */ super_90_sync(struct mddev * mddev,struct md_rdev * rdev)1410 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1411 { 1412 mdp_super_t *sb; 1413 struct md_rdev *rdev2; 1414 int next_spare = mddev->raid_disks; 1415 1416 /* make rdev->sb match mddev data.. 1417 * 1418 * 1/ zero out disks 1419 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1420 * 3/ any empty disks < next_spare become removed 1421 * 1422 * disks[0] gets initialised to REMOVED because 1423 * we cannot be sure from other fields if it has 1424 * been initialised or not. 
1425 */ 1426 int i; 1427 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1428 1429 rdev->sb_size = MD_SB_BYTES; 1430 1431 sb = page_address(rdev->sb_page); 1432 1433 memset(sb, 0, sizeof(*sb)); 1434 1435 sb->md_magic = MD_SB_MAGIC; 1436 sb->major_version = mddev->major_version; 1437 sb->patch_version = mddev->patch_version; 1438 sb->gvalid_words = 0; /* ignored */ 1439 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1440 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1441 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1442 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1443 1444 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1445 sb->level = mddev->level; 1446 sb->size = mddev->dev_sectors / 2; 1447 sb->raid_disks = mddev->raid_disks; 1448 sb->md_minor = mddev->md_minor; 1449 sb->not_persistent = 0; 1450 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1451 sb->state = 0; 1452 sb->events_hi = (mddev->events>>32); 1453 sb->events_lo = (u32)mddev->events; 1454 1455 if (mddev->reshape_position == MaxSector) 1456 sb->minor_version = 90; 1457 else { 1458 sb->minor_version = 91; 1459 sb->reshape_position = mddev->reshape_position; 1460 sb->new_level = mddev->new_level; 1461 sb->delta_disks = mddev->delta_disks; 1462 sb->new_layout = mddev->new_layout; 1463 sb->new_chunk = mddev->new_chunk_sectors << 9; 1464 } 1465 mddev->minor_version = sb->minor_version; 1466 if (mddev->in_sync) 1467 { 1468 sb->recovery_cp = mddev->recovery_cp; 1469 sb->cp_events_hi = (mddev->events>>32); 1470 sb->cp_events_lo = (u32)mddev->events; 1471 if (mddev->recovery_cp == MaxSector) 1472 sb->state = (1<< MD_SB_CLEAN); 1473 } else 1474 sb->recovery_cp = 0; 1475 1476 sb->layout = mddev->layout; 1477 sb->chunk_size = mddev->chunk_sectors << 9; 1478 1479 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1480 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1481 1482 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1483 rdev_for_each(rdev2, mddev) { 1484 mdp_disk_t *d; 1485 int desc_nr; 1486 int is_active = test_bit(In_sync, &rdev2->flags); 1487 1488 if (rdev2->raid_disk >= 0 && 1489 sb->minor_version >= 91) 1490 /* we have nowhere to store the recovery_offset, 1491 * but if it is not below the reshape_position, 1492 * we can piggy-back on that. 
1493 */ 1494 is_active = 1; 1495 if (rdev2->raid_disk < 0 || 1496 test_bit(Faulty, &rdev2->flags)) 1497 is_active = 0; 1498 if (is_active) 1499 desc_nr = rdev2->raid_disk; 1500 else 1501 desc_nr = next_spare++; 1502 rdev2->desc_nr = desc_nr; 1503 d = &sb->disks[rdev2->desc_nr]; 1504 nr_disks++; 1505 d->number = rdev2->desc_nr; 1506 d->major = MAJOR(rdev2->bdev->bd_dev); 1507 d->minor = MINOR(rdev2->bdev->bd_dev); 1508 if (is_active) 1509 d->raid_disk = rdev2->raid_disk; 1510 else 1511 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1512 if (test_bit(Faulty, &rdev2->flags)) 1513 d->state = (1<<MD_DISK_FAULTY); 1514 else if (is_active) { 1515 d->state = (1<<MD_DISK_ACTIVE); 1516 if (test_bit(In_sync, &rdev2->flags)) 1517 d->state |= (1<<MD_DISK_SYNC); 1518 active++; 1519 working++; 1520 } else { 1521 d->state = 0; 1522 spare++; 1523 working++; 1524 } 1525 if (test_bit(WriteMostly, &rdev2->flags)) 1526 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1527 if (test_bit(FailFast, &rdev2->flags)) 1528 d->state |= (1<<MD_DISK_FAILFAST); 1529 } 1530 /* now set the "removed" and "faulty" bits on any missing devices */ 1531 for (i=0 ; i < mddev->raid_disks ; i++) { 1532 mdp_disk_t *d = &sb->disks[i]; 1533 if (d->state == 0 && d->number == 0) { 1534 d->number = i; 1535 d->raid_disk = i; 1536 d->state = (1<<MD_DISK_REMOVED); 1537 d->state |= (1<<MD_DISK_FAULTY); 1538 failed++; 1539 } 1540 } 1541 sb->nr_disks = nr_disks; 1542 sb->active_disks = active; 1543 sb->working_disks = working; 1544 sb->failed_disks = failed; 1545 sb->spare_disks = spare; 1546 1547 sb->this_disk = sb->disks[rdev->desc_nr]; 1548 sb->sb_csum = calc_sb_csum(sb); 1549 } 1550 1551 /* 1552 * rdev_size_change for 0.90.0 1553 */ 1554 static unsigned long long super_90_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)1555 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1556 { 1557 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1558 return 0; /* component must fit device */ 1559 if (rdev->mddev->bitmap_info.offset) 1560 return 0; /* can't move bitmap */ 1561 rdev->sb_start = calc_dev_sboffset(rdev); 1562 if (!num_sectors || num_sectors > rdev->sb_start) 1563 num_sectors = rdev->sb_start; 1564 /* Limit to 4TB as metadata cannot record more than that. 1565 * 4TB == 2^32 KB, or 2*2^32 sectors. 
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
1625 */ 1626 switch(minor_version) { 1627 case 0: 1628 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1629 sb_start &= ~(sector_t)(4*2-1); 1630 break; 1631 case 1: 1632 sb_start = 0; 1633 break; 1634 case 2: 1635 sb_start = 8; 1636 break; 1637 default: 1638 return -EINVAL; 1639 } 1640 rdev->sb_start = sb_start; 1641 1642 /* superblock is rarely larger than 1K, but it can be larger, 1643 * and it is safe to read 4k, so we do that 1644 */ 1645 ret = read_disk_sb(rdev, 4096); 1646 if (ret) return ret; 1647 1648 sb = page_address(rdev->sb_page); 1649 1650 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1651 sb->major_version != cpu_to_le32(1) || 1652 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1653 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1654 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1655 return -EINVAL; 1656 1657 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1658 pr_warn("md: invalid superblock checksum on %pg\n", 1659 rdev->bdev); 1660 return -EINVAL; 1661 } 1662 if (le64_to_cpu(sb->data_size) < 10) { 1663 pr_warn("md: data_size too small on %pg\n", 1664 rdev->bdev); 1665 return -EINVAL; 1666 } 1667 if (sb->pad0 || 1668 sb->pad3[0] || 1669 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1670 /* Some padding is non-zero, might be a new feature */ 1671 return -EINVAL; 1672 1673 rdev->preferred_minor = 0xffff; 1674 rdev->data_offset = le64_to_cpu(sb->data_offset); 1675 rdev->new_data_offset = rdev->data_offset; 1676 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1677 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1678 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1679 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1680 1681 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1682 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1683 if (rdev->sb_size & bmask) 1684 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1685 1686 if (minor_version 1687 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1688 return -EINVAL; 1689 if (minor_version 1690 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1691 return -EINVAL; 1692 1693 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1694 rdev->desc_nr = -1; 1695 else 1696 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1697 1698 if (!rdev->bb_page) { 1699 rdev->bb_page = alloc_page(GFP_KERNEL); 1700 if (!rdev->bb_page) 1701 return -ENOMEM; 1702 } 1703 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1704 rdev->badblocks.count == 0) { 1705 /* need to load the bad block list. 1706 * Currently we limit it to one page. 
1707 */ 1708 s32 offset; 1709 sector_t bb_sector; 1710 __le64 *bbp; 1711 int i; 1712 int sectors = le16_to_cpu(sb->bblog_size); 1713 if (sectors > (PAGE_SIZE / 512)) 1714 return -EINVAL; 1715 offset = le32_to_cpu(sb->bblog_offset); 1716 if (offset == 0) 1717 return -EINVAL; 1718 bb_sector = (long long)offset; 1719 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1720 rdev->bb_page, REQ_OP_READ, true)) 1721 return -EIO; 1722 bbp = (__le64 *)page_address(rdev->bb_page); 1723 rdev->badblocks.shift = sb->bblog_shift; 1724 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1725 u64 bb = le64_to_cpu(*bbp); 1726 int count = bb & (0x3ff); 1727 u64 sector = bb >> 10; 1728 sector <<= sb->bblog_shift; 1729 count <<= sb->bblog_shift; 1730 if (bb + 1 == 0) 1731 break; 1732 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1733 return -EINVAL; 1734 } 1735 } else if (sb->bblog_offset != 0) 1736 rdev->badblocks.shift = 0; 1737 1738 if ((le32_to_cpu(sb->feature_map) & 1739 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1740 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1741 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1742 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1743 } 1744 1745 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1746 sb->level != 0) 1747 return -EINVAL; 1748 1749 /* not spare disk, or LEVEL_MULTIPATH */ 1750 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || 1751 (rdev->desc_nr >= 0 && 1752 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1753 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1754 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) 1755 spare_disk = false; 1756 1757 if (!refdev) { 1758 if (!spare_disk) 1759 ret = 1; 1760 else 1761 ret = 0; 1762 } else { 1763 __u64 ev1, ev2; 1764 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1765 1766 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1767 sb->level != refsb->level || 1768 sb->layout != refsb->layout || 1769 sb->chunksize != refsb->chunksize) { 1770 pr_warn("md: %pg has strangely different superblock to %pg\n", 1771 rdev->bdev, 1772 refdev->bdev); 1773 return -EINVAL; 1774 } 1775 ev1 = le64_to_cpu(sb->events); 1776 ev2 = le64_to_cpu(refsb->events); 1777 1778 if (!spare_disk && ev1 > ev2) 1779 ret = 1; 1780 else 1781 ret = 0; 1782 } 1783 if (minor_version) 1784 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1785 else 1786 sectors = rdev->sb_start; 1787 if (sectors < le64_to_cpu(sb->data_size)) 1788 return -EINVAL; 1789 rdev->sectors = le64_to_cpu(sb->data_size); 1790 return ret; 1791 } 1792 super_1_validate(struct mddev * mddev,struct md_rdev * freshest,struct md_rdev * rdev)1793 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1794 { 1795 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1796 __u64 ev1 = le64_to_cpu(sb->events); 1797 1798 rdev->raid_disk = -1; 1799 clear_bit(Faulty, &rdev->flags); 1800 clear_bit(In_sync, &rdev->flags); 1801 clear_bit(Bitmap_sync, &rdev->flags); 1802 clear_bit(WriteMostly, &rdev->flags); 1803 1804 if (mddev->raid_disks == 0) { 1805 mddev->major_version = 1; 1806 mddev->patch_version = 0; 1807 mddev->external = 0; 1808 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1809 mddev->ctime = le64_to_cpu(sb->ctime); 1810 mddev->utime = le64_to_cpu(sb->utime); 1811 mddev->level = le32_to_cpu(sb->level); 1812 mddev->clevel[0] = 0; 1813 mddev->layout = le32_to_cpu(sb->layout); 1814 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1815 
mddev->dev_sectors = le64_to_cpu(sb->size); 1816 mddev->events = ev1; 1817 mddev->bitmap_info.offset = 0; 1818 mddev->bitmap_info.space = 0; 1819 /* Default location for bitmap is 1K after superblock 1820 * using 3K - total of 4K 1821 */ 1822 mddev->bitmap_info.default_offset = 1024 >> 9; 1823 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1824 mddev->reshape_backwards = 0; 1825 1826 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1827 memcpy(mddev->uuid, sb->set_uuid, 16); 1828 1829 mddev->max_disks = (4096-256)/2; 1830 1831 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1832 mddev->bitmap_info.file == NULL) { 1833 mddev->bitmap_info.offset = 1834 (__s32)le32_to_cpu(sb->bitmap_offset); 1835 /* Metadata doesn't record how much space is available. 1836 * For 1.0, we assume we can use up to the superblock 1837 * if before, else to 4K beyond superblock. 1838 * For others, assume no change is possible. 1839 */ 1840 if (mddev->minor_version > 0) 1841 mddev->bitmap_info.space = 0; 1842 else if (mddev->bitmap_info.offset > 0) 1843 mddev->bitmap_info.space = 1844 8 - mddev->bitmap_info.offset; 1845 else 1846 mddev->bitmap_info.space = 1847 -mddev->bitmap_info.offset; 1848 } 1849 1850 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1851 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1852 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1853 mddev->new_level = le32_to_cpu(sb->new_level); 1854 mddev->new_layout = le32_to_cpu(sb->new_layout); 1855 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1856 if (mddev->delta_disks < 0 || 1857 (mddev->delta_disks == 0 && 1858 (le32_to_cpu(sb->feature_map) 1859 & MD_FEATURE_RESHAPE_BACKWARDS))) 1860 mddev->reshape_backwards = 1; 1861 } else { 1862 mddev->reshape_position = MaxSector; 1863 mddev->delta_disks = 0; 1864 mddev->new_level = mddev->level; 1865 mddev->new_layout = mddev->layout; 1866 mddev->new_chunk_sectors = mddev->chunk_sectors; 1867 } 1868 1869 if (mddev->level == 0 && 1870 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1871 mddev->layout = -1; 1872 1873 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1874 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1875 1876 if (le32_to_cpu(sb->feature_map) & 1877 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1878 if (le32_to_cpu(sb->feature_map) & 1879 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1880 return -EINVAL; 1881 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1882 (le32_to_cpu(sb->feature_map) & 1883 MD_FEATURE_MULTIPLE_PPLS)) 1884 return -EINVAL; 1885 set_bit(MD_HAS_PPL, &mddev->flags); 1886 } 1887 } else if (mddev->pers == NULL) { 1888 /* Insist of good event counter while assembling, except for 1889 * spares (which don't need an event count). 1890 * Similar to mdadm, we allow event counter difference of 1 1891 * from the freshest device. 1892 */ 1893 if (rdev->desc_nr >= 0 && 1894 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1895 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1896 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1897 if (ev1 + 1 < mddev->events) 1898 return -EINVAL; 1899 } else if (mddev->bitmap) { 1900 /* If adding to array with a bitmap, then we can accept an 1901 * older device, but not too old. 
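 * Here "too old" means older than the bitmap's events_cleared: such a
 * device is treated like a plain hot-add (raid_disk stays -1), while a
 * device that is only behind mddev->events is accepted with Bitmap_sync
 * set, so recovery can rely on the write-intent bitmap rather than a
 * full resync.  Illustrative numbers: with events_cleared == 90 and
 * events == 100, an rdev at ev1 == 95 is taken back via bitmap-based
 * recovery, one at ev1 == 85 is not.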
1902 */ 1903 if (ev1 < mddev->bitmap->events_cleared) 1904 return 0; 1905 if (ev1 < mddev->events) 1906 set_bit(Bitmap_sync, &rdev->flags); 1907 } else { 1908 if (ev1 < mddev->events) 1909 /* just a hot-add of a new device, leave raid_disk at -1 */ 1910 return 0; 1911 } 1912 if (mddev->level != LEVEL_MULTIPATH) { 1913 int role; 1914 if (rdev->desc_nr < 0 || 1915 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1916 role = MD_DISK_ROLE_SPARE; 1917 rdev->desc_nr = -1; 1918 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 1919 /* 1920 * If we are assembling, and our event counter is smaller than the 1921 * highest event counter, we cannot trust our superblock about the role. 1922 * It could happen that our rdev was marked as Faulty, and all other 1923 * superblocks were updated with +1 event counter. 1924 * Then, before the next superblock update, which typically happens when 1925 * remove_and_add_spares() removes the device from the array, there was 1926 * a crash or reboot. 1927 * If we allow current rdev without consulting the freshest superblock, 1928 * we could cause data corruption. 1929 * Note that in this case our event counter is smaller by 1 than the 1930 * highest, otherwise, this rdev would not be allowed into array; 1931 * both kernel and mdadm allow event counter difference of 1. 1932 */ 1933 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 1934 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 1935 1936 if (rdev->desc_nr >= freshest_max_dev) { 1937 /* this is unexpected, better not proceed */ 1938 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 1939 mdname(mddev), rdev->bdev, rdev->desc_nr, 1940 freshest->bdev, freshest_max_dev); 1941 return -EUCLEAN; 1942 } 1943 1944 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 1945 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 1946 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 1947 } else { 1948 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1949 } 1950 switch(role) { 1951 case MD_DISK_ROLE_SPARE: /* spare */ 1952 break; 1953 case MD_DISK_ROLE_FAULTY: /* faulty */ 1954 set_bit(Faulty, &rdev->flags); 1955 break; 1956 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1957 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1958 /* journal device without journal feature */ 1959 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1960 return -EINVAL; 1961 } 1962 set_bit(Journal, &rdev->flags); 1963 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1964 rdev->raid_disk = 0; 1965 break; 1966 default: 1967 rdev->saved_raid_disk = role; 1968 if ((le32_to_cpu(sb->feature_map) & 1969 MD_FEATURE_RECOVERY_OFFSET)) { 1970 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1971 if (!(le32_to_cpu(sb->feature_map) & 1972 MD_FEATURE_RECOVERY_BITMAP)) 1973 rdev->saved_raid_disk = -1; 1974 } else { 1975 /* 1976 * If the array is FROZEN, then the device can't 1977 * be in_sync with rest of array. 
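 * Put another way, In_sync is only set below while MD_RECOVERY_FROZEN
 * is clear; with recovery frozen the device is conservatively left out
 * of sync rather than assumed current.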
1978 */ 1979 if (!test_bit(MD_RECOVERY_FROZEN, 1980 &mddev->recovery)) 1981 set_bit(In_sync, &rdev->flags); 1982 } 1983 rdev->raid_disk = role; 1984 break; 1985 } 1986 if (sb->devflags & WriteMostly1) 1987 set_bit(WriteMostly, &rdev->flags); 1988 if (sb->devflags & FailFast1) 1989 set_bit(FailFast, &rdev->flags); 1990 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1991 set_bit(Replacement, &rdev->flags); 1992 } else /* MULTIPATH are always insync */ 1993 set_bit(In_sync, &rdev->flags); 1994 1995 return 0; 1996 } 1997 super_1_sync(struct mddev * mddev,struct md_rdev * rdev)1998 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1999 { 2000 struct mdp_superblock_1 *sb; 2001 struct md_rdev *rdev2; 2002 int max_dev, i; 2003 /* make rdev->sb match mddev and rdev data. */ 2004 2005 sb = page_address(rdev->sb_page); 2006 2007 sb->feature_map = 0; 2008 sb->pad0 = 0; 2009 sb->recovery_offset = cpu_to_le64(0); 2010 memset(sb->pad3, 0, sizeof(sb->pad3)); 2011 2012 sb->utime = cpu_to_le64((__u64)mddev->utime); 2013 sb->events = cpu_to_le64(mddev->events); 2014 if (mddev->in_sync) 2015 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2016 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2017 sb->resync_offset = cpu_to_le64(MaxSector); 2018 else 2019 sb->resync_offset = cpu_to_le64(0); 2020 2021 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2022 2023 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2024 sb->size = cpu_to_le64(mddev->dev_sectors); 2025 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2026 sb->level = cpu_to_le32(mddev->level); 2027 sb->layout = cpu_to_le32(mddev->layout); 2028 if (test_bit(FailFast, &rdev->flags)) 2029 sb->devflags |= FailFast1; 2030 else 2031 sb->devflags &= ~FailFast1; 2032 2033 if (test_bit(WriteMostly, &rdev->flags)) 2034 sb->devflags |= WriteMostly1; 2035 else 2036 sb->devflags &= ~WriteMostly1; 2037 sb->data_offset = cpu_to_le64(rdev->data_offset); 2038 sb->data_size = cpu_to_le64(rdev->sectors); 2039 2040 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2041 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2042 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2043 } 2044 2045 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2046 !test_bit(In_sync, &rdev->flags)) { 2047 sb->feature_map |= 2048 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2049 sb->recovery_offset = 2050 cpu_to_le64(rdev->recovery_offset); 2051 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2052 sb->feature_map |= 2053 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2054 } 2055 /* Note: recovery_offset and journal_tail share space */ 2056 if (test_bit(Journal, &rdev->flags)) 2057 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2058 if (test_bit(Replacement, &rdev->flags)) 2059 sb->feature_map |= 2060 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2061 2062 if (mddev->reshape_position != MaxSector) { 2063 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2064 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2065 sb->new_layout = cpu_to_le32(mddev->new_layout); 2066 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2067 sb->new_level = cpu_to_le32(mddev->new_level); 2068 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2069 if (mddev->delta_disks == 0 && 2070 mddev->reshape_backwards) 2071 sb->feature_map 2072 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2073 if (rdev->new_data_offset != rdev->data_offset) { 2074 sb->feature_map 2075 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 
2076 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2077 - rdev->data_offset)); 2078 } 2079 } 2080 2081 if (mddev_is_clustered(mddev)) 2082 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2083 2084 if (rdev->badblocks.count == 0) 2085 /* Nothing to do for bad blocks*/ ; 2086 else if (sb->bblog_offset == 0) 2087 /* Cannot record bad blocks on this device */ 2088 md_error(mddev, rdev); 2089 else { 2090 struct badblocks *bb = &rdev->badblocks; 2091 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2092 u64 *p = bb->page; 2093 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2094 if (bb->changed) { 2095 unsigned seq; 2096 2097 retry: 2098 seq = read_seqbegin(&bb->lock); 2099 2100 memset(bbp, 0xff, PAGE_SIZE); 2101 2102 for (i = 0 ; i < bb->count ; i++) { 2103 u64 internal_bb = p[i]; 2104 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2105 | BB_LEN(internal_bb)); 2106 bbp[i] = cpu_to_le64(store_bb); 2107 } 2108 bb->changed = 0; 2109 if (read_seqretry(&bb->lock, seq)) 2110 goto retry; 2111 2112 bb->sector = (rdev->sb_start + 2113 (int)le32_to_cpu(sb->bblog_offset)); 2114 bb->size = le16_to_cpu(sb->bblog_size); 2115 } 2116 } 2117 2118 max_dev = 0; 2119 rdev_for_each(rdev2, mddev) 2120 if (rdev2->desc_nr+1 > max_dev) 2121 max_dev = rdev2->desc_nr+1; 2122 2123 if (max_dev > le32_to_cpu(sb->max_dev)) { 2124 int bmask; 2125 sb->max_dev = cpu_to_le32(max_dev); 2126 rdev->sb_size = max_dev * 2 + 256; 2127 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2128 if (rdev->sb_size & bmask) 2129 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2130 } else 2131 max_dev = le32_to_cpu(sb->max_dev); 2132 2133 for (i=0; i<max_dev;i++) 2134 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2135 2136 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2137 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2138 2139 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2140 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2141 sb->feature_map |= 2142 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2143 else 2144 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2145 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2146 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2147 } 2148 2149 rdev_for_each(rdev2, mddev) { 2150 i = rdev2->desc_nr; 2151 if (test_bit(Faulty, &rdev2->flags)) 2152 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2153 else if (test_bit(In_sync, &rdev2->flags)) 2154 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2155 else if (test_bit(Journal, &rdev2->flags)) 2156 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2157 else if (rdev2->raid_disk >= 0) 2158 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2159 else 2160 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2161 } 2162 2163 sb->sb_csum = calc_sb_1_csum(sb); 2164 } 2165 super_1_choose_bm_space(sector_t dev_size)2166 static sector_t super_1_choose_bm_space(sector_t dev_size) 2167 { 2168 sector_t bm_space; 2169 2170 /* if the device is bigger than 8Gig, save 64k for bitmap 2171 * usage, if bigger than 200Gig, save 128k 2172 */ 2173 if (dev_size < 64*2) 2174 bm_space = 0; 2175 else if (dev_size - 64*2 >= 200*1024*1024*2) 2176 bm_space = 128*2; 2177 else if (dev_size - 4*2 > 8*1024*1024*2) 2178 bm_space = 64*2; 2179 else 2180 bm_space = 4*2; 2181 return bm_space; 2182 } 2183 2184 static unsigned long long super_1_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)2185 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2186 { 2187 struct mdp_superblock_1 *sb; 2188 sector_t max_sectors; 2189 if 
(num_sectors && num_sectors < rdev->mddev->dev_sectors) 2190 return 0; /* component must fit device */ 2191 if (rdev->data_offset != rdev->new_data_offset) 2192 return 0; /* too confusing */ 2193 if (rdev->sb_start < rdev->data_offset) { 2194 /* minor versions 1 and 2; superblock before data */ 2195 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2196 if (!num_sectors || num_sectors > max_sectors) 2197 num_sectors = max_sectors; 2198 } else if (rdev->mddev->bitmap_info.offset) { 2199 /* minor version 0 with bitmap we can't move */ 2200 return 0; 2201 } else { 2202 /* minor version 0; superblock after data */ 2203 sector_t sb_start, bm_space; 2204 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2205 2206 /* 8K is for superblock */ 2207 sb_start = dev_size - 8*2; 2208 sb_start &= ~(sector_t)(4*2 - 1); 2209 2210 bm_space = super_1_choose_bm_space(dev_size); 2211 2212 /* Space that can be used to store date needs to decrease 2213 * superblock bitmap space and bad block space(4K) 2214 */ 2215 max_sectors = sb_start - bm_space - 4*2; 2216 2217 if (!num_sectors || num_sectors > max_sectors) 2218 num_sectors = max_sectors; 2219 rdev->sb_start = sb_start; 2220 } 2221 sb = page_address(rdev->sb_page); 2222 sb->data_size = cpu_to_le64(num_sectors); 2223 sb->super_offset = cpu_to_le64(rdev->sb_start); 2224 sb->sb_csum = calc_sb_1_csum(sb); 2225 do { 2226 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2227 rdev->sb_page); 2228 } while (md_super_wait(rdev->mddev) < 0); 2229 return num_sectors; 2230 2231 } 2232 2233 static int super_1_allow_new_offset(struct md_rdev * rdev,unsigned long long new_offset)2234 super_1_allow_new_offset(struct md_rdev *rdev, 2235 unsigned long long new_offset) 2236 { 2237 /* All necessary checks on new >= old have been done */ 2238 struct bitmap *bitmap; 2239 if (new_offset >= rdev->data_offset) 2240 return 1; 2241 2242 /* with 1.0 metadata, there is no metadata to tread on 2243 * so we can always move back */ 2244 if (rdev->mddev->minor_version == 0) 2245 return 1; 2246 2247 /* otherwise we must be sure not to step on 2248 * any metadata, so stay: 2249 * 36K beyond start of superblock 2250 * beyond end of badblocks 2251 * beyond write-intent bitmap 2252 */ 2253 if (rdev->sb_start + (32+4)*2 > new_offset) 2254 return 0; 2255 bitmap = rdev->mddev->bitmap; 2256 if (bitmap && !rdev->mddev->bitmap_info.file && 2257 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2258 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2259 return 0; 2260 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2261 return 0; 2262 2263 return 1; 2264 } 2265 2266 static struct super_type super_types[] = { 2267 [0] = { 2268 .name = "0.90.0", 2269 .owner = THIS_MODULE, 2270 .load_super = super_90_load, 2271 .validate_super = super_90_validate, 2272 .sync_super = super_90_sync, 2273 .rdev_size_change = super_90_rdev_size_change, 2274 .allow_new_offset = super_90_allow_new_offset, 2275 }, 2276 [1] = { 2277 .name = "md-1", 2278 .owner = THIS_MODULE, 2279 .load_super = super_1_load, 2280 .validate_super = super_1_validate, 2281 .sync_super = super_1_sync, 2282 .rdev_size_change = super_1_rdev_size_change, 2283 .allow_new_offset = super_1_allow_new_offset, 2284 }, 2285 }; 2286 sync_super(struct mddev * mddev,struct md_rdev * rdev)2287 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2288 { 2289 if (mddev->sync_super) { 2290 mddev->sync_super(mddev, rdev); 2291 return; 2292 } 2293 2294 BUG_ON(mddev->major_version >= 
ARRAY_SIZE(super_types)); 2295 2296 super_types[mddev->major_version].sync_super(mddev, rdev); 2297 } 2298 match_mddev_units(struct mddev * mddev1,struct mddev * mddev2)2299 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2300 { 2301 struct md_rdev *rdev, *rdev2; 2302 2303 rcu_read_lock(); 2304 rdev_for_each_rcu(rdev, mddev1) { 2305 if (test_bit(Faulty, &rdev->flags) || 2306 test_bit(Journal, &rdev->flags) || 2307 rdev->raid_disk == -1) 2308 continue; 2309 rdev_for_each_rcu(rdev2, mddev2) { 2310 if (test_bit(Faulty, &rdev2->flags) || 2311 test_bit(Journal, &rdev2->flags) || 2312 rdev2->raid_disk == -1) 2313 continue; 2314 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2315 rcu_read_unlock(); 2316 return 1; 2317 } 2318 } 2319 } 2320 rcu_read_unlock(); 2321 return 0; 2322 } 2323 2324 static LIST_HEAD(pending_raid_disks); 2325 2326 /* 2327 * Try to register data integrity profile for an mddev 2328 * 2329 * This is called when an array is started and after a disk has been kicked 2330 * from the array. It only succeeds if all working and active component devices 2331 * are integrity capable with matching profiles. 2332 */ md_integrity_register(struct mddev * mddev)2333 int md_integrity_register(struct mddev *mddev) 2334 { 2335 struct md_rdev *rdev, *reference = NULL; 2336 2337 if (list_empty(&mddev->disks)) 2338 return 0; /* nothing to do */ 2339 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2340 return 0; /* shouldn't register, or already is */ 2341 rdev_for_each(rdev, mddev) { 2342 /* skip spares and non-functional disks */ 2343 if (test_bit(Faulty, &rdev->flags)) 2344 continue; 2345 if (rdev->raid_disk < 0) 2346 continue; 2347 if (!reference) { 2348 /* Use the first rdev as the reference */ 2349 reference = rdev; 2350 continue; 2351 } 2352 /* does this rdev's profile match the reference profile? */ 2353 if (blk_integrity_compare(reference->bdev->bd_disk, 2354 rdev->bdev->bd_disk) < 0) 2355 return -EINVAL; 2356 } 2357 if (!reference || !bdev_get_integrity(reference->bdev)) 2358 return 0; 2359 /* 2360 * All component devices are integrity capable and have matching 2361 * profiles, register the common profile for the md device. 2362 */ 2363 blk_integrity_register(mddev->gendisk, 2364 bdev_get_integrity(reference->bdev)); 2365 2366 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2367 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2368 (mddev->level != 1 && mddev->level != 10 && 2369 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2370 /* 2371 * No need to handle the failure of bioset_integrity_create, 2372 * because the function is called by md_run() -> pers->run(), 2373 * md_run calls bioset_exit -> bioset_integrity_free in case 2374 * of failure case. 
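 * In short: returning -EINVAL here is enough, because md_run() unwinds
 * through bioset_exit(), which also releases any integrity pool that
 * was created, so this error path needs no explicit cleanup of its own.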
2375 */ 2376 pr_err("md: failed to create integrity pool for %s\n", 2377 mdname(mddev)); 2378 return -EINVAL; 2379 } 2380 return 0; 2381 } 2382 EXPORT_SYMBOL(md_integrity_register); 2383 2384 /* 2385 * Attempt to add an rdev, but only if it is consistent with the current 2386 * integrity profile 2387 */ md_integrity_add_rdev(struct md_rdev * rdev,struct mddev * mddev)2388 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2389 { 2390 struct blk_integrity *bi_mddev; 2391 2392 if (!mddev->gendisk) 2393 return 0; 2394 2395 bi_mddev = blk_get_integrity(mddev->gendisk); 2396 2397 if (!bi_mddev) /* nothing to do */ 2398 return 0; 2399 2400 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2401 pr_err("%s: incompatible integrity profile for %pg\n", 2402 mdname(mddev), rdev->bdev); 2403 return -ENXIO; 2404 } 2405 2406 return 0; 2407 } 2408 EXPORT_SYMBOL(md_integrity_add_rdev); 2409 rdev_read_only(struct md_rdev * rdev)2410 static bool rdev_read_only(struct md_rdev *rdev) 2411 { 2412 return bdev_read_only(rdev->bdev) || 2413 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2414 } 2415 bind_rdev_to_array(struct md_rdev * rdev,struct mddev * mddev)2416 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2417 { 2418 char b[BDEVNAME_SIZE]; 2419 int err; 2420 2421 /* prevent duplicates */ 2422 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2423 return -EEXIST; 2424 2425 if (rdev_read_only(rdev) && mddev->pers) 2426 return -EROFS; 2427 2428 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2429 if (!test_bit(Journal, &rdev->flags) && 2430 rdev->sectors && 2431 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2432 if (mddev->pers) { 2433 /* Cannot change size, so fail 2434 * If mddev->level <= 0, then we don't care 2435 * about aligning sizes (e.g. linear) 2436 */ 2437 if (mddev->level > 0) 2438 return -ENOSPC; 2439 } else 2440 mddev->dev_sectors = rdev->sectors; 2441 } 2442 2443 /* Verify rdev->desc_nr is unique. 
2444 * If it is -1, assign a free number, else 2445 * check number is not in use 2446 */ 2447 rcu_read_lock(); 2448 if (rdev->desc_nr < 0) { 2449 int choice = 0; 2450 if (mddev->pers) 2451 choice = mddev->raid_disks; 2452 while (md_find_rdev_nr_rcu(mddev, choice)) 2453 choice++; 2454 rdev->desc_nr = choice; 2455 } else { 2456 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2457 rcu_read_unlock(); 2458 return -EBUSY; 2459 } 2460 } 2461 rcu_read_unlock(); 2462 if (!test_bit(Journal, &rdev->flags) && 2463 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2464 pr_warn("md: %s: array is limited to %d devices\n", 2465 mdname(mddev), mddev->max_disks); 2466 return -EBUSY; 2467 } 2468 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2469 strreplace(b, '/', '!'); 2470 2471 rdev->mddev = mddev; 2472 pr_debug("md: bind<%s>\n", b); 2473 2474 if (mddev->raid_disks) 2475 mddev_create_serial_pool(mddev, rdev, false); 2476 2477 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2478 goto fail; 2479 2480 /* failure here is OK */ 2481 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2482 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2483 rdev->sysfs_unack_badblocks = 2484 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2485 rdev->sysfs_badblocks = 2486 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2487 2488 list_add_rcu(&rdev->same_set, &mddev->disks); 2489 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2490 2491 /* May as well allow recovery to be retried once */ 2492 mddev->recovery_disabled++; 2493 2494 return 0; 2495 2496 fail: 2497 pr_warn("md: failed to register dev-%s for %s\n", 2498 b, mdname(mddev)); 2499 mddev_destroy_serial_pool(mddev, rdev, false); 2500 return err; 2501 } 2502 2503 void md_autodetect_dev(dev_t dev); 2504 2505 /* just for claiming the bdev */ 2506 static struct md_rdev claim_rdev; 2507 export_rdev(struct md_rdev * rdev,struct mddev * mddev)2508 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2509 { 2510 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2511 md_rdev_clear(rdev); 2512 #ifndef MODULE 2513 if (test_bit(AutoDetected, &rdev->flags)) 2514 md_autodetect_dev(rdev->bdev->bd_dev); 2515 #endif 2516 blkdev_put(rdev->bdev, 2517 test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); 2518 rdev->bdev = NULL; 2519 kobject_put(&rdev->kobj); 2520 } 2521 md_kick_rdev_from_array(struct md_rdev * rdev)2522 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2523 { 2524 struct mddev *mddev = rdev->mddev; 2525 2526 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2527 list_del_rcu(&rdev->same_set); 2528 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2529 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 2530 rdev->mddev = NULL; 2531 sysfs_remove_link(&rdev->kobj, "block"); 2532 sysfs_put(rdev->sysfs_state); 2533 sysfs_put(rdev->sysfs_unack_badblocks); 2534 sysfs_put(rdev->sysfs_badblocks); 2535 rdev->sysfs_state = NULL; 2536 rdev->sysfs_unack_badblocks = NULL; 2537 rdev->sysfs_badblocks = NULL; 2538 rdev->badblocks.count = 0; 2539 2540 synchronize_rcu(); 2541 2542 /* 2543 * kobject_del() will wait for all in progress writers to be done, where 2544 * reconfig_mutex is held, hence it can't be called under 2545 * reconfig_mutex and it's delayed to mddev_unlock(). 
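 * Hence the rdev is only queued on mddev->deleting below; the final
 * export and kobject_put() happen once reconfig_mutex has been dropped
 * in mddev_unlock().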
2546 */ 2547 list_add(&rdev->same_set, &mddev->deleting); 2548 } 2549 export_array(struct mddev * mddev)2550 static void export_array(struct mddev *mddev) 2551 { 2552 struct md_rdev *rdev; 2553 2554 while (!list_empty(&mddev->disks)) { 2555 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2556 same_set); 2557 md_kick_rdev_from_array(rdev); 2558 } 2559 mddev->raid_disks = 0; 2560 mddev->major_version = 0; 2561 } 2562 set_in_sync(struct mddev * mddev)2563 static bool set_in_sync(struct mddev *mddev) 2564 { 2565 lockdep_assert_held(&mddev->lock); 2566 if (!mddev->in_sync) { 2567 mddev->sync_checkers++; 2568 spin_unlock(&mddev->lock); 2569 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2570 spin_lock(&mddev->lock); 2571 if (!mddev->in_sync && 2572 percpu_ref_is_zero(&mddev->writes_pending)) { 2573 mddev->in_sync = 1; 2574 /* 2575 * Ensure ->in_sync is visible before we clear 2576 * ->sync_checkers. 2577 */ 2578 smp_mb(); 2579 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2580 sysfs_notify_dirent_safe(mddev->sysfs_state); 2581 } 2582 if (--mddev->sync_checkers == 0) 2583 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2584 } 2585 if (mddev->safemode == 1) 2586 mddev->safemode = 0; 2587 return mddev->in_sync; 2588 } 2589 sync_sbs(struct mddev * mddev,int nospares)2590 static void sync_sbs(struct mddev *mddev, int nospares) 2591 { 2592 /* Update each superblock (in-memory image), but 2593 * if we are allowed to, skip spares which already 2594 * have the right event counter, or have one earlier 2595 * (which would mean they aren't being marked as dirty 2596 * with the rest of the array) 2597 */ 2598 struct md_rdev *rdev; 2599 rdev_for_each(rdev, mddev) { 2600 if (rdev->sb_events == mddev->events || 2601 (nospares && 2602 rdev->raid_disk < 0 && 2603 rdev->sb_events+1 == mddev->events)) { 2604 /* Don't update this superblock */ 2605 rdev->sb_loaded = 2; 2606 } else { 2607 sync_super(mddev, rdev); 2608 rdev->sb_loaded = 1; 2609 } 2610 } 2611 } 2612 does_sb_need_changing(struct mddev * mddev)2613 static bool does_sb_need_changing(struct mddev *mddev) 2614 { 2615 struct md_rdev *rdev = NULL, *iter; 2616 struct mdp_superblock_1 *sb; 2617 int role; 2618 2619 /* Find a good rdev */ 2620 rdev_for_each(iter, mddev) 2621 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2622 rdev = iter; 2623 break; 2624 } 2625 2626 /* No good device found. */ 2627 if (!rdev) 2628 return false; 2629 2630 sb = page_address(rdev->sb_page); 2631 /* Check if a device has become faulty or a spare become active */ 2632 rdev_for_each(rdev, mddev) { 2633 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2634 /* Device activated? */ 2635 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2636 !test_bit(Faulty, &rdev->flags)) 2637 return true; 2638 /* Device turned faulty? 
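 * i.e. the stored role still looks active (role < MD_DISK_ROLE_MAX)
 * while the in-memory rdev is already marked Faulty, so the on-disk
 * superblock is stale and needs rewriting.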
*/ 2639 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2640 return true; 2641 } 2642 2643 /* Check if any mddev parameters have changed */ 2644 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2645 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2646 (mddev->layout != le32_to_cpu(sb->layout)) || 2647 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2648 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2649 return true; 2650 2651 return false; 2652 } 2653 md_update_sb(struct mddev * mddev,int force_change)2654 void md_update_sb(struct mddev *mddev, int force_change) 2655 { 2656 struct md_rdev *rdev; 2657 int sync_req; 2658 int nospares = 0; 2659 int any_badblocks_changed = 0; 2660 int ret = -1; 2661 2662 if (!md_is_rdwr(mddev)) { 2663 if (force_change) 2664 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2665 return; 2666 } 2667 2668 repeat: 2669 if (mddev_is_clustered(mddev)) { 2670 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2671 force_change = 1; 2672 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2673 nospares = 1; 2674 ret = md_cluster_ops->metadata_update_start(mddev); 2675 /* Has someone else has updated the sb */ 2676 if (!does_sb_need_changing(mddev)) { 2677 if (ret == 0) 2678 md_cluster_ops->metadata_update_cancel(mddev); 2679 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2680 BIT(MD_SB_CHANGE_DEVS) | 2681 BIT(MD_SB_CHANGE_CLEAN)); 2682 return; 2683 } 2684 } 2685 2686 /* 2687 * First make sure individual recovery_offsets are correct 2688 * curr_resync_completed can only be used during recovery. 2689 * During reshape/resync it might use array-addresses rather 2690 * that device addresses. 2691 */ 2692 rdev_for_each(rdev, mddev) { 2693 if (rdev->raid_disk >= 0 && 2694 mddev->delta_disks >= 0 && 2695 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2696 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2697 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2698 !test_bit(Journal, &rdev->flags) && 2699 !test_bit(In_sync, &rdev->flags) && 2700 mddev->curr_resync_completed > rdev->recovery_offset) 2701 rdev->recovery_offset = mddev->curr_resync_completed; 2702 2703 } 2704 if (!mddev->persistent) { 2705 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2706 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2707 if (!mddev->external) { 2708 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2709 rdev_for_each(rdev, mddev) { 2710 if (rdev->badblocks.changed) { 2711 rdev->badblocks.changed = 0; 2712 ack_all_badblocks(&rdev->badblocks); 2713 md_error(mddev, rdev); 2714 } 2715 clear_bit(Blocked, &rdev->flags); 2716 clear_bit(BlockedBadBlocks, &rdev->flags); 2717 wake_up(&rdev->blocked_wait); 2718 } 2719 } 2720 wake_up(&mddev->sb_wait); 2721 return; 2722 } 2723 2724 spin_lock(&mddev->lock); 2725 2726 mddev->utime = ktime_get_real_seconds(); 2727 2728 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2729 force_change = 1; 2730 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2731 /* just a clean<-> dirty transition, possibly leave spares alone, 2732 * though if events isn't the right even/odd, we will have to do 2733 * spares after all 2734 */ 2735 nospares = 1; 2736 if (force_change) 2737 nospares = 0; 2738 if (mddev->degraded) 2739 /* If the array is degraded, then skipping spares is both 2740 * dangerous and fairly pointless. 
2741 * Dangerous because a device that was removed from the array 2742 * might have a event_count that still looks up-to-date, 2743 * so it can be re-added without a resync. 2744 * Pointless because if there are any spares to skip, 2745 * then a recovery will happen and soon that array won't 2746 * be degraded any more and the spare can go back to sleep then. 2747 */ 2748 nospares = 0; 2749 2750 sync_req = mddev->in_sync; 2751 2752 /* If this is just a dirty<->clean transition, and the array is clean 2753 * and 'events' is odd, we can roll back to the previous clean state */ 2754 if (nospares 2755 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2756 && mddev->can_decrease_events 2757 && mddev->events != 1) { 2758 mddev->events--; 2759 mddev->can_decrease_events = 0; 2760 } else { 2761 /* otherwise we have to go forward and ... */ 2762 mddev->events ++; 2763 mddev->can_decrease_events = nospares; 2764 } 2765 2766 /* 2767 * This 64-bit counter should never wrap. 2768 * Either we are in around ~1 trillion A.C., assuming 2769 * 1 reboot per second, or we have a bug... 2770 */ 2771 WARN_ON(mddev->events == 0); 2772 2773 rdev_for_each(rdev, mddev) { 2774 if (rdev->badblocks.changed) 2775 any_badblocks_changed++; 2776 if (test_bit(Faulty, &rdev->flags)) 2777 set_bit(FaultRecorded, &rdev->flags); 2778 } 2779 2780 sync_sbs(mddev, nospares); 2781 spin_unlock(&mddev->lock); 2782 2783 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2784 mdname(mddev), mddev->in_sync); 2785 2786 if (mddev->queue) 2787 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2788 rewrite: 2789 md_bitmap_update_sb(mddev->bitmap); 2790 rdev_for_each(rdev, mddev) { 2791 if (rdev->sb_loaded != 1) 2792 continue; /* no noise on spare devices */ 2793 2794 if (!test_bit(Faulty, &rdev->flags)) { 2795 md_super_write(mddev,rdev, 2796 rdev->sb_start, rdev->sb_size, 2797 rdev->sb_page); 2798 pr_debug("md: (write) %pg's sb offset: %llu\n", 2799 rdev->bdev, 2800 (unsigned long long)rdev->sb_start); 2801 rdev->sb_events = mddev->events; 2802 if (rdev->badblocks.size) { 2803 md_super_write(mddev, rdev, 2804 rdev->badblocks.sector, 2805 rdev->badblocks.size << 9, 2806 rdev->bb_page); 2807 rdev->badblocks.size = 0; 2808 } 2809 2810 } else 2811 pr_debug("md: %pg (skipping faulty)\n", 2812 rdev->bdev); 2813 2814 if (mddev->level == LEVEL_MULTIPATH) 2815 /* only need to write one superblock... 
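 * (for MULTIPATH the members are just alternate paths to the same
 * underlying device, so one successful write is assumed to be enough)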
*/ 2816 break; 2817 } 2818 if (md_super_wait(mddev) < 0) 2819 goto rewrite; 2820 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2821 2822 if (mddev_is_clustered(mddev) && ret == 0) 2823 md_cluster_ops->metadata_update_finish(mddev); 2824 2825 if (mddev->in_sync != sync_req || 2826 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2827 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2828 /* have to write it out again */ 2829 goto repeat; 2830 wake_up(&mddev->sb_wait); 2831 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2832 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2833 2834 rdev_for_each(rdev, mddev) { 2835 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2836 clear_bit(Blocked, &rdev->flags); 2837 2838 if (any_badblocks_changed) 2839 ack_all_badblocks(&rdev->badblocks); 2840 clear_bit(BlockedBadBlocks, &rdev->flags); 2841 wake_up(&rdev->blocked_wait); 2842 } 2843 } 2844 EXPORT_SYMBOL(md_update_sb); 2845 add_bound_rdev(struct md_rdev * rdev)2846 static int add_bound_rdev(struct md_rdev *rdev) 2847 { 2848 struct mddev *mddev = rdev->mddev; 2849 int err = 0; 2850 bool add_journal = test_bit(Journal, &rdev->flags); 2851 2852 if (!mddev->pers->hot_remove_disk || add_journal) { 2853 /* If there is hot_add_disk but no hot_remove_disk 2854 * then added disks for geometry changes, 2855 * and should be added immediately. 2856 */ 2857 super_types[mddev->major_version]. 2858 validate_super(mddev, NULL/*freshest*/, rdev); 2859 if (add_journal) 2860 mddev_suspend(mddev); 2861 err = mddev->pers->hot_add_disk(mddev, rdev); 2862 if (add_journal) 2863 mddev_resume(mddev); 2864 if (err) { 2865 md_kick_rdev_from_array(rdev); 2866 return err; 2867 } 2868 } 2869 sysfs_notify_dirent_safe(rdev->sysfs_state); 2870 2871 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2872 if (mddev->degraded) 2873 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2874 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2875 md_new_event(); 2876 md_wakeup_thread(mddev->thread); 2877 return 0; 2878 } 2879 2880 /* words written to sysfs files may, or may not, be \n terminated. 2881 * We want to accept with case. For this we use cmd_match. 2882 */ cmd_match(const char * cmd,const char * str)2883 static int cmd_match(const char *cmd, const char *str) 2884 { 2885 /* See if cmd, written into a sysfs file, matches 2886 * str. 
They must either be the same, or cmd can 2887 * have a trailing newline 2888 */ 2889 while (*cmd && *str && *cmd == *str) { 2890 cmd++; 2891 str++; 2892 } 2893 if (*cmd == '\n') 2894 cmd++; 2895 if (*str || *cmd) 2896 return 0; 2897 return 1; 2898 } 2899 2900 struct rdev_sysfs_entry { 2901 struct attribute attr; 2902 ssize_t (*show)(struct md_rdev *, char *); 2903 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2904 }; 2905 2906 static ssize_t state_show(struct md_rdev * rdev,char * page)2907 state_show(struct md_rdev *rdev, char *page) 2908 { 2909 char *sep = ","; 2910 size_t len = 0; 2911 unsigned long flags = READ_ONCE(rdev->flags); 2912 2913 if (test_bit(Faulty, &flags) || 2914 (!test_bit(ExternalBbl, &flags) && 2915 rdev->badblocks.unacked_exist)) 2916 len += sprintf(page+len, "faulty%s", sep); 2917 if (test_bit(In_sync, &flags)) 2918 len += sprintf(page+len, "in_sync%s", sep); 2919 if (test_bit(Journal, &flags)) 2920 len += sprintf(page+len, "journal%s", sep); 2921 if (test_bit(WriteMostly, &flags)) 2922 len += sprintf(page+len, "write_mostly%s", sep); 2923 if (test_bit(Blocked, &flags) || 2924 (rdev->badblocks.unacked_exist 2925 && !test_bit(Faulty, &flags))) 2926 len += sprintf(page+len, "blocked%s", sep); 2927 if (!test_bit(Faulty, &flags) && 2928 !test_bit(Journal, &flags) && 2929 !test_bit(In_sync, &flags)) 2930 len += sprintf(page+len, "spare%s", sep); 2931 if (test_bit(WriteErrorSeen, &flags)) 2932 len += sprintf(page+len, "write_error%s", sep); 2933 if (test_bit(WantReplacement, &flags)) 2934 len += sprintf(page+len, "want_replacement%s", sep); 2935 if (test_bit(Replacement, &flags)) 2936 len += sprintf(page+len, "replacement%s", sep); 2937 if (test_bit(ExternalBbl, &flags)) 2938 len += sprintf(page+len, "external_bbl%s", sep); 2939 if (test_bit(FailFast, &flags)) 2940 len += sprintf(page+len, "failfast%s", sep); 2941 2942 if (len) 2943 len -= strlen(sep); 2944 2945 return len+sprintf(page+len, "\n"); 2946 } 2947 2948 static ssize_t state_store(struct md_rdev * rdev,const char * buf,size_t len)2949 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2950 { 2951 /* can write 2952 * faulty - simulates an error 2953 * remove - disconnects the device 2954 * writemostly - sets write_mostly 2955 * -writemostly - clears write_mostly 2956 * blocked - sets the Blocked flags 2957 * -blocked - clears the Blocked and possibly simulates an error 2958 * insync - sets Insync providing device isn't active 2959 * -insync - clear Insync for a device with a slot assigned, 2960 * so that it gets rebuilt based on bitmap 2961 * write_error - sets WriteErrorSeen 2962 * -write_error - clears WriteErrorSeen 2963 * {,-}failfast - set/clear FailFast 2964 */ 2965 2966 struct mddev *mddev = rdev->mddev; 2967 int err = -EINVAL; 2968 bool need_update_sb = false; 2969 2970 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2971 md_error(rdev->mddev, rdev); 2972 2973 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2974 err = -EBUSY; 2975 else 2976 err = 0; 2977 } else if (cmd_match(buf, "remove")) { 2978 if (rdev->mddev->pers) { 2979 clear_bit(Blocked, &rdev->flags); 2980 remove_and_add_spares(rdev->mddev, rdev); 2981 } 2982 if (rdev->raid_disk >= 0) 2983 err = -EBUSY; 2984 else { 2985 err = 0; 2986 if (mddev_is_clustered(mddev)) 2987 err = md_cluster_ops->remove_disk(mddev, rdev); 2988 2989 if (err == 0) { 2990 md_kick_rdev_from_array(rdev); 2991 if (mddev->pers) { 2992 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2993 md_wakeup_thread(mddev->thread); 2994 } 2995 md_new_event(); 2996 } 
2997 } 2998 } else if (cmd_match(buf, "writemostly")) { 2999 set_bit(WriteMostly, &rdev->flags); 3000 mddev_create_serial_pool(rdev->mddev, rdev, false); 3001 need_update_sb = true; 3002 err = 0; 3003 } else if (cmd_match(buf, "-writemostly")) { 3004 mddev_destroy_serial_pool(rdev->mddev, rdev, false); 3005 clear_bit(WriteMostly, &rdev->flags); 3006 need_update_sb = true; 3007 err = 0; 3008 } else if (cmd_match(buf, "blocked")) { 3009 set_bit(Blocked, &rdev->flags); 3010 err = 0; 3011 } else if (cmd_match(buf, "-blocked")) { 3012 if (!test_bit(Faulty, &rdev->flags) && 3013 !test_bit(ExternalBbl, &rdev->flags) && 3014 rdev->badblocks.unacked_exist) { 3015 /* metadata handler doesn't understand badblocks, 3016 * so we need to fail the device 3017 */ 3018 md_error(rdev->mddev, rdev); 3019 } 3020 clear_bit(Blocked, &rdev->flags); 3021 clear_bit(BlockedBadBlocks, &rdev->flags); 3022 wake_up(&rdev->blocked_wait); 3023 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3024 md_wakeup_thread(rdev->mddev->thread); 3025 3026 err = 0; 3027 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3028 set_bit(In_sync, &rdev->flags); 3029 err = 0; 3030 } else if (cmd_match(buf, "failfast")) { 3031 set_bit(FailFast, &rdev->flags); 3032 need_update_sb = true; 3033 err = 0; 3034 } else if (cmd_match(buf, "-failfast")) { 3035 clear_bit(FailFast, &rdev->flags); 3036 need_update_sb = true; 3037 err = 0; 3038 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3039 !test_bit(Journal, &rdev->flags)) { 3040 if (rdev->mddev->pers == NULL) { 3041 clear_bit(In_sync, &rdev->flags); 3042 rdev->saved_raid_disk = rdev->raid_disk; 3043 rdev->raid_disk = -1; 3044 err = 0; 3045 } 3046 } else if (cmd_match(buf, "write_error")) { 3047 set_bit(WriteErrorSeen, &rdev->flags); 3048 err = 0; 3049 } else if (cmd_match(buf, "-write_error")) { 3050 clear_bit(WriteErrorSeen, &rdev->flags); 3051 err = 0; 3052 } else if (cmd_match(buf, "want_replacement")) { 3053 /* Any non-spare device that is not a replacement can 3054 * become want_replacement at any time, but we then need to 3055 * check if recovery is needed. 3056 */ 3057 if (rdev->raid_disk >= 0 && 3058 !test_bit(Journal, &rdev->flags) && 3059 !test_bit(Replacement, &rdev->flags)) 3060 set_bit(WantReplacement, &rdev->flags); 3061 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3062 md_wakeup_thread(rdev->mddev->thread); 3063 err = 0; 3064 } else if (cmd_match(buf, "-want_replacement")) { 3065 /* Clearing 'want_replacement' is always allowed. 3066 * Once replacements starts it is too late though. 3067 */ 3068 err = 0; 3069 clear_bit(WantReplacement, &rdev->flags); 3070 } else if (cmd_match(buf, "replacement")) { 3071 /* Can only set a device as a replacement when array has not 3072 * yet been started. Once running, replacement is automatic 3073 * from spares, or by assigning 'slot'. 3074 */ 3075 if (rdev->mddev->pers) 3076 err = -EBUSY; 3077 else { 3078 set_bit(Replacement, &rdev->flags); 3079 err = 0; 3080 } 3081 } else if (cmd_match(buf, "-replacement")) { 3082 /* Similarly, can only clear Replacement before start */ 3083 if (rdev->mddev->pers) 3084 err = -EBUSY; 3085 else { 3086 clear_bit(Replacement, &rdev->flags); 3087 err = 0; 3088 } 3089 } else if (cmd_match(buf, "re-add")) { 3090 if (!rdev->mddev->pers) 3091 err = -EINVAL; 3092 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3093 rdev->saved_raid_disk >= 0) { 3094 /* clear_bit is performed _after_ all the devices 3095 * have their local Faulty bit cleared. 
If any writes 3096 * happen in the meantime in the local node, they 3097 * will land in the local bitmap, which will be synced 3098 * by this node eventually 3099 */ 3100 if (!mddev_is_clustered(rdev->mddev) || 3101 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3102 clear_bit(Faulty, &rdev->flags); 3103 err = add_bound_rdev(rdev); 3104 } 3105 } else 3106 err = -EBUSY; 3107 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3108 set_bit(ExternalBbl, &rdev->flags); 3109 rdev->badblocks.shift = 0; 3110 err = 0; 3111 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3112 clear_bit(ExternalBbl, &rdev->flags); 3113 err = 0; 3114 } 3115 if (need_update_sb) 3116 md_update_sb(mddev, 1); 3117 if (!err) 3118 sysfs_notify_dirent_safe(rdev->sysfs_state); 3119 return err ? err : len; 3120 } 3121 static struct rdev_sysfs_entry rdev_state = 3122 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3123 3124 static ssize_t errors_show(struct md_rdev * rdev,char * page)3125 errors_show(struct md_rdev *rdev, char *page) 3126 { 3127 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3128 } 3129 3130 static ssize_t errors_store(struct md_rdev * rdev,const char * buf,size_t len)3131 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3132 { 3133 unsigned int n; 3134 int rv; 3135 3136 rv = kstrtouint(buf, 10, &n); 3137 if (rv < 0) 3138 return rv; 3139 atomic_set(&rdev->corrected_errors, n); 3140 return len; 3141 } 3142 static struct rdev_sysfs_entry rdev_errors = 3143 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3144 3145 static ssize_t slot_show(struct md_rdev * rdev,char * page)3146 slot_show(struct md_rdev *rdev, char *page) 3147 { 3148 if (test_bit(Journal, &rdev->flags)) 3149 return sprintf(page, "journal\n"); 3150 else if (rdev->raid_disk < 0) 3151 return sprintf(page, "none\n"); 3152 else 3153 return sprintf(page, "%d\n", rdev->raid_disk); 3154 } 3155 3156 static ssize_t slot_store(struct md_rdev * rdev,const char * buf,size_t len)3157 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3158 { 3159 int slot; 3160 int err; 3161 3162 if (test_bit(Journal, &rdev->flags)) 3163 return -EBUSY; 3164 if (strncmp(buf, "none", 4)==0) 3165 slot = -1; 3166 else { 3167 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3168 if (err < 0) 3169 return err; 3170 if (slot < 0) 3171 /* overflow */ 3172 return -ENOSPC; 3173 } 3174 if (rdev->mddev->pers && slot == -1) { 3175 /* Setting 'slot' on an active array requires also 3176 * updating the 'rd%d' link, and communicating 3177 * with the personality with ->hot_*_disk. 3178 * For now we only support removing 3179 * failed/spare devices. This normally happens automatically, 3180 * but not when the metadata is externally managed. 3181 */ 3182 if (rdev->raid_disk == -1) 3183 return -EEXIST; 3184 /* personality does all needed checks */ 3185 if (rdev->mddev->pers->hot_remove_disk == NULL) 3186 return -EINVAL; 3187 clear_bit(Blocked, &rdev->flags); 3188 remove_and_add_spares(rdev->mddev, rdev); 3189 if (rdev->raid_disk >= 0) 3190 return -EBUSY; 3191 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3192 md_wakeup_thread(rdev->mddev->thread); 3193 } else if (rdev->mddev->pers) { 3194 /* Activating a spare .. or possibly reactivating 3195 * if we ever get bitmaps working here. 
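 * The steps below mirror a normal hot-add: check the requested slot
 * against raid_disks (plus delta_disks during a reshape), record
 * saved_raid_disk when the device was In_sync so a bitmap-based
 * recovery remains possible, clear In_sync, then hand the device to
 * the personality via ->hot_add_disk().
 * Typical use from userspace (device path illustrative):
 *
 *	echo 2 > /sys/block/md0/md/dev-sdc/slot
 *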
3196 */ 3197 int err; 3198 3199 if (rdev->raid_disk != -1) 3200 return -EBUSY; 3201 3202 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3203 return -EBUSY; 3204 3205 if (rdev->mddev->pers->hot_add_disk == NULL) 3206 return -EINVAL; 3207 3208 if (slot >= rdev->mddev->raid_disks && 3209 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3210 return -ENOSPC; 3211 3212 rdev->raid_disk = slot; 3213 if (test_bit(In_sync, &rdev->flags)) 3214 rdev->saved_raid_disk = slot; 3215 else 3216 rdev->saved_raid_disk = -1; 3217 clear_bit(In_sync, &rdev->flags); 3218 clear_bit(Bitmap_sync, &rdev->flags); 3219 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3220 if (err) { 3221 rdev->raid_disk = -1; 3222 return err; 3223 } else 3224 sysfs_notify_dirent_safe(rdev->sysfs_state); 3225 /* failure here is OK */; 3226 sysfs_link_rdev(rdev->mddev, rdev); 3227 /* don't wakeup anyone, leave that to userspace. */ 3228 } else { 3229 if (slot >= rdev->mddev->raid_disks && 3230 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3231 return -ENOSPC; 3232 rdev->raid_disk = slot; 3233 /* assume it is working */ 3234 clear_bit(Faulty, &rdev->flags); 3235 clear_bit(WriteMostly, &rdev->flags); 3236 set_bit(In_sync, &rdev->flags); 3237 sysfs_notify_dirent_safe(rdev->sysfs_state); 3238 } 3239 return len; 3240 } 3241 3242 static struct rdev_sysfs_entry rdev_slot = 3243 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3244 3245 static ssize_t offset_show(struct md_rdev * rdev,char * page)3246 offset_show(struct md_rdev *rdev, char *page) 3247 { 3248 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3249 } 3250 3251 static ssize_t offset_store(struct md_rdev * rdev,const char * buf,size_t len)3252 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3253 { 3254 unsigned long long offset; 3255 if (kstrtoull(buf, 10, &offset) < 0) 3256 return -EINVAL; 3257 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3258 return -EBUSY; 3259 if (rdev->sectors && rdev->mddev->external) 3260 /* Must set offset before size, so overlap checks 3261 * can be sane */ 3262 return -EBUSY; 3263 rdev->data_offset = offset; 3264 rdev->new_data_offset = offset; 3265 return len; 3266 } 3267 3268 static struct rdev_sysfs_entry rdev_offset = 3269 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3270 new_offset_show(struct md_rdev * rdev,char * page)3271 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3272 { 3273 return sprintf(page, "%llu\n", 3274 (unsigned long long)rdev->new_data_offset); 3275 } 3276 new_offset_store(struct md_rdev * rdev,const char * buf,size_t len)3277 static ssize_t new_offset_store(struct md_rdev *rdev, 3278 const char *buf, size_t len) 3279 { 3280 unsigned long long new_offset; 3281 struct mddev *mddev = rdev->mddev; 3282 3283 if (kstrtoull(buf, 10, &new_offset) < 0) 3284 return -EINVAL; 3285 3286 if (mddev->sync_thread || 3287 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3288 return -EBUSY; 3289 if (new_offset == rdev->data_offset) 3290 /* reset is always permitted */ 3291 ; 3292 else if (new_offset > rdev->data_offset) { 3293 /* must not push array size beyond rdev_sectors */ 3294 if (new_offset - rdev->data_offset 3295 + mddev->dev_sectors > rdev->sectors) 3296 return -E2BIG; 3297 } 3298 /* Metadata worries about other space details. */ 3299 3300 /* decreasing the offset is inconsistent with a backwards 3301 * reshape. 
3302 */ 3303 if (new_offset < rdev->data_offset && 3304 mddev->reshape_backwards) 3305 return -EINVAL; 3306 /* Increasing offset is inconsistent with forwards 3307 * reshape. reshape_direction should be set to 3308 * 'backwards' first. 3309 */ 3310 if (new_offset > rdev->data_offset && 3311 !mddev->reshape_backwards) 3312 return -EINVAL; 3313 3314 if (mddev->pers && mddev->persistent && 3315 !super_types[mddev->major_version] 3316 .allow_new_offset(rdev, new_offset)) 3317 return -E2BIG; 3318 rdev->new_data_offset = new_offset; 3319 if (new_offset > rdev->data_offset) 3320 mddev->reshape_backwards = 1; 3321 else if (new_offset < rdev->data_offset) 3322 mddev->reshape_backwards = 0; 3323 3324 return len; 3325 } 3326 static struct rdev_sysfs_entry rdev_new_offset = 3327 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3328 3329 static ssize_t rdev_size_show(struct md_rdev * rdev,char * page)3330 rdev_size_show(struct md_rdev *rdev, char *page) 3331 { 3332 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3333 } 3334 md_rdevs_overlap(struct md_rdev * a,struct md_rdev * b)3335 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3336 { 3337 /* check if two start/length pairs overlap */ 3338 if (a->data_offset + a->sectors <= b->data_offset) 3339 return false; 3340 if (b->data_offset + b->sectors <= a->data_offset) 3341 return false; 3342 return true; 3343 } 3344 md_rdev_overlaps(struct md_rdev * rdev)3345 static bool md_rdev_overlaps(struct md_rdev *rdev) 3346 { 3347 struct mddev *mddev; 3348 struct md_rdev *rdev2; 3349 3350 spin_lock(&all_mddevs_lock); 3351 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3352 if (test_bit(MD_DELETED, &mddev->flags)) 3353 continue; 3354 rdev_for_each(rdev2, mddev) { 3355 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3356 md_rdevs_overlap(rdev, rdev2)) { 3357 spin_unlock(&all_mddevs_lock); 3358 return true; 3359 } 3360 } 3361 } 3362 spin_unlock(&all_mddevs_lock); 3363 return false; 3364 } 3365 strict_blocks_to_sectors(const char * buf,sector_t * sectors)3366 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3367 { 3368 unsigned long long blocks; 3369 sector_t new; 3370 3371 if (kstrtoull(buf, 10, &blocks) < 0) 3372 return -EINVAL; 3373 3374 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3375 return -EINVAL; /* sector conversion overflow */ 3376 3377 new = blocks * 2; 3378 if (new != blocks * 2) 3379 return -EINVAL; /* unsigned long long to sector_t overflow */ 3380 3381 *sectors = new; 3382 return 0; 3383 } 3384 3385 static ssize_t rdev_size_store(struct md_rdev * rdev,const char * buf,size_t len)3386 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3387 { 3388 struct mddev *my_mddev = rdev->mddev; 3389 sector_t oldsectors = rdev->sectors; 3390 sector_t sectors; 3391 3392 if (test_bit(Journal, &rdev->flags)) 3393 return -EBUSY; 3394 if (strict_blocks_to_sectors(buf, §ors) < 0) 3395 return -EINVAL; 3396 if (rdev->data_offset != rdev->new_data_offset) 3397 return -EINVAL; /* too confusing */ 3398 if (my_mddev->pers && rdev->raid_disk >= 0) { 3399 if (my_mddev->persistent) { 3400 sectors = super_types[my_mddev->major_version]. 
3401 rdev_size_change(rdev, sectors); 3402 if (!sectors) 3403 return -EBUSY; 3404 } else if (!sectors) 3405 sectors = bdev_nr_sectors(rdev->bdev) - 3406 rdev->data_offset; 3407 if (!my_mddev->pers->resize) 3408 /* Cannot change size for RAID0 or Linear etc */ 3409 return -EINVAL; 3410 } 3411 if (sectors < my_mddev->dev_sectors) 3412 return -EINVAL; /* component must fit device */ 3413 3414 rdev->sectors = sectors; 3415 3416 /* 3417 * Check that all other rdevs with the same bdev do not overlap. This 3418 * check does not provide a hard guarantee, it just helps avoid 3419 * dangerous mistakes. 3420 */ 3421 if (sectors > oldsectors && my_mddev->external && 3422 md_rdev_overlaps(rdev)) { 3423 /* 3424 * Someone else could have slipped in a size change here, but 3425 * doing so is just silly. We put oldsectors back because we 3426 * know it is safe, and trust userspace not to race with itself. 3427 */ 3428 rdev->sectors = oldsectors; 3429 return -EBUSY; 3430 } 3431 return len; 3432 } 3433 3434 static struct rdev_sysfs_entry rdev_size = 3435 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3436 recovery_start_show(struct md_rdev * rdev,char * page)3437 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3438 { 3439 unsigned long long recovery_start = rdev->recovery_offset; 3440 3441 if (test_bit(In_sync, &rdev->flags) || 3442 recovery_start == MaxSector) 3443 return sprintf(page, "none\n"); 3444 3445 return sprintf(page, "%llu\n", recovery_start); 3446 } 3447 recovery_start_store(struct md_rdev * rdev,const char * buf,size_t len)3448 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3449 { 3450 unsigned long long recovery_start; 3451 3452 if (cmd_match(buf, "none")) 3453 recovery_start = MaxSector; 3454 else if (kstrtoull(buf, 10, &recovery_start)) 3455 return -EINVAL; 3456 3457 if (rdev->mddev->pers && 3458 rdev->raid_disk >= 0) 3459 return -EBUSY; 3460 3461 rdev->recovery_offset = recovery_start; 3462 if (recovery_start == MaxSector) 3463 set_bit(In_sync, &rdev->flags); 3464 else 3465 clear_bit(In_sync, &rdev->flags); 3466 return len; 3467 } 3468 3469 static struct rdev_sysfs_entry rdev_recovery_start = 3470 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3471 3472 /* sysfs access to bad-blocks list. 3473 * We present two files. 3474 * 'bad-blocks' lists sector numbers and lengths of ranges that 3475 * are recorded as bad. The list is truncated to fit within 3476 * the one-page limit of sysfs. 3477 * Writing "sector length" to this file adds an acknowledged 3478 * bad block list. 3479 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3480 * been acknowledged. Writing to this file adds bad blocks 3481 * without acknowledging them. This is largely for testing. 
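 * Example usage (device path illustrative):
 *
 *	# record an acknowledged bad range of 8 sectors at sector 123456
 *	echo "123456 8" > /sys/block/md0/md/dev-sdb/bad_blocks
 *	# read the current list back
 *	cat /sys/block/md0/md/dev-sdb/bad_blocks
 *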
3482 */ bb_show(struct md_rdev * rdev,char * page)3483 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3484 { 3485 return badblocks_show(&rdev->badblocks, page, 0); 3486 } bb_store(struct md_rdev * rdev,const char * page,size_t len)3487 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3488 { 3489 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3490 /* Maybe that ack was all we needed */ 3491 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3492 wake_up(&rdev->blocked_wait); 3493 return rv; 3494 } 3495 static struct rdev_sysfs_entry rdev_bad_blocks = 3496 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3497 ubb_show(struct md_rdev * rdev,char * page)3498 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3499 { 3500 return badblocks_show(&rdev->badblocks, page, 1); 3501 } ubb_store(struct md_rdev * rdev,const char * page,size_t len)3502 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3503 { 3504 return badblocks_store(&rdev->badblocks, page, len, 1); 3505 } 3506 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3507 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3508 3509 static ssize_t ppl_sector_show(struct md_rdev * rdev,char * page)3510 ppl_sector_show(struct md_rdev *rdev, char *page) 3511 { 3512 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3513 } 3514 3515 static ssize_t ppl_sector_store(struct md_rdev * rdev,const char * buf,size_t len)3516 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3517 { 3518 unsigned long long sector; 3519 3520 if (kstrtoull(buf, 10, §or) < 0) 3521 return -EINVAL; 3522 if (sector != (sector_t)sector) 3523 return -EINVAL; 3524 3525 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3526 rdev->raid_disk >= 0) 3527 return -EBUSY; 3528 3529 if (rdev->mddev->persistent) { 3530 if (rdev->mddev->major_version == 0) 3531 return -EINVAL; 3532 if ((sector > rdev->sb_start && 3533 sector - rdev->sb_start > S16_MAX) || 3534 (sector < rdev->sb_start && 3535 rdev->sb_start - sector > -S16_MIN)) 3536 return -EINVAL; 3537 rdev->ppl.offset = sector - rdev->sb_start; 3538 } else if (!rdev->mddev->external) { 3539 return -EBUSY; 3540 } 3541 rdev->ppl.sector = sector; 3542 return len; 3543 } 3544 3545 static struct rdev_sysfs_entry rdev_ppl_sector = 3546 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3547 3548 static ssize_t ppl_size_show(struct md_rdev * rdev,char * page)3549 ppl_size_show(struct md_rdev *rdev, char *page) 3550 { 3551 return sprintf(page, "%u\n", rdev->ppl.size); 3552 } 3553 3554 static ssize_t ppl_size_store(struct md_rdev * rdev,const char * buf,size_t len)3555 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3556 { 3557 unsigned int size; 3558 3559 if (kstrtouint(buf, 10, &size) < 0) 3560 return -EINVAL; 3561 3562 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3563 rdev->raid_disk >= 0) 3564 return -EBUSY; 3565 3566 if (rdev->mddev->persistent) { 3567 if (rdev->mddev->major_version == 0) 3568 return -EINVAL; 3569 if (size > U16_MAX) 3570 return -EINVAL; 3571 } else if (!rdev->mddev->external) { 3572 return -EBUSY; 3573 } 3574 rdev->ppl.size = size; 3575 return len; 3576 } 3577 3578 static struct rdev_sysfs_entry rdev_ppl_size = 3579 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3580 3581 static struct attribute *rdev_default_attrs[] = { 3582 &rdev_state.attr, 3583 &rdev_errors.attr, 
3584 &rdev_slot.attr, 3585 &rdev_offset.attr, 3586 &rdev_new_offset.attr, 3587 &rdev_size.attr, 3588 &rdev_recovery_start.attr, 3589 &rdev_bad_blocks.attr, 3590 &rdev_unack_bad_blocks.attr, 3591 &rdev_ppl_sector.attr, 3592 &rdev_ppl_size.attr, 3593 NULL, 3594 }; 3595 ATTRIBUTE_GROUPS(rdev_default); 3596 static ssize_t rdev_attr_show(struct kobject * kobj,struct attribute * attr,char * page)3597 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3598 { 3599 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3600 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3601 3602 if (!entry->show) 3603 return -EIO; 3604 if (!rdev->mddev) 3605 return -ENODEV; 3606 return entry->show(rdev, page); 3607 } 3608 3609 static ssize_t rdev_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)3610 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3611 const char *page, size_t length) 3612 { 3613 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3614 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3615 struct kernfs_node *kn = NULL; 3616 ssize_t rv; 3617 struct mddev *mddev = rdev->mddev; 3618 3619 if (!entry->store) 3620 return -EIO; 3621 if (!capable(CAP_SYS_ADMIN)) 3622 return -EACCES; 3623 3624 if (entry->store == state_store && cmd_match(page, "remove")) 3625 kn = sysfs_break_active_protection(kobj, attr); 3626 3627 rv = mddev ? mddev_lock(mddev) : -ENODEV; 3628 if (!rv) { 3629 if (rdev->mddev == NULL) 3630 rv = -ENODEV; 3631 else 3632 rv = entry->store(rdev, page, length); 3633 mddev_unlock(mddev); 3634 } 3635 3636 if (kn) 3637 sysfs_unbreak_active_protection(kn); 3638 3639 return rv; 3640 } 3641 rdev_free(struct kobject * ko)3642 static void rdev_free(struct kobject *ko) 3643 { 3644 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3645 kfree(rdev); 3646 } 3647 static const struct sysfs_ops rdev_sysfs_ops = { 3648 .show = rdev_attr_show, 3649 .store = rdev_attr_store, 3650 }; 3651 static const struct kobj_type rdev_ktype = { 3652 .release = rdev_free, 3653 .sysfs_ops = &rdev_sysfs_ops, 3654 .default_groups = rdev_default_groups, 3655 }; 3656 md_rdev_init(struct md_rdev * rdev)3657 int md_rdev_init(struct md_rdev *rdev) 3658 { 3659 rdev->desc_nr = -1; 3660 rdev->saved_raid_disk = -1; 3661 rdev->raid_disk = -1; 3662 rdev->flags = 0; 3663 rdev->data_offset = 0; 3664 rdev->new_data_offset = 0; 3665 rdev->sb_events = 0; 3666 rdev->last_read_error = 0; 3667 rdev->sb_loaded = 0; 3668 rdev->bb_page = NULL; 3669 atomic_set(&rdev->nr_pending, 0); 3670 atomic_set(&rdev->read_errors, 0); 3671 atomic_set(&rdev->corrected_errors, 0); 3672 3673 INIT_LIST_HEAD(&rdev->same_set); 3674 init_waitqueue_head(&rdev->blocked_wait); 3675 3676 /* Add space to store bad block list. 3677 * This reserves the space even on arrays where it cannot 3678 * be used - I wonder if that matters 3679 */ 3680 return badblocks_init(&rdev->badblocks, 0); 3681 } 3682 EXPORT_SYMBOL_GPL(md_rdev_init); 3683 3684 /* 3685 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3686 * 3687 * mark the device faulty if: 3688 * 3689 * - the device is nonexistent (zero size) 3690 * - the device has no valid superblock 3691 * 3692 * a faulty rdev _never_ has rdev->sb set. 
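 *
 * (Illustrative note, not part of the original comment: callers pass
 * super_format/super_minor pairs such as (-1, -1) for arrays without
 * persistent metadata, (-2, -1) for externally managed metadata -- see
 * new_dev_store() below -- or e.g. (1, 2) to sanity check a v1.2
 * superblock before the device is bound to an array.)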
3693 */ md_import_device(dev_t newdev,int super_format,int super_minor)3694 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3695 { 3696 struct md_rdev *rdev; 3697 struct md_rdev *holder; 3698 sector_t size; 3699 int err; 3700 3701 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3702 if (!rdev) 3703 return ERR_PTR(-ENOMEM); 3704 3705 err = md_rdev_init(rdev); 3706 if (err) 3707 goto out_free_rdev; 3708 err = alloc_disk_sb(rdev); 3709 if (err) 3710 goto out_clear_rdev; 3711 3712 if (super_format == -2) { 3713 holder = &claim_rdev; 3714 } else { 3715 holder = rdev; 3716 set_bit(Holder, &rdev->flags); 3717 } 3718 3719 rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, 3720 holder, NULL); 3721 if (IS_ERR(rdev->bdev)) { 3722 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3723 MAJOR(newdev), MINOR(newdev)); 3724 err = PTR_ERR(rdev->bdev); 3725 goto out_clear_rdev; 3726 } 3727 3728 kobject_init(&rdev->kobj, &rdev_ktype); 3729 3730 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3731 if (!size) { 3732 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3733 rdev->bdev); 3734 err = -EINVAL; 3735 goto out_blkdev_put; 3736 } 3737 3738 if (super_format >= 0) { 3739 err = super_types[super_format]. 3740 load_super(rdev, NULL, super_minor); 3741 if (err == -EINVAL) { 3742 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3743 rdev->bdev, 3744 super_format, super_minor); 3745 goto out_blkdev_put; 3746 } 3747 if (err < 0) { 3748 pr_warn("md: could not read %pg's sb, not importing!\n", 3749 rdev->bdev); 3750 goto out_blkdev_put; 3751 } 3752 } 3753 3754 return rdev; 3755 3756 out_blkdev_put: 3757 blkdev_put(rdev->bdev, holder); 3758 out_clear_rdev: 3759 md_rdev_clear(rdev); 3760 out_free_rdev: 3761 kfree(rdev); 3762 return ERR_PTR(err); 3763 } 3764 3765 /* 3766 * Check a full RAID array for plausibility 3767 */ 3768 analyze_sbs(struct mddev * mddev)3769 static int analyze_sbs(struct mddev *mddev) 3770 { 3771 int i; 3772 struct md_rdev *rdev, *freshest, *tmp; 3773 3774 freshest = NULL; 3775 rdev_for_each_safe(rdev, tmp, mddev) 3776 switch (super_types[mddev->major_version]. 3777 load_super(rdev, freshest, mddev->minor_version)) { 3778 case 1: 3779 freshest = rdev; 3780 break; 3781 case 0: 3782 break; 3783 default: 3784 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3785 rdev->bdev); 3786 md_kick_rdev_from_array(rdev); 3787 } 3788 3789 /* Cannot find a valid fresh disk */ 3790 if (!freshest) { 3791 pr_warn("md: cannot find a valid disk\n"); 3792 return -EINVAL; 3793 } 3794 3795 super_types[mddev->major_version]. 3796 validate_super(mddev, NULL/*freshest*/, freshest); 3797 3798 i = 0; 3799 rdev_for_each_safe(rdev, tmp, mddev) { 3800 if (mddev->max_disks && 3801 (rdev->desc_nr >= mddev->max_disks || 3802 i > mddev->max_disks)) { 3803 pr_warn("md: %s: %pg: only %d devices permitted\n", 3804 mdname(mddev), rdev->bdev, 3805 mddev->max_disks); 3806 md_kick_rdev_from_array(rdev); 3807 continue; 3808 } 3809 if (rdev != freshest) { 3810 if (super_types[mddev->major_version]. 
3811 validate_super(mddev, freshest, rdev)) { 3812 pr_warn("md: kicking non-fresh %pg from array!\n", 3813 rdev->bdev); 3814 md_kick_rdev_from_array(rdev); 3815 continue; 3816 } 3817 } 3818 if (mddev->level == LEVEL_MULTIPATH) { 3819 rdev->desc_nr = i++; 3820 rdev->raid_disk = rdev->desc_nr; 3821 set_bit(In_sync, &rdev->flags); 3822 } else if (rdev->raid_disk >= 3823 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3824 !test_bit(Journal, &rdev->flags)) { 3825 rdev->raid_disk = -1; 3826 clear_bit(In_sync, &rdev->flags); 3827 } 3828 } 3829 3830 return 0; 3831 } 3832 3833 /* Read a fixed-point number. 3834 * Numbers in sysfs attributes should be in "standard" units where 3835 * possible, so time should be in seconds. 3836 * However we internally use a a much smaller unit such as 3837 * milliseconds or jiffies. 3838 * This function takes a decimal number with a possible fractional 3839 * component, and produces an integer which is the result of 3840 * multiplying that number by 10^'scale'. 3841 * all without any floating-point arithmetic. 3842 */ strict_strtoul_scaled(const char * cp,unsigned long * res,int scale)3843 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3844 { 3845 unsigned long result = 0; 3846 long decimals = -1; 3847 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3848 if (*cp == '.') 3849 decimals = 0; 3850 else if (decimals < scale) { 3851 unsigned int value; 3852 value = *cp - '0'; 3853 result = result * 10 + value; 3854 if (decimals >= 0) 3855 decimals++; 3856 } 3857 cp++; 3858 } 3859 if (*cp == '\n') 3860 cp++; 3861 if (*cp) 3862 return -EINVAL; 3863 if (decimals < 0) 3864 decimals = 0; 3865 *res = result * int_pow(10, scale - decimals); 3866 return 0; 3867 } 3868 3869 static ssize_t safe_delay_show(struct mddev * mddev,char * page)3870 safe_delay_show(struct mddev *mddev, char *page) 3871 { 3872 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3873 3874 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3875 } 3876 static ssize_t safe_delay_store(struct mddev * mddev,const char * cbuf,size_t len)3877 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3878 { 3879 unsigned long msec; 3880 3881 if (mddev_is_clustered(mddev)) { 3882 pr_warn("md: Safemode is disabled for clustered mode\n"); 3883 return -EINVAL; 3884 } 3885 3886 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3887 return -EINVAL; 3888 if (msec == 0) 3889 mddev->safemode_delay = 0; 3890 else { 3891 unsigned long old_delay = mddev->safemode_delay; 3892 unsigned long new_delay = (msec*HZ)/1000; 3893 3894 if (new_delay == 0) 3895 new_delay = 1; 3896 mddev->safemode_delay = new_delay; 3897 if (new_delay < old_delay || old_delay == 0) 3898 mod_timer(&mddev->safemode_timer, jiffies+1); 3899 } 3900 return len; 3901 } 3902 static struct md_sysfs_entry md_safe_delay = 3903 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3904 3905 static ssize_t level_show(struct mddev * mddev,char * page)3906 level_show(struct mddev *mddev, char *page) 3907 { 3908 struct md_personality *p; 3909 int ret; 3910 spin_lock(&mddev->lock); 3911 p = mddev->pers; 3912 if (p) 3913 ret = sprintf(page, "%s\n", p->name); 3914 else if (mddev->clevel[0]) 3915 ret = sprintf(page, "%s\n", mddev->clevel); 3916 else if (mddev->level != LEVEL_NONE) 3917 ret = sprintf(page, "%d\n", mddev->level); 3918 else 3919 ret = 0; 3920 spin_unlock(&mddev->lock); 3921 return ret; 3922 } 3923 3924 static ssize_t level_store(struct mddev * 
mddev,const char * buf,size_t len)3925 level_store(struct mddev *mddev, const char *buf, size_t len) 3926 { 3927 char clevel[16]; 3928 ssize_t rv; 3929 size_t slen = len; 3930 struct md_personality *pers, *oldpers; 3931 long level; 3932 void *priv, *oldpriv; 3933 struct md_rdev *rdev; 3934 3935 if (slen == 0 || slen >= sizeof(clevel)) 3936 return -EINVAL; 3937 3938 rv = mddev_lock(mddev); 3939 if (rv) 3940 return rv; 3941 3942 if (mddev->pers == NULL) { 3943 strncpy(mddev->clevel, buf, slen); 3944 if (mddev->clevel[slen-1] == '\n') 3945 slen--; 3946 mddev->clevel[slen] = 0; 3947 mddev->level = LEVEL_NONE; 3948 rv = len; 3949 goto out_unlock; 3950 } 3951 rv = -EROFS; 3952 if (!md_is_rdwr(mddev)) 3953 goto out_unlock; 3954 3955 /* request to change the personality. Need to ensure: 3956 * - array is not engaged in resync/recovery/reshape 3957 * - old personality can be suspended 3958 * - new personality will access other array. 3959 */ 3960 3961 rv = -EBUSY; 3962 if (mddev->sync_thread || 3963 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3964 mddev->reshape_position != MaxSector || 3965 mddev->sysfs_active) 3966 goto out_unlock; 3967 3968 rv = -EINVAL; 3969 if (!mddev->pers->quiesce) { 3970 pr_warn("md: %s: %s does not support online personality change\n", 3971 mdname(mddev), mddev->pers->name); 3972 goto out_unlock; 3973 } 3974 3975 /* Now find the new personality */ 3976 strncpy(clevel, buf, slen); 3977 if (clevel[slen-1] == '\n') 3978 slen--; 3979 clevel[slen] = 0; 3980 if (kstrtol(clevel, 10, &level)) 3981 level = LEVEL_NONE; 3982 3983 if (request_module("md-%s", clevel) != 0) 3984 request_module("md-level-%s", clevel); 3985 spin_lock(&pers_lock); 3986 pers = find_pers(level, clevel); 3987 if (!pers || !try_module_get(pers->owner)) { 3988 spin_unlock(&pers_lock); 3989 pr_warn("md: personality %s not loaded\n", clevel); 3990 rv = -EINVAL; 3991 goto out_unlock; 3992 } 3993 spin_unlock(&pers_lock); 3994 3995 if (pers == mddev->pers) { 3996 /* Nothing to do! */ 3997 module_put(pers->owner); 3998 rv = len; 3999 goto out_unlock; 4000 } 4001 if (!pers->takeover) { 4002 module_put(pers->owner); 4003 pr_warn("md: %s: %s does not support personality takeover\n", 4004 mdname(mddev), clevel); 4005 rv = -EINVAL; 4006 goto out_unlock; 4007 } 4008 4009 rdev_for_each(rdev, mddev) 4010 rdev->new_raid_disk = rdev->raid_disk; 4011 4012 /* ->takeover must set new_* and/or delta_disks 4013 * if it succeeds, and may set them when it fails. 
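 * (Worked example added for clarity, not from the original comment: a
 * takeover that grows the array by one member would set delta_disks = 1
 * and bump raid_disks accordingly; the failure path just below undoes
 * exactly these fields -- new_level, new_layout, new_chunk_sectors,
 * raid_disks and delta_disks -- so a failed ->takeover leaves the mddev
 * as it was.)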
4014 */ 4015 priv = pers->takeover(mddev); 4016 if (IS_ERR(priv)) { 4017 mddev->new_level = mddev->level; 4018 mddev->new_layout = mddev->layout; 4019 mddev->new_chunk_sectors = mddev->chunk_sectors; 4020 mddev->raid_disks -= mddev->delta_disks; 4021 mddev->delta_disks = 0; 4022 mddev->reshape_backwards = 0; 4023 module_put(pers->owner); 4024 pr_warn("md: %s: %s would not accept array\n", 4025 mdname(mddev), clevel); 4026 rv = PTR_ERR(priv); 4027 goto out_unlock; 4028 } 4029 4030 /* Looks like we have a winner */ 4031 mddev_suspend(mddev); 4032 mddev_detach(mddev); 4033 4034 spin_lock(&mddev->lock); 4035 oldpers = mddev->pers; 4036 oldpriv = mddev->private; 4037 mddev->pers = pers; 4038 mddev->private = priv; 4039 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4040 mddev->level = mddev->new_level; 4041 mddev->layout = mddev->new_layout; 4042 mddev->chunk_sectors = mddev->new_chunk_sectors; 4043 mddev->delta_disks = 0; 4044 mddev->reshape_backwards = 0; 4045 mddev->degraded = 0; 4046 spin_unlock(&mddev->lock); 4047 4048 if (oldpers->sync_request == NULL && 4049 mddev->external) { 4050 /* We are converting from a no-redundancy array 4051 * to a redundancy array and metadata is managed 4052 * externally so we need to be sure that writes 4053 * won't block due to a need to transition 4054 * clean->dirty 4055 * until external management is started. 4056 */ 4057 mddev->in_sync = 0; 4058 mddev->safemode_delay = 0; 4059 mddev->safemode = 0; 4060 } 4061 4062 oldpers->free(mddev, oldpriv); 4063 4064 if (oldpers->sync_request == NULL && 4065 pers->sync_request != NULL) { 4066 /* need to add the md_redundancy_group */ 4067 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4068 pr_warn("md: cannot register extra attributes for %s\n", 4069 mdname(mddev)); 4070 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4071 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4072 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4073 } 4074 if (oldpers->sync_request != NULL && 4075 pers->sync_request == NULL) { 4076 /* need to remove the md_redundancy_group */ 4077 if (mddev->to_remove == NULL) 4078 mddev->to_remove = &md_redundancy_group; 4079 } 4080 4081 module_put(oldpers->owner); 4082 4083 rdev_for_each(rdev, mddev) { 4084 if (rdev->raid_disk < 0) 4085 continue; 4086 if (rdev->new_raid_disk >= mddev->raid_disks) 4087 rdev->new_raid_disk = -1; 4088 if (rdev->new_raid_disk == rdev->raid_disk) 4089 continue; 4090 sysfs_unlink_rdev(mddev, rdev); 4091 } 4092 rdev_for_each(rdev, mddev) { 4093 if (rdev->raid_disk < 0) 4094 continue; 4095 if (rdev->new_raid_disk == rdev->raid_disk) 4096 continue; 4097 rdev->raid_disk = rdev->new_raid_disk; 4098 if (rdev->raid_disk < 0) 4099 clear_bit(In_sync, &rdev->flags); 4100 else { 4101 if (sysfs_link_rdev(mddev, rdev)) 4102 pr_warn("md: cannot register rd%d for %s after level change\n", 4103 rdev->raid_disk, mdname(mddev)); 4104 } 4105 } 4106 4107 if (pers->sync_request == NULL) { 4108 /* this is now an array without redundancy, so 4109 * it must always be in_sync 4110 */ 4111 mddev->in_sync = 1; 4112 del_timer_sync(&mddev->safemode_timer); 4113 } 4114 blk_set_stacking_limits(&mddev->queue->limits); 4115 pers->run(mddev); 4116 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4117 mddev_resume(mddev); 4118 if (!mddev->thread) 4119 md_update_sb(mddev, 1); 4120 sysfs_notify_dirent_safe(mddev->sysfs_level); 4121 md_new_event(); 4122 rv = len; 4123 out_unlock: 4124 mddev_unlock(mddev); 4125 
return rv; 4126 } 4127 4128 static struct md_sysfs_entry md_level = 4129 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4130 4131 static ssize_t layout_show(struct mddev * mddev,char * page)4132 layout_show(struct mddev *mddev, char *page) 4133 { 4134 /* just a number, not meaningful for all levels */ 4135 if (mddev->reshape_position != MaxSector && 4136 mddev->layout != mddev->new_layout) 4137 return sprintf(page, "%d (%d)\n", 4138 mddev->new_layout, mddev->layout); 4139 return sprintf(page, "%d\n", mddev->layout); 4140 } 4141 4142 static ssize_t layout_store(struct mddev * mddev,const char * buf,size_t len)4143 layout_store(struct mddev *mddev, const char *buf, size_t len) 4144 { 4145 unsigned int n; 4146 int err; 4147 4148 err = kstrtouint(buf, 10, &n); 4149 if (err < 0) 4150 return err; 4151 err = mddev_lock(mddev); 4152 if (err) 4153 return err; 4154 4155 if (mddev->pers) { 4156 if (mddev->pers->check_reshape == NULL) 4157 err = -EBUSY; 4158 else if (!md_is_rdwr(mddev)) 4159 err = -EROFS; 4160 else { 4161 mddev->new_layout = n; 4162 err = mddev->pers->check_reshape(mddev); 4163 if (err) 4164 mddev->new_layout = mddev->layout; 4165 } 4166 } else { 4167 mddev->new_layout = n; 4168 if (mddev->reshape_position == MaxSector) 4169 mddev->layout = n; 4170 } 4171 mddev_unlock(mddev); 4172 return err ?: len; 4173 } 4174 static struct md_sysfs_entry md_layout = 4175 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4176 4177 static ssize_t raid_disks_show(struct mddev * mddev,char * page)4178 raid_disks_show(struct mddev *mddev, char *page) 4179 { 4180 if (mddev->raid_disks == 0) 4181 return 0; 4182 if (mddev->reshape_position != MaxSector && 4183 mddev->delta_disks != 0) 4184 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4185 mddev->raid_disks - mddev->delta_disks); 4186 return sprintf(page, "%d\n", mddev->raid_disks); 4187 } 4188 4189 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4190 4191 static ssize_t raid_disks_store(struct mddev * mddev,const char * buf,size_t len)4192 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4193 { 4194 unsigned int n; 4195 int err; 4196 4197 err = kstrtouint(buf, 10, &n); 4198 if (err < 0) 4199 return err; 4200 4201 err = mddev_lock(mddev); 4202 if (err) 4203 return err; 4204 if (mddev->pers) 4205 err = update_raid_disks(mddev, n); 4206 else if (mddev->reshape_position != MaxSector) { 4207 struct md_rdev *rdev; 4208 int olddisks = mddev->raid_disks - mddev->delta_disks; 4209 4210 err = -EINVAL; 4211 rdev_for_each(rdev, mddev) { 4212 if (olddisks < n && 4213 rdev->data_offset < rdev->new_data_offset) 4214 goto out_unlock; 4215 if (olddisks > n && 4216 rdev->data_offset > rdev->new_data_offset) 4217 goto out_unlock; 4218 } 4219 err = 0; 4220 mddev->delta_disks = n - olddisks; 4221 mddev->raid_disks = n; 4222 mddev->reshape_backwards = (mddev->delta_disks < 0); 4223 } else 4224 mddev->raid_disks = n; 4225 out_unlock: 4226 mddev_unlock(mddev); 4227 return err ? 
err : len; 4228 } 4229 static struct md_sysfs_entry md_raid_disks = 4230 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4231 4232 static ssize_t uuid_show(struct mddev * mddev,char * page)4233 uuid_show(struct mddev *mddev, char *page) 4234 { 4235 return sprintf(page, "%pU\n", mddev->uuid); 4236 } 4237 static struct md_sysfs_entry md_uuid = 4238 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4239 4240 static ssize_t chunk_size_show(struct mddev * mddev,char * page)4241 chunk_size_show(struct mddev *mddev, char *page) 4242 { 4243 if (mddev->reshape_position != MaxSector && 4244 mddev->chunk_sectors != mddev->new_chunk_sectors) 4245 return sprintf(page, "%d (%d)\n", 4246 mddev->new_chunk_sectors << 9, 4247 mddev->chunk_sectors << 9); 4248 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4249 } 4250 4251 static ssize_t chunk_size_store(struct mddev * mddev,const char * buf,size_t len)4252 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4253 { 4254 unsigned long n; 4255 int err; 4256 4257 err = kstrtoul(buf, 10, &n); 4258 if (err < 0) 4259 return err; 4260 4261 err = mddev_lock(mddev); 4262 if (err) 4263 return err; 4264 if (mddev->pers) { 4265 if (mddev->pers->check_reshape == NULL) 4266 err = -EBUSY; 4267 else if (!md_is_rdwr(mddev)) 4268 err = -EROFS; 4269 else { 4270 mddev->new_chunk_sectors = n >> 9; 4271 err = mddev->pers->check_reshape(mddev); 4272 if (err) 4273 mddev->new_chunk_sectors = mddev->chunk_sectors; 4274 } 4275 } else { 4276 mddev->new_chunk_sectors = n >> 9; 4277 if (mddev->reshape_position == MaxSector) 4278 mddev->chunk_sectors = n >> 9; 4279 } 4280 mddev_unlock(mddev); 4281 return err ?: len; 4282 } 4283 static struct md_sysfs_entry md_chunk_size = 4284 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4285 4286 static ssize_t resync_start_show(struct mddev * mddev,char * page)4287 resync_start_show(struct mddev *mddev, char *page) 4288 { 4289 if (mddev->recovery_cp == MaxSector) 4290 return sprintf(page, "none\n"); 4291 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4292 } 4293 4294 static ssize_t resync_start_store(struct mddev * mddev,const char * buf,size_t len)4295 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4296 { 4297 unsigned long long n; 4298 int err; 4299 4300 if (cmd_match(buf, "none")) 4301 n = MaxSector; 4302 else { 4303 err = kstrtoull(buf, 10, &n); 4304 if (err < 0) 4305 return err; 4306 if (n != (sector_t)n) 4307 return -EINVAL; 4308 } 4309 4310 err = mddev_lock(mddev); 4311 if (err) 4312 return err; 4313 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4314 err = -EBUSY; 4315 4316 if (!err) { 4317 mddev->recovery_cp = n; 4318 if (mddev->pers) 4319 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4320 } 4321 mddev_unlock(mddev); 4322 return err ?: len; 4323 } 4324 static struct md_sysfs_entry md_resync_start = 4325 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4326 resync_start_show, resync_start_store); 4327 4328 /* 4329 * The array state can be: 4330 * 4331 * clear 4332 * No devices, no size, no level 4333 * Equivalent to STOP_ARRAY ioctl 4334 * inactive 4335 * May have some settings, but array is not active 4336 * all IO results in error 4337 * When written, doesn't tear down array, but just stops it 4338 * suspended (not supported yet) 4339 * All IO requests will block. The array can be reconfigured. 4340 * Writing this, if accepted, will block until array is quiescent 4341 * readonly 4342 * no resync can happen. 
no superblocks get written. 4343 * write requests fail 4344 * read-auto 4345 * like readonly, but behaves like 'clean' on a write request. 4346 * 4347 * clean - no pending writes, but otherwise active. 4348 * When written to inactive array, starts without resync 4349 * If a write request arrives then 4350 * if metadata is known, mark 'dirty' and switch to 'active'. 4351 * if not known, block and switch to write-pending 4352 * If written to an active array that has pending writes, then fails. 4353 * active 4354 * fully active: IO and resync can be happening. 4355 * When written to inactive array, starts with resync 4356 * 4357 * write-pending 4358 * clean, but writes are blocked waiting for 'active' to be written. 4359 * 4360 * active-idle 4361 * like active, but no writes have been seen for a while (100msec). 4362 * 4363 * broken 4364 * Array is failed. It's useful because mounted-arrays aren't stopped 4365 * when array is failed, so this state will at least alert the user that 4366 * something is wrong. 4367 */ 4368 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4369 write_pending, active_idle, broken, bad_word}; 4370 static char *array_states[] = { 4371 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4372 "write-pending", "active-idle", "broken", NULL }; 4373 match_word(const char * word,char ** list)4374 static int match_word(const char *word, char **list) 4375 { 4376 int n; 4377 for (n=0; list[n]; n++) 4378 if (cmd_match(word, list[n])) 4379 break; 4380 return n; 4381 } 4382 4383 static ssize_t array_state_show(struct mddev * mddev,char * page)4384 array_state_show(struct mddev *mddev, char *page) 4385 { 4386 enum array_state st = inactive; 4387 4388 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4389 switch(mddev->ro) { 4390 case MD_RDONLY: 4391 st = readonly; 4392 break; 4393 case MD_AUTO_READ: 4394 st = read_auto; 4395 break; 4396 case MD_RDWR: 4397 spin_lock(&mddev->lock); 4398 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4399 st = write_pending; 4400 else if (mddev->in_sync) 4401 st = clean; 4402 else if (mddev->safemode) 4403 st = active_idle; 4404 else 4405 st = active; 4406 spin_unlock(&mddev->lock); 4407 } 4408 4409 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4410 st = broken; 4411 } else { 4412 if (list_empty(&mddev->disks) && 4413 mddev->raid_disks == 0 && 4414 mddev->dev_sectors == 0) 4415 st = clear; 4416 else 4417 st = inactive; 4418 } 4419 return sprintf(page, "%s\n", array_states[st]); 4420 } 4421 4422 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4423 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4424 static int restart_array(struct mddev *mddev); 4425 4426 static ssize_t array_state_store(struct mddev * mddev,const char * buf,size_t len)4427 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4428 { 4429 int err = 0; 4430 enum array_state st = match_word(buf, array_states); 4431 4432 if (mddev->pers && (st == active || st == clean) && 4433 mddev->ro != MD_RDONLY) { 4434 /* don't take reconfig_mutex when toggling between 4435 * clean and active 4436 */ 4437 spin_lock(&mddev->lock); 4438 if (st == active) { 4439 restart_array(mddev); 4440 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4441 md_wakeup_thread(mddev->thread); 4442 wake_up(&mddev->sb_wait); 4443 } else /* st == clean */ { 4444 restart_array(mddev); 4445 if (!set_in_sync(mddev)) 4446 err = -EBUSY; 4447 } 4448 if (!err) 4449 
sysfs_notify_dirent_safe(mddev->sysfs_state); 4450 spin_unlock(&mddev->lock); 4451 return err ?: len; 4452 } 4453 err = mddev_lock(mddev); 4454 if (err) 4455 return err; 4456 err = -EINVAL; 4457 switch(st) { 4458 case bad_word: 4459 break; 4460 case clear: 4461 /* stopping an active array */ 4462 err = do_md_stop(mddev, 0, NULL); 4463 break; 4464 case inactive: 4465 /* stopping an active array */ 4466 if (mddev->pers) 4467 err = do_md_stop(mddev, 2, NULL); 4468 else 4469 err = 0; /* already inactive */ 4470 break; 4471 case suspended: 4472 break; /* not supported yet */ 4473 case readonly: 4474 if (mddev->pers) 4475 err = md_set_readonly(mddev, NULL); 4476 else { 4477 mddev->ro = MD_RDONLY; 4478 set_disk_ro(mddev->gendisk, 1); 4479 err = do_md_run(mddev); 4480 } 4481 break; 4482 case read_auto: 4483 if (mddev->pers) { 4484 if (md_is_rdwr(mddev)) 4485 err = md_set_readonly(mddev, NULL); 4486 else if (mddev->ro == MD_RDONLY) 4487 err = restart_array(mddev); 4488 if (err == 0) { 4489 mddev->ro = MD_AUTO_READ; 4490 set_disk_ro(mddev->gendisk, 0); 4491 } 4492 } else { 4493 mddev->ro = MD_AUTO_READ; 4494 err = do_md_run(mddev); 4495 } 4496 break; 4497 case clean: 4498 if (mddev->pers) { 4499 err = restart_array(mddev); 4500 if (err) 4501 break; 4502 spin_lock(&mddev->lock); 4503 if (!set_in_sync(mddev)) 4504 err = -EBUSY; 4505 spin_unlock(&mddev->lock); 4506 } else 4507 err = -EINVAL; 4508 break; 4509 case active: 4510 if (mddev->pers) { 4511 err = restart_array(mddev); 4512 if (err) 4513 break; 4514 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4515 wake_up(&mddev->sb_wait); 4516 err = 0; 4517 } else { 4518 mddev->ro = MD_RDWR; 4519 set_disk_ro(mddev->gendisk, 0); 4520 err = do_md_run(mddev); 4521 } 4522 break; 4523 case write_pending: 4524 case active_idle: 4525 case broken: 4526 /* these cannot be set */ 4527 break; 4528 } 4529 4530 if (!err) { 4531 if (mddev->hold_active == UNTIL_IOCTL) 4532 mddev->hold_active = 0; 4533 sysfs_notify_dirent_safe(mddev->sysfs_state); 4534 } 4535 mddev_unlock(mddev); 4536 return err ?: len; 4537 } 4538 static struct md_sysfs_entry md_array_state = 4539 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4540 4541 static ssize_t max_corrected_read_errors_show(struct mddev * mddev,char * page)4542 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4543 return sprintf(page, "%d\n", 4544 atomic_read(&mddev->max_corr_read_errors)); 4545 } 4546 4547 static ssize_t max_corrected_read_errors_store(struct mddev * mddev,const char * buf,size_t len)4548 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4549 { 4550 unsigned int n; 4551 int rv; 4552 4553 rv = kstrtouint(buf, 10, &n); 4554 if (rv < 0) 4555 return rv; 4556 if (n > INT_MAX) 4557 return -EINVAL; 4558 atomic_set(&mddev->max_corr_read_errors, n); 4559 return len; 4560 } 4561 4562 static struct md_sysfs_entry max_corr_read_errors = 4563 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4564 max_corrected_read_errors_store); 4565 4566 static ssize_t null_show(struct mddev * mddev,char * page)4567 null_show(struct mddev *mddev, char *page) 4568 { 4569 return -EINVAL; 4570 } 4571 4572 static ssize_t new_dev_store(struct mddev * mddev,const char * buf,size_t len)4573 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4574 { 4575 /* buf must be %d:%d\n? giving major and minor numbers */ 4576 /* The new device is added to the array. 
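 * (For example -- the numbers are illustrative, not from this file --
 * writing "8:16" requests the block device with major 8, minor 16;
 * the string is parsed and combined with MKDEV(major, minor) below.)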
4577 * If the array has a persistent superblock, we read the 4578 * superblock to initialise info and check validity. 4579 * Otherwise, only checking done is that in bind_rdev_to_array, 4580 * which mainly checks size. 4581 */ 4582 char *e; 4583 int major = simple_strtoul(buf, &e, 10); 4584 int minor; 4585 dev_t dev; 4586 struct md_rdev *rdev; 4587 int err; 4588 4589 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4590 return -EINVAL; 4591 minor = simple_strtoul(e+1, &e, 10); 4592 if (*e && *e != '\n') 4593 return -EINVAL; 4594 dev = MKDEV(major, minor); 4595 if (major != MAJOR(dev) || 4596 minor != MINOR(dev)) 4597 return -EOVERFLOW; 4598 4599 err = mddev_lock(mddev); 4600 if (err) 4601 return err; 4602 if (mddev->persistent) { 4603 rdev = md_import_device(dev, mddev->major_version, 4604 mddev->minor_version); 4605 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4606 struct md_rdev *rdev0 4607 = list_entry(mddev->disks.next, 4608 struct md_rdev, same_set); 4609 err = super_types[mddev->major_version] 4610 .load_super(rdev, rdev0, mddev->minor_version); 4611 if (err < 0) 4612 goto out; 4613 } 4614 } else if (mddev->external) 4615 rdev = md_import_device(dev, -2, -1); 4616 else 4617 rdev = md_import_device(dev, -1, -1); 4618 4619 if (IS_ERR(rdev)) { 4620 mddev_unlock(mddev); 4621 return PTR_ERR(rdev); 4622 } 4623 err = bind_rdev_to_array(rdev, mddev); 4624 out: 4625 if (err) 4626 export_rdev(rdev, mddev); 4627 mddev_unlock(mddev); 4628 if (!err) 4629 md_new_event(); 4630 return err ? err : len; 4631 } 4632 4633 static struct md_sysfs_entry md_new_device = 4634 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4635 4636 static ssize_t bitmap_store(struct mddev * mddev,const char * buf,size_t len)4637 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4638 { 4639 char *end; 4640 unsigned long chunk, end_chunk; 4641 int err; 4642 4643 err = mddev_lock(mddev); 4644 if (err) 4645 return err; 4646 if (!mddev->bitmap) 4647 goto out; 4648 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4649 while (*buf) { 4650 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4651 if (buf == end) break; 4652 if (*end == '-') { /* range */ 4653 buf = end + 1; 4654 end_chunk = simple_strtoul(buf, &end, 0); 4655 if (buf == end) break; 4656 } 4657 if (*end && !isspace(*end)) break; 4658 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4659 buf = skip_spaces(end); 4660 } 4661 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4662 out: 4663 mddev_unlock(mddev); 4664 return len; 4665 } 4666 4667 static struct md_sysfs_entry md_bitmap = 4668 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4669 4670 static ssize_t size_show(struct mddev * mddev,char * page)4671 size_show(struct mddev *mddev, char *page) 4672 { 4673 return sprintf(page, "%llu\n", 4674 (unsigned long long)mddev->dev_sectors / 2); 4675 } 4676 4677 static int update_size(struct mddev *mddev, sector_t num_sectors); 4678 4679 static ssize_t size_store(struct mddev * mddev,const char * buf,size_t len)4680 size_store(struct mddev *mddev, const char *buf, size_t len) 4681 { 4682 /* If array is inactive, we can reduce the component size, but 4683 * not increase it (except from 0). 
4684  * If array is active, we can try an on-line resize
4685  */
4686 	sector_t sectors;
4687 	int err = strict_blocks_to_sectors(buf, &sectors);
4688
4689 	if (err < 0)
4690 		return err;
4691 	err = mddev_lock(mddev);
4692 	if (err)
4693 		return err;
4694 	if (mddev->pers) {
4695 		err = update_size(mddev, sectors);
4696 		if (err == 0)
4697 			md_update_sb(mddev, 1);
4698 	} else {
4699 		if (mddev->dev_sectors == 0 ||
4700 		    mddev->dev_sectors > sectors)
4701 			mddev->dev_sectors = sectors;
4702 		else
4703 			err = -ENOSPC;
4704 	}
4705 	mddev_unlock(mddev);
4706 	return err ? err : len;
4707 }
4708
4709 static struct md_sysfs_entry md_size =
4710 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4711
4712 /* Metadata version.
4713  * This is one of
4714  *   'none' for arrays with no metadata (good luck...)
4715  *   'external' for arrays with externally managed metadata,
4716  * or N.M for internally known formats
4717  */
4718 static ssize_t
4719 metadata_show(struct mddev *mddev, char *page)
4720 {
4721 	if (mddev->persistent)
4722 		return sprintf(page, "%d.%d\n",
4723 			       mddev->major_version, mddev->minor_version);
4724 	else if (mddev->external)
4725 		return sprintf(page, "external:%s\n", mddev->metadata_type);
4726 	else
4727 		return sprintf(page, "none\n");
4728 }
4729
4730 static ssize_t
4731 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4732 {
4733 	int major, minor;
4734 	char *e;
4735 	int err;
4736 	/* Changing the details of 'external' metadata is
4737 	 * always permitted.  Otherwise there must be
4738 	 * no devices attached to the array.
4739 	 */
4740
4741 	err = mddev_lock(mddev);
4742 	if (err)
4743 		return err;
4744 	err = -EBUSY;
4745 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4746 		;
4747 	else if (!list_empty(&mddev->disks))
4748 		goto out_unlock;
4749
4750 	err = 0;
4751 	if (cmd_match(buf, "none")) {
4752 		mddev->persistent = 0;
4753 		mddev->external = 0;
4754 		mddev->major_version = 0;
4755 		mddev->minor_version = 90;
4756 		goto out_unlock;
4757 	}
4758 	if (strncmp(buf, "external:", 9) == 0) {
4759 		size_t namelen = len-9;
4760 		if (namelen >= sizeof(mddev->metadata_type))
4761 			namelen = sizeof(mddev->metadata_type)-1;
4762 		strncpy(mddev->metadata_type, buf+9, namelen);
4763 		mddev->metadata_type[namelen] = 0;
4764 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4765 			mddev->metadata_type[--namelen] = 0;
4766 		mddev->persistent = 0;
4767 		mddev->external = 1;
4768 		mddev->major_version = 0;
4769 		mddev->minor_version = 90;
4770 		goto out_unlock;
4771 	}
4772 	major = simple_strtoul(buf, &e, 10);
4773 	err = -EINVAL;
4774 	if (e==buf || *e != '.')
4775 		goto out_unlock;
4776 	buf = e+1;
4777 	minor = simple_strtoul(buf, &e, 10);
4778 	if (e==buf || (*e && *e != '\n') )
4779 		goto out_unlock;
4780 	err = -ENOENT;
4781 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4782 		goto out_unlock;
4783 	mddev->major_version = major;
4784 	mddev->minor_version = minor;
4785 	mddev->persistent = 1;
4786 	mddev->external = 0;
4787 	err = 0;
4788 out_unlock:
4789 	mddev_unlock(mddev);
4790 	return err ?: len;
4791 }
4792
4793 static struct md_sysfs_entry md_metadata =
4794 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4795
4796 static ssize_t
4797 action_show(struct mddev *mddev, char *page)
4798 {
4799 	char *type = "idle";
4800 	unsigned long recovery = mddev->recovery;
4801 	if (test_bit(MD_RECOVERY_FROZEN,
&recovery)) 4802 type = "frozen"; 4803 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4804 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4805 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4806 type = "reshape"; 4807 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4808 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4809 type = "resync"; 4810 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4811 type = "check"; 4812 else 4813 type = "repair"; 4814 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4815 type = "recover"; 4816 else if (mddev->reshape_position != MaxSector) 4817 type = "reshape"; 4818 } 4819 return sprintf(page, "%s\n", type); 4820 } 4821 stop_sync_thread(struct mddev * mddev)4822 static void stop_sync_thread(struct mddev *mddev) 4823 { 4824 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4825 return; 4826 4827 if (mddev_lock(mddev)) 4828 return; 4829 4830 /* 4831 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4832 * held. 4833 */ 4834 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4835 mddev_unlock(mddev); 4836 return; 4837 } 4838 4839 if (work_pending(&mddev->sync_work)) 4840 flush_workqueue(md_misc_wq); 4841 4842 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4843 /* 4844 * Thread might be blocked waiting for metadata update which will now 4845 * never happen 4846 */ 4847 md_wakeup_thread_directly(mddev->sync_thread); 4848 4849 mddev_unlock(mddev); 4850 } 4851 idle_sync_thread(struct mddev * mddev)4852 static void idle_sync_thread(struct mddev *mddev) 4853 { 4854 int sync_seq = atomic_read(&mddev->sync_seq); 4855 4856 mutex_lock(&mddev->sync_mutex); 4857 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4858 stop_sync_thread(mddev); 4859 4860 wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4861 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4862 4863 mutex_unlock(&mddev->sync_mutex); 4864 } 4865 frozen_sync_thread(struct mddev * mddev)4866 static void frozen_sync_thread(struct mddev *mddev) 4867 { 4868 mutex_lock(&mddev->sync_mutex); 4869 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4870 stop_sync_thread(mddev); 4871 4872 wait_event(resync_wait, mddev->sync_thread == NULL && 4873 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4874 4875 mutex_unlock(&mddev->sync_mutex); 4876 } 4877 4878 static ssize_t action_store(struct mddev * mddev,const char * page,size_t len)4879 action_store(struct mddev *mddev, const char *page, size_t len) 4880 { 4881 if (!mddev->pers || !mddev->pers->sync_request) 4882 return -EINVAL; 4883 4884 4885 if (cmd_match(page, "idle")) 4886 idle_sync_thread(mddev); 4887 else if (cmd_match(page, "frozen")) 4888 frozen_sync_thread(mddev); 4889 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4890 return -EBUSY; 4891 else if (cmd_match(page, "resync")) 4892 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4893 else if (cmd_match(page, "recover")) { 4894 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4895 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4896 } else if (cmd_match(page, "reshape")) { 4897 int err; 4898 if (mddev->pers->start_reshape == NULL) 4899 return -EINVAL; 4900 err = mddev_lock(mddev); 4901 if (!err) { 4902 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4903 err = -EBUSY; 4904 } else if (mddev->reshape_position == MaxSector || 4905 mddev->pers->check_reshape == NULL || 4906 mddev->pers->check_reshape(mddev)) { 4907 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4908 err = mddev->pers->start_reshape(mddev); 4909 } else { 4910 /* 
4911 * If reshape is still in progress, and 4912 * md_check_recovery() can continue to reshape, 4913 * don't restart reshape because data can be 4914 * corrupted for raid456. 4915 */ 4916 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4917 } 4918 mddev_unlock(mddev); 4919 } 4920 if (err) 4921 return err; 4922 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4923 } else { 4924 if (cmd_match(page, "check")) 4925 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4926 else if (!cmd_match(page, "repair")) 4927 return -EINVAL; 4928 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4929 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4930 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4931 } 4932 if (mddev->ro == MD_AUTO_READ) { 4933 /* A write to sync_action is enough to justify 4934 * canceling read-auto mode 4935 */ 4936 mddev->ro = MD_RDWR; 4937 md_wakeup_thread(mddev->sync_thread); 4938 } 4939 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4940 md_wakeup_thread(mddev->thread); 4941 sysfs_notify_dirent_safe(mddev->sysfs_action); 4942 return len; 4943 } 4944 4945 static struct md_sysfs_entry md_scan_mode = 4946 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4947 4948 static ssize_t last_sync_action_show(struct mddev * mddev,char * page)4949 last_sync_action_show(struct mddev *mddev, char *page) 4950 { 4951 return sprintf(page, "%s\n", mddev->last_sync_action); 4952 } 4953 4954 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4955 4956 static ssize_t mismatch_cnt_show(struct mddev * mddev,char * page)4957 mismatch_cnt_show(struct mddev *mddev, char *page) 4958 { 4959 return sprintf(page, "%llu\n", 4960 (unsigned long long) 4961 atomic64_read(&mddev->resync_mismatches)); 4962 } 4963 4964 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4965 4966 static ssize_t sync_min_show(struct mddev * mddev,char * page)4967 sync_min_show(struct mddev *mddev, char *page) 4968 { 4969 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4970 mddev->sync_speed_min ? "local": "system"); 4971 } 4972 4973 static ssize_t sync_min_store(struct mddev * mddev,const char * buf,size_t len)4974 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4975 { 4976 unsigned int min; 4977 int rv; 4978 4979 if (strncmp(buf, "system", 6)==0) { 4980 min = 0; 4981 } else { 4982 rv = kstrtouint(buf, 10, &min); 4983 if (rv < 0) 4984 return rv; 4985 if (min == 0) 4986 return -EINVAL; 4987 } 4988 mddev->sync_speed_min = min; 4989 return len; 4990 } 4991 4992 static struct md_sysfs_entry md_sync_min = 4993 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4994 4995 static ssize_t sync_max_show(struct mddev * mddev,char * page)4996 sync_max_show(struct mddev *mddev, char *page) 4997 { 4998 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4999 mddev->sync_speed_max ? 
"local": "system"); 5000 } 5001 5002 static ssize_t sync_max_store(struct mddev * mddev,const char * buf,size_t len)5003 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5004 { 5005 unsigned int max; 5006 int rv; 5007 5008 if (strncmp(buf, "system", 6)==0) { 5009 max = 0; 5010 } else { 5011 rv = kstrtouint(buf, 10, &max); 5012 if (rv < 0) 5013 return rv; 5014 if (max == 0) 5015 return -EINVAL; 5016 } 5017 mddev->sync_speed_max = max; 5018 return len; 5019 } 5020 5021 static struct md_sysfs_entry md_sync_max = 5022 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5023 5024 static ssize_t degraded_show(struct mddev * mddev,char * page)5025 degraded_show(struct mddev *mddev, char *page) 5026 { 5027 return sprintf(page, "%d\n", mddev->degraded); 5028 } 5029 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5030 5031 static ssize_t sync_force_parallel_show(struct mddev * mddev,char * page)5032 sync_force_parallel_show(struct mddev *mddev, char *page) 5033 { 5034 return sprintf(page, "%d\n", mddev->parallel_resync); 5035 } 5036 5037 static ssize_t sync_force_parallel_store(struct mddev * mddev,const char * buf,size_t len)5038 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5039 { 5040 long n; 5041 5042 if (kstrtol(buf, 10, &n)) 5043 return -EINVAL; 5044 5045 if (n != 0 && n != 1) 5046 return -EINVAL; 5047 5048 mddev->parallel_resync = n; 5049 5050 if (mddev->sync_thread) 5051 wake_up(&resync_wait); 5052 5053 return len; 5054 } 5055 5056 /* force parallel resync, even with shared block devices */ 5057 static struct md_sysfs_entry md_sync_force_parallel = 5058 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5059 sync_force_parallel_show, sync_force_parallel_store); 5060 5061 static ssize_t sync_speed_show(struct mddev * mddev,char * page)5062 sync_speed_show(struct mddev *mddev, char *page) 5063 { 5064 unsigned long resync, dt, db; 5065 if (mddev->curr_resync == MD_RESYNC_NONE) 5066 return sprintf(page, "none\n"); 5067 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5068 dt = (jiffies - mddev->resync_mark) / HZ; 5069 if (!dt) dt++; 5070 db = resync - mddev->resync_mark_cnt; 5071 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5072 } 5073 5074 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5075 5076 static ssize_t sync_completed_show(struct mddev * mddev,char * page)5077 sync_completed_show(struct mddev *mddev, char *page) 5078 { 5079 unsigned long long max_sectors, resync; 5080 5081 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5082 return sprintf(page, "none\n"); 5083 5084 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5085 mddev->curr_resync == MD_RESYNC_DELAYED) 5086 return sprintf(page, "delayed\n"); 5087 5088 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5089 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5090 max_sectors = mddev->resync_max_sectors; 5091 else 5092 max_sectors = mddev->dev_sectors; 5093 5094 resync = mddev->curr_resync_completed; 5095 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5096 } 5097 5098 static struct md_sysfs_entry md_sync_completed = 5099 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5100 5101 static ssize_t min_sync_show(struct mddev * mddev,char * page)5102 min_sync_show(struct mddev *mddev, char *page) 5103 { 5104 return sprintf(page, "%llu\n", 5105 (unsigned long long)mddev->resync_min); 5106 } 5107 static ssize_t min_sync_store(struct mddev * mddev,const char * buf,size_t len)5108 
min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5109 { 5110 unsigned long long min; 5111 int err; 5112 5113 if (kstrtoull(buf, 10, &min)) 5114 return -EINVAL; 5115 5116 spin_lock(&mddev->lock); 5117 err = -EINVAL; 5118 if (min > mddev->resync_max) 5119 goto out_unlock; 5120 5121 err = -EBUSY; 5122 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5123 goto out_unlock; 5124 5125 /* Round down to multiple of 4K for safety */ 5126 mddev->resync_min = round_down(min, 8); 5127 err = 0; 5128 5129 out_unlock: 5130 spin_unlock(&mddev->lock); 5131 return err ?: len; 5132 } 5133 5134 static struct md_sysfs_entry md_min_sync = 5135 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5136 5137 static ssize_t max_sync_show(struct mddev * mddev,char * page)5138 max_sync_show(struct mddev *mddev, char *page) 5139 { 5140 if (mddev->resync_max == MaxSector) 5141 return sprintf(page, "max\n"); 5142 else 5143 return sprintf(page, "%llu\n", 5144 (unsigned long long)mddev->resync_max); 5145 } 5146 static ssize_t max_sync_store(struct mddev * mddev,const char * buf,size_t len)5147 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5148 { 5149 int err; 5150 spin_lock(&mddev->lock); 5151 if (strncmp(buf, "max", 3) == 0) 5152 mddev->resync_max = MaxSector; 5153 else { 5154 unsigned long long max; 5155 int chunk; 5156 5157 err = -EINVAL; 5158 if (kstrtoull(buf, 10, &max)) 5159 goto out_unlock; 5160 if (max < mddev->resync_min) 5161 goto out_unlock; 5162 5163 err = -EBUSY; 5164 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5165 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5166 goto out_unlock; 5167 5168 /* Must be a multiple of chunk_size */ 5169 chunk = mddev->chunk_sectors; 5170 if (chunk) { 5171 sector_t temp = max; 5172 5173 err = -EINVAL; 5174 if (sector_div(temp, chunk)) 5175 goto out_unlock; 5176 } 5177 mddev->resync_max = max; 5178 } 5179 wake_up(&mddev->recovery_wait); 5180 err = 0; 5181 out_unlock: 5182 spin_unlock(&mddev->lock); 5183 return err ?: len; 5184 } 5185 5186 static struct md_sysfs_entry md_max_sync = 5187 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5188 5189 static ssize_t suspend_lo_show(struct mddev * mddev,char * page)5190 suspend_lo_show(struct mddev *mddev, char *page) 5191 { 5192 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 5193 } 5194 5195 static ssize_t suspend_lo_store(struct mddev * mddev,const char * buf,size_t len)5196 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5197 { 5198 unsigned long long new; 5199 int err; 5200 5201 err = kstrtoull(buf, 10, &new); 5202 if (err < 0) 5203 return err; 5204 if (new != (sector_t)new) 5205 return -EINVAL; 5206 5207 err = mddev_lock(mddev); 5208 if (err) 5209 return err; 5210 err = -EINVAL; 5211 if (mddev->pers == NULL || 5212 mddev->pers->quiesce == NULL) 5213 goto unlock; 5214 mddev_suspend(mddev); 5215 mddev->suspend_lo = new; 5216 mddev_resume(mddev); 5217 5218 err = 0; 5219 unlock: 5220 mddev_unlock(mddev); 5221 return err ?: len; 5222 } 5223 static struct md_sysfs_entry md_suspend_lo = 5224 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5225 5226 static ssize_t suspend_hi_show(struct mddev * mddev,char * page)5227 suspend_hi_show(struct mddev *mddev, char *page) 5228 { 5229 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 5230 } 5231 5232 static ssize_t suspend_hi_store(struct mddev * mddev,const char * buf,size_t len)5233 suspend_hi_store(struct mddev *mddev, const char 
*buf, size_t len) 5234 { 5235 unsigned long long new; 5236 int err; 5237 5238 err = kstrtoull(buf, 10, &new); 5239 if (err < 0) 5240 return err; 5241 if (new != (sector_t)new) 5242 return -EINVAL; 5243 5244 err = mddev_lock(mddev); 5245 if (err) 5246 return err; 5247 err = -EINVAL; 5248 if (mddev->pers == NULL) 5249 goto unlock; 5250 5251 mddev_suspend(mddev); 5252 mddev->suspend_hi = new; 5253 mddev_resume(mddev); 5254 5255 err = 0; 5256 unlock: 5257 mddev_unlock(mddev); 5258 return err ?: len; 5259 } 5260 static struct md_sysfs_entry md_suspend_hi = 5261 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5262 5263 static ssize_t reshape_position_show(struct mddev * mddev,char * page)5264 reshape_position_show(struct mddev *mddev, char *page) 5265 { 5266 if (mddev->reshape_position != MaxSector) 5267 return sprintf(page, "%llu\n", 5268 (unsigned long long)mddev->reshape_position); 5269 strcpy(page, "none\n"); 5270 return 5; 5271 } 5272 5273 static ssize_t reshape_position_store(struct mddev * mddev,const char * buf,size_t len)5274 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5275 { 5276 struct md_rdev *rdev; 5277 unsigned long long new; 5278 int err; 5279 5280 err = kstrtoull(buf, 10, &new); 5281 if (err < 0) 5282 return err; 5283 if (new != (sector_t)new) 5284 return -EINVAL; 5285 err = mddev_lock(mddev); 5286 if (err) 5287 return err; 5288 err = -EBUSY; 5289 if (mddev->pers) 5290 goto unlock; 5291 mddev->reshape_position = new; 5292 mddev->delta_disks = 0; 5293 mddev->reshape_backwards = 0; 5294 mddev->new_level = mddev->level; 5295 mddev->new_layout = mddev->layout; 5296 mddev->new_chunk_sectors = mddev->chunk_sectors; 5297 rdev_for_each(rdev, mddev) 5298 rdev->new_data_offset = rdev->data_offset; 5299 err = 0; 5300 unlock: 5301 mddev_unlock(mddev); 5302 return err ?: len; 5303 } 5304 5305 static struct md_sysfs_entry md_reshape_position = 5306 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5307 reshape_position_store); 5308 5309 static ssize_t reshape_direction_show(struct mddev * mddev,char * page)5310 reshape_direction_show(struct mddev *mddev, char *page) 5311 { 5312 return sprintf(page, "%s\n", 5313 mddev->reshape_backwards ? 
"backwards" : "forwards"); 5314 } 5315 5316 static ssize_t reshape_direction_store(struct mddev * mddev,const char * buf,size_t len)5317 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5318 { 5319 int backwards = 0; 5320 int err; 5321 5322 if (cmd_match(buf, "forwards")) 5323 backwards = 0; 5324 else if (cmd_match(buf, "backwards")) 5325 backwards = 1; 5326 else 5327 return -EINVAL; 5328 if (mddev->reshape_backwards == backwards) 5329 return len; 5330 5331 err = mddev_lock(mddev); 5332 if (err) 5333 return err; 5334 /* check if we are allowed to change */ 5335 if (mddev->delta_disks) 5336 err = -EBUSY; 5337 else if (mddev->persistent && 5338 mddev->major_version == 0) 5339 err = -EINVAL; 5340 else 5341 mddev->reshape_backwards = backwards; 5342 mddev_unlock(mddev); 5343 return err ?: len; 5344 } 5345 5346 static struct md_sysfs_entry md_reshape_direction = 5347 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5348 reshape_direction_store); 5349 5350 static ssize_t array_size_show(struct mddev * mddev,char * page)5351 array_size_show(struct mddev *mddev, char *page) 5352 { 5353 if (mddev->external_size) 5354 return sprintf(page, "%llu\n", 5355 (unsigned long long)mddev->array_sectors/2); 5356 else 5357 return sprintf(page, "default\n"); 5358 } 5359 5360 static ssize_t array_size_store(struct mddev * mddev,const char * buf,size_t len)5361 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5362 { 5363 sector_t sectors; 5364 int err; 5365 5366 err = mddev_lock(mddev); 5367 if (err) 5368 return err; 5369 5370 /* cluster raid doesn't support change array_sectors */ 5371 if (mddev_is_clustered(mddev)) { 5372 mddev_unlock(mddev); 5373 return -EINVAL; 5374 } 5375 5376 if (strncmp(buf, "default", 7) == 0) { 5377 if (mddev->pers) 5378 sectors = mddev->pers->size(mddev, 0, 0); 5379 else 5380 sectors = mddev->array_sectors; 5381 5382 mddev->external_size = 0; 5383 } else { 5384 if (strict_blocks_to_sectors(buf, §ors) < 0) 5385 err = -EINVAL; 5386 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5387 err = -E2BIG; 5388 else 5389 mddev->external_size = 1; 5390 } 5391 5392 if (!err) { 5393 mddev->array_sectors = sectors; 5394 if (mddev->pers) 5395 set_capacity_and_notify(mddev->gendisk, 5396 mddev->array_sectors); 5397 } 5398 mddev_unlock(mddev); 5399 return err ?: len; 5400 } 5401 5402 static struct md_sysfs_entry md_array_size = 5403 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5404 array_size_store); 5405 5406 static ssize_t consistency_policy_show(struct mddev * mddev,char * page)5407 consistency_policy_show(struct mddev *mddev, char *page) 5408 { 5409 int ret; 5410 5411 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5412 ret = sprintf(page, "journal\n"); 5413 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5414 ret = sprintf(page, "ppl\n"); 5415 } else if (mddev->bitmap) { 5416 ret = sprintf(page, "bitmap\n"); 5417 } else if (mddev->pers) { 5418 if (mddev->pers->sync_request) 5419 ret = sprintf(page, "resync\n"); 5420 else 5421 ret = sprintf(page, "none\n"); 5422 } else { 5423 ret = sprintf(page, "unknown\n"); 5424 } 5425 5426 return ret; 5427 } 5428 5429 static ssize_t consistency_policy_store(struct mddev * mddev,const char * buf,size_t len)5430 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5431 { 5432 int err = 0; 5433 5434 if (mddev->pers) { 5435 if (mddev->pers->change_consistency_policy) 5436 err = mddev->pers->change_consistency_policy(mddev, buf); 5437 else 5438 err = -EBUSY; 
5439 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5440 set_bit(MD_HAS_PPL, &mddev->flags); 5441 } else { 5442 err = -EINVAL; 5443 } 5444 5445 return err ? err : len; 5446 } 5447 5448 static struct md_sysfs_entry md_consistency_policy = 5449 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5450 consistency_policy_store); 5451 fail_last_dev_show(struct mddev * mddev,char * page)5452 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5453 { 5454 return sprintf(page, "%d\n", mddev->fail_last_dev); 5455 } 5456 5457 /* 5458 * Setting fail_last_dev to true to allow last device to be forcibly removed 5459 * from RAID1/RAID10. 5460 */ 5461 static ssize_t fail_last_dev_store(struct mddev * mddev,const char * buf,size_t len)5462 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5463 { 5464 int ret; 5465 bool value; 5466 5467 ret = kstrtobool(buf, &value); 5468 if (ret) 5469 return ret; 5470 5471 if (value != mddev->fail_last_dev) 5472 mddev->fail_last_dev = value; 5473 5474 return len; 5475 } 5476 static struct md_sysfs_entry md_fail_last_dev = 5477 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5478 fail_last_dev_store); 5479 serialize_policy_show(struct mddev * mddev,char * page)5480 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5481 { 5482 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5483 return sprintf(page, "n/a\n"); 5484 else 5485 return sprintf(page, "%d\n", mddev->serialize_policy); 5486 } 5487 5488 /* 5489 * Setting serialize_policy to true to enforce write IO is not reordered 5490 * for raid1. 5491 */ 5492 static ssize_t serialize_policy_store(struct mddev * mddev,const char * buf,size_t len)5493 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5494 { 5495 int err; 5496 bool value; 5497 5498 err = kstrtobool(buf, &value); 5499 if (err) 5500 return err; 5501 5502 if (value == mddev->serialize_policy) 5503 return len; 5504 5505 err = mddev_lock(mddev); 5506 if (err) 5507 return err; 5508 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5509 pr_err("md: serialize_policy is only effective for raid1\n"); 5510 err = -EINVAL; 5511 goto unlock; 5512 } 5513 5514 mddev_suspend(mddev); 5515 if (value) 5516 mddev_create_serial_pool(mddev, NULL, true); 5517 else 5518 mddev_destroy_serial_pool(mddev, NULL, true); 5519 mddev->serialize_policy = value; 5520 mddev_resume(mddev); 5521 unlock: 5522 mddev_unlock(mddev); 5523 return err ?: len; 5524 } 5525 5526 static struct md_sysfs_entry md_serialize_policy = 5527 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5528 serialize_policy_store); 5529 5530 5531 static struct attribute *md_default_attrs[] = { 5532 &md_level.attr, 5533 &md_layout.attr, 5534 &md_raid_disks.attr, 5535 &md_uuid.attr, 5536 &md_chunk_size.attr, 5537 &md_size.attr, 5538 &md_resync_start.attr, 5539 &md_metadata.attr, 5540 &md_new_device.attr, 5541 &md_safe_delay.attr, 5542 &md_array_state.attr, 5543 &md_reshape_position.attr, 5544 &md_reshape_direction.attr, 5545 &md_array_size.attr, 5546 &max_corr_read_errors.attr, 5547 &md_consistency_policy.attr, 5548 &md_fail_last_dev.attr, 5549 &md_serialize_policy.attr, 5550 NULL, 5551 }; 5552 5553 static const struct attribute_group md_default_group = { 5554 .attrs = md_default_attrs, 5555 }; 5556 5557 static struct attribute *md_redundancy_attrs[] = { 5558 &md_scan_mode.attr, 5559 &md_last_scan_mode.attr, 5560 &md_mismatches.attr, 5561 &md_sync_min.attr, 5562 &md_sync_max.attr, 
5563 &md_sync_speed.attr, 5564 &md_sync_force_parallel.attr, 5565 &md_sync_completed.attr, 5566 &md_min_sync.attr, 5567 &md_max_sync.attr, 5568 &md_suspend_lo.attr, 5569 &md_suspend_hi.attr, 5570 &md_bitmap.attr, 5571 &md_degraded.attr, 5572 NULL, 5573 }; 5574 static const struct attribute_group md_redundancy_group = { 5575 .name = NULL, 5576 .attrs = md_redundancy_attrs, 5577 }; 5578 5579 static const struct attribute_group *md_attr_groups[] = { 5580 &md_default_group, 5581 &md_bitmap_group, 5582 NULL, 5583 }; 5584 5585 static ssize_t md_attr_show(struct kobject * kobj,struct attribute * attr,char * page)5586 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5587 { 5588 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5589 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5590 ssize_t rv; 5591 5592 if (!entry->show) 5593 return -EIO; 5594 spin_lock(&all_mddevs_lock); 5595 if (!mddev_get(mddev)) { 5596 spin_unlock(&all_mddevs_lock); 5597 return -EBUSY; 5598 } 5599 spin_unlock(&all_mddevs_lock); 5600 5601 rv = entry->show(mddev, page); 5602 mddev_put(mddev); 5603 return rv; 5604 } 5605 5606 static ssize_t md_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)5607 md_attr_store(struct kobject *kobj, struct attribute *attr, 5608 const char *page, size_t length) 5609 { 5610 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5611 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5612 ssize_t rv; 5613 5614 if (!entry->store) 5615 return -EIO; 5616 if (!capable(CAP_SYS_ADMIN)) 5617 return -EACCES; 5618 spin_lock(&all_mddevs_lock); 5619 if (!mddev_get(mddev)) { 5620 spin_unlock(&all_mddevs_lock); 5621 return -EBUSY; 5622 } 5623 spin_unlock(&all_mddevs_lock); 5624 rv = entry->store(mddev, page, length); 5625 mddev_put(mddev); 5626 return rv; 5627 } 5628 md_kobj_release(struct kobject * ko)5629 static void md_kobj_release(struct kobject *ko) 5630 { 5631 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5632 5633 if (mddev->sysfs_state) 5634 sysfs_put(mddev->sysfs_state); 5635 if (mddev->sysfs_level) 5636 sysfs_put(mddev->sysfs_level); 5637 5638 del_gendisk(mddev->gendisk); 5639 put_disk(mddev->gendisk); 5640 } 5641 5642 static const struct sysfs_ops md_sysfs_ops = { 5643 .show = md_attr_show, 5644 .store = md_attr_store, 5645 }; 5646 static const struct kobj_type md_ktype = { 5647 .release = md_kobj_release, 5648 .sysfs_ops = &md_sysfs_ops, 5649 .default_groups = md_attr_groups, 5650 }; 5651 5652 int mdp_major = 0; 5653 mddev_delayed_delete(struct work_struct * ws)5654 static void mddev_delayed_delete(struct work_struct *ws) 5655 { 5656 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5657 5658 kobject_put(&mddev->kobj); 5659 } 5660 no_op(struct percpu_ref * r)5661 static void no_op(struct percpu_ref *r) {} 5662 mddev_init_writes_pending(struct mddev * mddev)5663 int mddev_init_writes_pending(struct mddev *mddev) 5664 { 5665 if (mddev->writes_pending.percpu_count_ptr) 5666 return 0; 5667 if (percpu_ref_init(&mddev->writes_pending, no_op, 5668 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) 5669 return -ENOMEM; 5670 /* We want to start with the refcount at zero */ 5671 percpu_ref_put(&mddev->writes_pending); 5672 return 0; 5673 } 5674 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5675 md_alloc(dev_t dev,char * name)5676 struct mddev *md_alloc(dev_t dev, char *name) 5677 { 5678 /* 5679 * If dev is zero, name is the name of a device to 
allocate with 5680 * an arbitrary minor number. It will be "md_???" 5681 * If dev is non-zero it must be a device number with a MAJOR of 5682 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5683 * the device is being created by opening a node in /dev. 5684 * If "name" is not NULL, the device is being created by 5685 * writing to /sys/module/md_mod/parameters/new_array. 5686 */ 5687 static DEFINE_MUTEX(disks_mutex); 5688 struct mddev *mddev; 5689 struct gendisk *disk; 5690 int partitioned; 5691 int shift; 5692 int unit; 5693 int error ; 5694 5695 /* 5696 * Wait for any previous instance of this device to be completely 5697 * removed (mddev_delayed_delete). 5698 */ 5699 flush_workqueue(md_misc_wq); 5700 5701 mutex_lock(&disks_mutex); 5702 mddev = mddev_alloc(dev); 5703 if (IS_ERR(mddev)) { 5704 error = PTR_ERR(mddev); 5705 goto out_unlock; 5706 } 5707 5708 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5709 shift = partitioned ? MdpMinorShift : 0; 5710 unit = MINOR(mddev->unit) >> shift; 5711 5712 if (name && !dev) { 5713 /* Need to ensure that 'name' is not a duplicate. 5714 */ 5715 struct mddev *mddev2; 5716 spin_lock(&all_mddevs_lock); 5717 5718 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5719 if (mddev2->gendisk && 5720 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5721 spin_unlock(&all_mddevs_lock); 5722 error = -EEXIST; 5723 goto out_free_mddev; 5724 } 5725 spin_unlock(&all_mddevs_lock); 5726 } 5727 if (name && dev) 5728 /* 5729 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5730 */ 5731 mddev->hold_active = UNTIL_STOP; 5732 5733 error = -ENOMEM; 5734 disk = blk_alloc_disk(NUMA_NO_NODE); 5735 if (!disk) 5736 goto out_free_mddev; 5737 5738 disk->major = MAJOR(mddev->unit); 5739 disk->first_minor = unit << shift; 5740 disk->minors = 1 << shift; 5741 if (name) 5742 strcpy(disk->disk_name, name); 5743 else if (partitioned) 5744 sprintf(disk->disk_name, "md_d%d", unit); 5745 else 5746 sprintf(disk->disk_name, "md%d", unit); 5747 disk->fops = &md_fops; 5748 disk->private_data = mddev; 5749 5750 mddev->queue = disk->queue; 5751 blk_set_stacking_limits(&mddev->queue->limits); 5752 blk_queue_write_cache(mddev->queue, true, true); 5753 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5754 mddev->gendisk = disk; 5755 error = add_disk(disk); 5756 if (error) 5757 goto out_put_disk; 5758 5759 kobject_init(&mddev->kobj, &md_ktype); 5760 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5761 if (error) { 5762 /* 5763 * The disk is already live at this point. Clear the hold flag 5764 * and let mddev_put take care of the deletion, as it isn't any 5765 * different from a normal close on last release now. 
5766 */ 5767 mddev->hold_active = 0; 5768 mutex_unlock(&disks_mutex); 5769 mddev_put(mddev); 5770 return ERR_PTR(error); 5771 } 5772 5773 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5774 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5775 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5776 mutex_unlock(&disks_mutex); 5777 return mddev; 5778 5779 out_put_disk: 5780 put_disk(disk); 5781 out_free_mddev: 5782 mddev_free(mddev); 5783 out_unlock: 5784 mutex_unlock(&disks_mutex); 5785 return ERR_PTR(error); 5786 } 5787 md_alloc_and_put(dev_t dev,char * name)5788 static int md_alloc_and_put(dev_t dev, char *name) 5789 { 5790 struct mddev *mddev = md_alloc(dev, name); 5791 5792 if (IS_ERR(mddev)) 5793 return PTR_ERR(mddev); 5794 mddev_put(mddev); 5795 return 0; 5796 } 5797 md_probe(dev_t dev)5798 static void md_probe(dev_t dev) 5799 { 5800 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5801 return; 5802 if (create_on_open) 5803 md_alloc_and_put(dev, NULL); 5804 } 5805 add_named_array(const char * val,const struct kernel_param * kp)5806 static int add_named_array(const char *val, const struct kernel_param *kp) 5807 { 5808 /* 5809 * val must be "md_*" or "mdNNN". 5810 * For "md_*" we allocate an array with a large free minor number, and 5811 * set the name to val. val must not already be an active name. 5812 * For "mdNNN" we allocate an array with the minor number NNN 5813 * which must not already be in use. 5814 */ 5815 int len = strlen(val); 5816 char buf[DISK_NAME_LEN]; 5817 unsigned long devnum; 5818 5819 while (len && val[len-1] == '\n') 5820 len--; 5821 if (len >= DISK_NAME_LEN) 5822 return -E2BIG; 5823 strscpy(buf, val, len+1); 5824 if (strncmp(buf, "md_", 3) == 0) 5825 return md_alloc_and_put(0, buf); 5826 if (strncmp(buf, "md", 2) == 0 && 5827 isdigit(buf[2]) && 5828 kstrtoul(buf+2, 10, &devnum) == 0 && 5829 devnum <= MINORMASK) 5830 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5831 5832 return -EINVAL; 5833 } 5834 md_safemode_timeout(struct timer_list * t)5835 static void md_safemode_timeout(struct timer_list *t) 5836 { 5837 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5838 5839 mddev->safemode = 1; 5840 if (mddev->external) 5841 sysfs_notify_dirent_safe(mddev->sysfs_state); 5842 5843 md_wakeup_thread(mddev->thread); 5844 } 5845 5846 static int start_dirty_degraded; active_io_release(struct percpu_ref * ref)5847 static void active_io_release(struct percpu_ref *ref) 5848 { 5849 struct mddev *mddev = container_of(ref, struct mddev, active_io); 5850 5851 wake_up(&mddev->sb_wait); 5852 } 5853 md_run(struct mddev * mddev)5854 int md_run(struct mddev *mddev) 5855 { 5856 int err; 5857 struct md_rdev *rdev; 5858 struct md_personality *pers; 5859 bool nowait = true; 5860 5861 if (list_empty(&mddev->disks)) 5862 /* cannot run an array with no devices.. 
*/ 5863 return -EINVAL; 5864 5865 if (mddev->pers) 5866 return -EBUSY; 5867 /* Cannot run until previous stop completes properly */ 5868 if (mddev->sysfs_active) 5869 return -EBUSY; 5870 5871 /* 5872 * Analyze all RAID superblock(s) 5873 */ 5874 if (!mddev->raid_disks) { 5875 if (!mddev->persistent) 5876 return -EINVAL; 5877 err = analyze_sbs(mddev); 5878 if (err) 5879 return -EINVAL; 5880 } 5881 5882 if (mddev->level != LEVEL_NONE) 5883 request_module("md-level-%d", mddev->level); 5884 else if (mddev->clevel[0]) 5885 request_module("md-%s", mddev->clevel); 5886 5887 /* 5888 * Drop all container device buffers, from now on 5889 * the only valid external interface is through the md 5890 * device. 5891 */ 5892 mddev->has_superblocks = false; 5893 rdev_for_each(rdev, mddev) { 5894 if (test_bit(Faulty, &rdev->flags)) 5895 continue; 5896 sync_blockdev(rdev->bdev); 5897 invalidate_bdev(rdev->bdev); 5898 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5899 mddev->ro = MD_RDONLY; 5900 if (mddev->gendisk) 5901 set_disk_ro(mddev->gendisk, 1); 5902 } 5903 5904 if (rdev->sb_page) 5905 mddev->has_superblocks = true; 5906 5907 /* perform some consistency tests on the device. 5908 * We don't want the data to overlap the metadata, 5909 * Internal Bitmap issues have been handled elsewhere. 5910 */ 5911 if (rdev->meta_bdev) { 5912 /* Nothing to check */; 5913 } else if (rdev->data_offset < rdev->sb_start) { 5914 if (mddev->dev_sectors && 5915 rdev->data_offset + mddev->dev_sectors 5916 > rdev->sb_start) { 5917 pr_warn("md: %s: data overlaps metadata\n", 5918 mdname(mddev)); 5919 return -EINVAL; 5920 } 5921 } else { 5922 if (rdev->sb_start + rdev->sb_size/512 5923 > rdev->data_offset) { 5924 pr_warn("md: %s: metadata overlaps data\n", 5925 mdname(mddev)); 5926 return -EINVAL; 5927 } 5928 } 5929 sysfs_notify_dirent_safe(rdev->sysfs_state); 5930 nowait = nowait && bdev_nowait(rdev->bdev); 5931 } 5932 5933 err = percpu_ref_init(&mddev->active_io, active_io_release, 5934 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); 5935 if (err) 5936 return err; 5937 5938 if (!bioset_initialized(&mddev->bio_set)) { 5939 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5940 if (err) 5941 goto exit_active_io; 5942 } 5943 if (!bioset_initialized(&mddev->sync_set)) { 5944 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5945 if (err) 5946 goto exit_bio_set; 5947 } 5948 5949 if (!bioset_initialized(&mddev->io_clone_set)) { 5950 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5951 offsetof(struct md_io_clone, bio_clone), 0); 5952 if (err) 5953 goto exit_sync_set; 5954 } 5955 5956 spin_lock(&pers_lock); 5957 pers = find_pers(mddev->level, mddev->clevel); 5958 if (!pers || !try_module_get(pers->owner)) { 5959 spin_unlock(&pers_lock); 5960 if (mddev->level != LEVEL_NONE) 5961 pr_warn("md: personality for level %d is not loaded!\n", 5962 mddev->level); 5963 else 5964 pr_warn("md: personality for level %s is not loaded!\n", 5965 mddev->clevel); 5966 err = -EINVAL; 5967 goto abort; 5968 } 5969 spin_unlock(&pers_lock); 5970 if (mddev->level != pers->level) { 5971 mddev->level = pers->level; 5972 mddev->new_level = pers->level; 5973 } 5974 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5975 5976 if (mddev->reshape_position != MaxSector && 5977 pers->start_reshape == NULL) { 5978 /* This personality cannot handle reshaping... 
*/ 5979 module_put(pers->owner); 5980 err = -EINVAL; 5981 goto abort; 5982 } 5983 5984 if (pers->sync_request) { 5985 /* Warn if this is a potentially silly 5986 * configuration. 5987 */ 5988 struct md_rdev *rdev2; 5989 int warned = 0; 5990 5991 rdev_for_each(rdev, mddev) 5992 rdev_for_each(rdev2, mddev) { 5993 if (rdev < rdev2 && 5994 rdev->bdev->bd_disk == 5995 rdev2->bdev->bd_disk) { 5996 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5997 mdname(mddev), 5998 rdev->bdev, 5999 rdev2->bdev); 6000 warned = 1; 6001 } 6002 } 6003 6004 if (warned) 6005 pr_warn("True protection against single-disk failure might be compromised.\n"); 6006 } 6007 6008 mddev->recovery = 0; 6009 /* may be over-ridden by personality */ 6010 mddev->resync_max_sectors = mddev->dev_sectors; 6011 6012 mddev->ok_start_degraded = start_dirty_degraded; 6013 6014 if (start_readonly && md_is_rdwr(mddev)) 6015 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6016 6017 err = pers->run(mddev); 6018 if (err) 6019 pr_warn("md: pers->run() failed ...\n"); 6020 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6021 WARN_ONCE(!mddev->external_size, 6022 "%s: default size too small, but 'external_size' not in effect?\n", 6023 __func__); 6024 pr_warn("md: invalid array_size %llu > default size %llu\n", 6025 (unsigned long long)mddev->array_sectors / 2, 6026 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6027 err = -EINVAL; 6028 } 6029 if (err == 0 && pers->sync_request && 6030 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6031 struct bitmap *bitmap; 6032 6033 bitmap = md_bitmap_create(mddev, -1); 6034 if (IS_ERR(bitmap)) { 6035 err = PTR_ERR(bitmap); 6036 pr_warn("%s: failed to create bitmap (%d)\n", 6037 mdname(mddev), err); 6038 } else 6039 mddev->bitmap = bitmap; 6040 6041 } 6042 if (err) 6043 goto bitmap_abort; 6044 6045 if (mddev->bitmap_info.max_write_behind > 0) { 6046 bool create_pool = false; 6047 6048 rdev_for_each(rdev, mddev) { 6049 if (test_bit(WriteMostly, &rdev->flags) && 6050 rdev_init_serial(rdev)) 6051 create_pool = true; 6052 } 6053 if (create_pool && mddev->serial_info_pool == NULL) { 6054 mddev->serial_info_pool = 6055 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6056 sizeof(struct serial_info)); 6057 if (!mddev->serial_info_pool) { 6058 err = -ENOMEM; 6059 goto bitmap_abort; 6060 } 6061 } 6062 } 6063 6064 if (mddev->queue) { 6065 bool nonrot = true; 6066 6067 rdev_for_each(rdev, mddev) { 6068 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6069 nonrot = false; 6070 break; 6071 } 6072 } 6073 if (mddev->degraded) 6074 nonrot = false; 6075 if (nonrot) 6076 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6077 else 6078 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6079 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6080 6081 /* Set the NOWAIT flags if all underlying devices support it */ 6082 if (nowait) 6083 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6084 } 6085 if (pers->sync_request) { 6086 if (mddev->kobj.sd && 6087 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6088 pr_warn("md: cannot register extra attributes for %s\n", 6089 mdname(mddev)); 6090 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6091 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6092 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6093 } else if (mddev->ro == MD_AUTO_READ) 6094 mddev->ro = MD_RDWR; 6095 6096 
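	/*
	 * Remaining bring-up defaults: seed the corrected-read-error budget
	 * (MD_DEFAULT_MAX_CORRECTED_READ_ERRORS) and pick the safemode delay;
	 * clustered arrays use a delay of 0 so the safemode timer is never
	 * armed for them.
	 */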
atomic_set(&mddev->max_corr_read_errors, 6097 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6098 mddev->safemode = 0; 6099 if (mddev_is_clustered(mddev)) 6100 mddev->safemode_delay = 0; 6101 else 6102 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6103 mddev->in_sync = 1; 6104 smp_wmb(); 6105 spin_lock(&mddev->lock); 6106 mddev->pers = pers; 6107 spin_unlock(&mddev->lock); 6108 rdev_for_each(rdev, mddev) 6109 if (rdev->raid_disk >= 0) 6110 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6111 6112 if (mddev->degraded && md_is_rdwr(mddev)) 6113 /* This ensures that recovering status is reported immediately 6114 * via sysfs - until a lack of spares is confirmed. 6115 */ 6116 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6117 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6118 6119 if (mddev->sb_flags) 6120 md_update_sb(mddev, 0); 6121 6122 md_new_event(); 6123 return 0; 6124 6125 bitmap_abort: 6126 mddev_detach(mddev); 6127 if (mddev->private) 6128 pers->free(mddev, mddev->private); 6129 mddev->private = NULL; 6130 module_put(pers->owner); 6131 md_bitmap_destroy(mddev); 6132 abort: 6133 bioset_exit(&mddev->io_clone_set); 6134 exit_sync_set: 6135 bioset_exit(&mddev->sync_set); 6136 exit_bio_set: 6137 bioset_exit(&mddev->bio_set); 6138 exit_active_io: 6139 percpu_ref_exit(&mddev->active_io); 6140 return err; 6141 } 6142 EXPORT_SYMBOL_GPL(md_run); 6143 do_md_run(struct mddev * mddev)6144 int do_md_run(struct mddev *mddev) 6145 { 6146 int err; 6147 6148 set_bit(MD_NOT_READY, &mddev->flags); 6149 err = md_run(mddev); 6150 if (err) 6151 goto out; 6152 err = md_bitmap_load(mddev); 6153 if (err) { 6154 md_bitmap_destroy(mddev); 6155 goto out; 6156 } 6157 6158 if (mddev_is_clustered(mddev)) 6159 md_allow_write(mddev); 6160 6161 /* run start up tasks that require md_thread */ 6162 md_start(mddev); 6163 6164 md_wakeup_thread(mddev->thread); 6165 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6166 6167 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6168 clear_bit(MD_NOT_READY, &mddev->flags); 6169 mddev->changed = 1; 6170 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6171 sysfs_notify_dirent_safe(mddev->sysfs_state); 6172 sysfs_notify_dirent_safe(mddev->sysfs_action); 6173 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6174 out: 6175 clear_bit(MD_NOT_READY, &mddev->flags); 6176 return err; 6177 } 6178 md_start(struct mddev * mddev)6179 int md_start(struct mddev *mddev) 6180 { 6181 int ret = 0; 6182 6183 if (mddev->pers->start) { 6184 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6185 md_wakeup_thread(mddev->thread); 6186 ret = mddev->pers->start(mddev); 6187 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6188 md_wakeup_thread(mddev->sync_thread); 6189 } 6190 return ret; 6191 } 6192 EXPORT_SYMBOL_GPL(md_start); 6193 restart_array(struct mddev * mddev)6194 static int restart_array(struct mddev *mddev) 6195 { 6196 struct gendisk *disk = mddev->gendisk; 6197 struct md_rdev *rdev; 6198 bool has_journal = false; 6199 bool has_readonly = false; 6200 6201 /* Complain if it has no devices */ 6202 if (list_empty(&mddev->disks)) 6203 return -ENXIO; 6204 if (!mddev->pers) 6205 return -EINVAL; 6206 if (md_is_rdwr(mddev)) 6207 return -EBUSY; 6208 6209 rcu_read_lock(); 6210 rdev_for_each_rcu(rdev, mddev) { 6211 if (test_bit(Journal, &rdev->flags) && 6212 !test_bit(Faulty, &rdev->flags)) 6213 has_journal = true; 6214 if (rdev_read_only(rdev)) 6215 has_readonly = true; 6216 } 6217 rcu_read_unlock(); 6218 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 
!has_journal) 6219 /* Don't restart rw with journal missing/faulty */ 6220 return -EINVAL; 6221 if (has_readonly) 6222 return -EROFS; 6223 6224 mddev->safemode = 0; 6225 mddev->ro = MD_RDWR; 6226 set_disk_ro(disk, 0); 6227 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6228 /* Kick recovery or resync if necessary */ 6229 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6230 md_wakeup_thread(mddev->thread); 6231 md_wakeup_thread(mddev->sync_thread); 6232 sysfs_notify_dirent_safe(mddev->sysfs_state); 6233 return 0; 6234 } 6235 md_clean(struct mddev * mddev)6236 static void md_clean(struct mddev *mddev) 6237 { 6238 mddev->array_sectors = 0; 6239 mddev->external_size = 0; 6240 mddev->dev_sectors = 0; 6241 mddev->raid_disks = 0; 6242 mddev->recovery_cp = 0; 6243 mddev->resync_min = 0; 6244 mddev->resync_max = MaxSector; 6245 mddev->reshape_position = MaxSector; 6246 /* we still need mddev->external in export_rdev, do not clear it yet */ 6247 mddev->persistent = 0; 6248 mddev->level = LEVEL_NONE; 6249 mddev->clevel[0] = 0; 6250 /* 6251 * Don't clear MD_CLOSING, or mddev can be opened again. 6252 * 'hold_active != 0' means mddev is still in the creation 6253 * process and will be used later. 6254 */ 6255 if (mddev->hold_active) 6256 mddev->flags = 0; 6257 else 6258 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6259 mddev->sb_flags = 0; 6260 mddev->ro = MD_RDWR; 6261 mddev->metadata_type[0] = 0; 6262 mddev->chunk_sectors = 0; 6263 mddev->ctime = mddev->utime = 0; 6264 mddev->layout = 0; 6265 mddev->max_disks = 0; 6266 mddev->events = 0; 6267 mddev->can_decrease_events = 0; 6268 mddev->delta_disks = 0; 6269 mddev->reshape_backwards = 0; 6270 mddev->new_level = LEVEL_NONE; 6271 mddev->new_layout = 0; 6272 mddev->new_chunk_sectors = 0; 6273 mddev->curr_resync = MD_RESYNC_NONE; 6274 atomic64_set(&mddev->resync_mismatches, 0); 6275 mddev->suspend_lo = mddev->suspend_hi = 0; 6276 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6277 mddev->recovery = 0; 6278 mddev->in_sync = 0; 6279 mddev->changed = 0; 6280 mddev->degraded = 0; 6281 mddev->safemode = 0; 6282 mddev->private = NULL; 6283 mddev->cluster_info = NULL; 6284 mddev->bitmap_info.offset = 0; 6285 mddev->bitmap_info.default_offset = 0; 6286 mddev->bitmap_info.default_space = 0; 6287 mddev->bitmap_info.chunksize = 0; 6288 mddev->bitmap_info.daemon_sleep = 0; 6289 mddev->bitmap_info.max_write_behind = 0; 6290 mddev->bitmap_info.nodes = 0; 6291 } 6292 __md_stop_writes(struct mddev * mddev)6293 static void __md_stop_writes(struct mddev *mddev) 6294 { 6295 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6296 if (work_pending(&mddev->sync_work)) 6297 flush_workqueue(md_misc_wq); 6298 if (mddev->sync_thread) { 6299 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6300 md_reap_sync_thread(mddev); 6301 } 6302 6303 del_timer_sync(&mddev->safemode_timer); 6304 6305 if (mddev->pers && mddev->pers->quiesce) { 6306 mddev->pers->quiesce(mddev, 1); 6307 mddev->pers->quiesce(mddev, 0); 6308 } 6309 md_bitmap_flush(mddev); 6310 6311 if (md_is_rdwr(mddev) && 6312 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6313 mddev->sb_flags)) { 6314 /* mark array as shutdown cleanly */ 6315 if (!mddev_is_clustered(mddev)) 6316 mddev->in_sync = 1; 6317 md_update_sb(mddev, 1); 6318 } 6319 /* disable policy to guarantee rdevs free resources for serialization */ 6320 mddev->serialize_policy = 0; 6321 mddev_destroy_serial_pool(mddev, NULL, true); 6322 } 6323 md_stop_writes(struct mddev * mddev)6324 void md_stop_writes(struct mddev *mddev) 6325 { 6326 
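	/*
	 * Exported wrapper: take the reconfig mutex around __md_stop_writes()
	 * for callers outside md.c (e.g. dm-raid) that do not already hold it.
	 */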
mddev_lock_nointr(mddev); 6327 __md_stop_writes(mddev); 6328 mddev_unlock(mddev); 6329 } 6330 EXPORT_SYMBOL_GPL(md_stop_writes); 6331 mddev_detach(struct mddev * mddev)6332 static void mddev_detach(struct mddev *mddev) 6333 { 6334 md_bitmap_wait_behind_writes(mddev); 6335 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6336 mddev->pers->quiesce(mddev, 1); 6337 mddev->pers->quiesce(mddev, 0); 6338 } 6339 md_unregister_thread(mddev, &mddev->thread); 6340 if (mddev->queue) 6341 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6342 } 6343 __md_stop(struct mddev * mddev)6344 static void __md_stop(struct mddev *mddev) 6345 { 6346 struct md_personality *pers = mddev->pers; 6347 md_bitmap_destroy(mddev); 6348 mddev_detach(mddev); 6349 /* Ensure ->event_work is done */ 6350 if (mddev->event_work.func) 6351 flush_workqueue(md_misc_wq); 6352 spin_lock(&mddev->lock); 6353 mddev->pers = NULL; 6354 spin_unlock(&mddev->lock); 6355 if (mddev->private) 6356 pers->free(mddev, mddev->private); 6357 mddev->private = NULL; 6358 if (pers->sync_request && mddev->to_remove == NULL) 6359 mddev->to_remove = &md_redundancy_group; 6360 module_put(pers->owner); 6361 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6362 6363 percpu_ref_exit(&mddev->active_io); 6364 bioset_exit(&mddev->bio_set); 6365 bioset_exit(&mddev->sync_set); 6366 bioset_exit(&mddev->io_clone_set); 6367 } 6368 md_stop(struct mddev * mddev)6369 void md_stop(struct mddev *mddev) 6370 { 6371 lockdep_assert_held(&mddev->reconfig_mutex); 6372 6373 /* stop the array and free an attached data structures. 6374 * This is called from dm-raid 6375 */ 6376 __md_stop_writes(mddev); 6377 __md_stop(mddev); 6378 percpu_ref_exit(&mddev->writes_pending); 6379 } 6380 6381 EXPORT_SYMBOL_GPL(md_stop); 6382 md_set_readonly(struct mddev * mddev,struct block_device * bdev)6383 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6384 { 6385 int err = 0; 6386 int did_freeze = 0; 6387 6388 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6389 return -EBUSY; 6390 6391 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6392 did_freeze = 1; 6393 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6394 md_wakeup_thread(mddev->thread); 6395 } 6396 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6397 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6398 6399 /* 6400 * Thread might be blocked waiting for metadata update which will now 6401 * never happen 6402 */ 6403 md_wakeup_thread_directly(mddev->sync_thread); 6404 6405 mddev_unlock(mddev); 6406 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 6407 &mddev->recovery)); 6408 wait_event(mddev->sb_wait, 6409 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6410 mddev_lock_nointr(mddev); 6411 6412 mutex_lock(&mddev->open_mutex); 6413 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6414 mddev->sync_thread || 6415 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6416 pr_warn("md: %s still in use.\n",mdname(mddev)); 6417 err = -EBUSY; 6418 goto out; 6419 } 6420 6421 if (mddev->pers) { 6422 __md_stop_writes(mddev); 6423 6424 if (mddev->ro == MD_RDONLY) { 6425 err = -ENXIO; 6426 goto out; 6427 } 6428 6429 mddev->ro = MD_RDONLY; 6430 set_disk_ro(mddev->gendisk, 1); 6431 } 6432 6433 out: 6434 if ((mddev->pers && !err) || did_freeze) { 6435 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6436 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6437 md_wakeup_thread(mddev->thread); 6438 sysfs_notify_dirent_safe(mddev->sysfs_state); 6439 
} 6440 6441 mutex_unlock(&mddev->open_mutex); 6442 return err; 6443 } 6444 6445 /* mode: 6446 * 0 - completely stop and dis-assemble array 6447 * 2 - stop but do not disassemble array 6448 */ do_md_stop(struct mddev * mddev,int mode,struct block_device * bdev)6449 static int do_md_stop(struct mddev *mddev, int mode, 6450 struct block_device *bdev) 6451 { 6452 struct gendisk *disk = mddev->gendisk; 6453 struct md_rdev *rdev; 6454 int did_freeze = 0; 6455 6456 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6457 did_freeze = 1; 6458 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6459 md_wakeup_thread(mddev->thread); 6460 } 6461 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6462 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6463 6464 /* 6465 * Thread might be blocked waiting for metadata update which will now 6466 * never happen 6467 */ 6468 md_wakeup_thread_directly(mddev->sync_thread); 6469 6470 mddev_unlock(mddev); 6471 wait_event(resync_wait, (mddev->sync_thread == NULL && 6472 !test_bit(MD_RECOVERY_RUNNING, 6473 &mddev->recovery))); 6474 mddev_lock_nointr(mddev); 6475 6476 mutex_lock(&mddev->open_mutex); 6477 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6478 mddev->sysfs_active || 6479 mddev->sync_thread || 6480 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6481 pr_warn("md: %s still in use.\n",mdname(mddev)); 6482 mutex_unlock(&mddev->open_mutex); 6483 if (did_freeze) { 6484 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6485 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6486 md_wakeup_thread(mddev->thread); 6487 } 6488 return -EBUSY; 6489 } 6490 if (mddev->pers) { 6491 if (!md_is_rdwr(mddev)) 6492 set_disk_ro(disk, 0); 6493 6494 __md_stop_writes(mddev); 6495 __md_stop(mddev); 6496 6497 /* tell userspace to handle 'inactive' */ 6498 sysfs_notify_dirent_safe(mddev->sysfs_state); 6499 6500 rdev_for_each(rdev, mddev) 6501 if (rdev->raid_disk >= 0) 6502 sysfs_unlink_rdev(mddev, rdev); 6503 6504 set_capacity_and_notify(disk, 0); 6505 mutex_unlock(&mddev->open_mutex); 6506 mddev->changed = 1; 6507 6508 if (!md_is_rdwr(mddev)) 6509 mddev->ro = MD_RDWR; 6510 } else 6511 mutex_unlock(&mddev->open_mutex); 6512 /* 6513 * Free resources if final stop 6514 */ 6515 if (mode == 0) { 6516 pr_info("md: %s stopped.\n", mdname(mddev)); 6517 6518 if (mddev->bitmap_info.file) { 6519 struct file *f = mddev->bitmap_info.file; 6520 spin_lock(&mddev->lock); 6521 mddev->bitmap_info.file = NULL; 6522 spin_unlock(&mddev->lock); 6523 fput(f); 6524 } 6525 mddev->bitmap_info.offset = 0; 6526 6527 export_array(mddev); 6528 6529 md_clean(mddev); 6530 if (mddev->hold_active == UNTIL_STOP) 6531 mddev->hold_active = 0; 6532 } 6533 md_new_event(); 6534 sysfs_notify_dirent_safe(mddev->sysfs_state); 6535 return 0; 6536 } 6537 6538 #ifndef MODULE autorun_array(struct mddev * mddev)6539 static void autorun_array(struct mddev *mddev) 6540 { 6541 struct md_rdev *rdev; 6542 int err; 6543 6544 if (list_empty(&mddev->disks)) 6545 return; 6546 6547 pr_info("md: running: "); 6548 6549 rdev_for_each(rdev, mddev) { 6550 pr_cont("<%pg>", rdev->bdev); 6551 } 6552 pr_cont("\n"); 6553 6554 err = do_md_run(mddev); 6555 if (err) { 6556 pr_warn("md: do_md_run() returned %d\n", err); 6557 do_md_stop(mddev, 0, NULL); 6558 } 6559 } 6560 6561 /* 6562 * lets try to run arrays based on all disks that have arrived 6563 * until now. 
(those are in pending_raid_disks) 6564 * 6565 * the method: pick the first pending disk, collect all disks with 6566 * the same UUID, remove all from the pending list and put them into 6567 * the 'same_array' list. Then order this list based on superblock 6568 * update time (freshest comes first), kick out 'old' disks and 6569 * compare superblocks. If everything's fine then run it. 6570 * 6571 * If "unit" is allocated, then bump its reference count 6572 */ autorun_devices(int part)6573 static void autorun_devices(int part) 6574 { 6575 struct md_rdev *rdev0, *rdev, *tmp; 6576 struct mddev *mddev; 6577 6578 pr_info("md: autorun ...\n"); 6579 while (!list_empty(&pending_raid_disks)) { 6580 int unit; 6581 dev_t dev; 6582 LIST_HEAD(candidates); 6583 rdev0 = list_entry(pending_raid_disks.next, 6584 struct md_rdev, same_set); 6585 6586 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6587 INIT_LIST_HEAD(&candidates); 6588 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6589 if (super_90_load(rdev, rdev0, 0) >= 0) { 6590 pr_debug("md: adding %pg ...\n", 6591 rdev->bdev); 6592 list_move(&rdev->same_set, &candidates); 6593 } 6594 /* 6595 * now we have a set of devices, with all of them having 6596 * mostly sane superblocks. It's time to allocate the 6597 * mddev. 6598 */ 6599 if (part) { 6600 dev = MKDEV(mdp_major, 6601 rdev0->preferred_minor << MdpMinorShift); 6602 unit = MINOR(dev) >> MdpMinorShift; 6603 } else { 6604 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6605 unit = MINOR(dev); 6606 } 6607 if (rdev0->preferred_minor != unit) { 6608 pr_warn("md: unit number in %pg is bad: %d\n", 6609 rdev0->bdev, rdev0->preferred_minor); 6610 break; 6611 } 6612 6613 mddev = md_alloc(dev, NULL); 6614 if (IS_ERR(mddev)) 6615 break; 6616 6617 if (mddev_lock(mddev)) 6618 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6619 else if (mddev->raid_disks || mddev->major_version 6620 || !list_empty(&mddev->disks)) { 6621 pr_warn("md: %s already running, cannot run %pg\n", 6622 mdname(mddev), rdev0->bdev); 6623 mddev_unlock(mddev); 6624 } else { 6625 pr_debug("md: created %s\n", mdname(mddev)); 6626 mddev->persistent = 1; 6627 rdev_for_each_list(rdev, tmp, &candidates) { 6628 list_del_init(&rdev->same_set); 6629 if (bind_rdev_to_array(rdev, mddev)) 6630 export_rdev(rdev, mddev); 6631 } 6632 autorun_array(mddev); 6633 mddev_unlock(mddev); 6634 } 6635 /* on success, candidates will be empty, on error 6636 * it won't... 6637 */ 6638 rdev_for_each_list(rdev, tmp, &candidates) { 6639 list_del_init(&rdev->same_set); 6640 export_rdev(rdev, mddev); 6641 } 6642 mddev_put(mddev); 6643 } 6644 pr_info("md: ... 
autorun DONE.\n"); 6645 } 6646 #endif /* !MODULE */ 6647 get_version(void __user * arg)6648 static int get_version(void __user *arg) 6649 { 6650 mdu_version_t ver; 6651 6652 ver.major = MD_MAJOR_VERSION; 6653 ver.minor = MD_MINOR_VERSION; 6654 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6655 6656 if (copy_to_user(arg, &ver, sizeof(ver))) 6657 return -EFAULT; 6658 6659 return 0; 6660 } 6661 get_array_info(struct mddev * mddev,void __user * arg)6662 static int get_array_info(struct mddev *mddev, void __user *arg) 6663 { 6664 mdu_array_info_t info; 6665 int nr,working,insync,failed,spare; 6666 struct md_rdev *rdev; 6667 6668 nr = working = insync = failed = spare = 0; 6669 rcu_read_lock(); 6670 rdev_for_each_rcu(rdev, mddev) { 6671 nr++; 6672 if (test_bit(Faulty, &rdev->flags)) 6673 failed++; 6674 else { 6675 working++; 6676 if (test_bit(In_sync, &rdev->flags)) 6677 insync++; 6678 else if (test_bit(Journal, &rdev->flags)) 6679 /* TODO: add journal count to md_u.h */ 6680 ; 6681 else 6682 spare++; 6683 } 6684 } 6685 rcu_read_unlock(); 6686 6687 info.major_version = mddev->major_version; 6688 info.minor_version = mddev->minor_version; 6689 info.patch_version = MD_PATCHLEVEL_VERSION; 6690 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6691 info.level = mddev->level; 6692 info.size = mddev->dev_sectors / 2; 6693 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6694 info.size = -1; 6695 info.nr_disks = nr; 6696 info.raid_disks = mddev->raid_disks; 6697 info.md_minor = mddev->md_minor; 6698 info.not_persistent= !mddev->persistent; 6699 6700 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6701 info.state = 0; 6702 if (mddev->in_sync) 6703 info.state = (1<<MD_SB_CLEAN); 6704 if (mddev->bitmap && mddev->bitmap_info.offset) 6705 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6706 if (mddev_is_clustered(mddev)) 6707 info.state |= (1<<MD_SB_CLUSTERED); 6708 info.active_disks = insync; 6709 info.working_disks = working; 6710 info.failed_disks = failed; 6711 info.spare_disks = spare; 6712 6713 info.layout = mddev->layout; 6714 info.chunk_size = mddev->chunk_sectors << 9; 6715 6716 if (copy_to_user(arg, &info, sizeof(info))) 6717 return -EFAULT; 6718 6719 return 0; 6720 } 6721 get_bitmap_file(struct mddev * mddev,void __user * arg)6722 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6723 { 6724 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6725 char *ptr; 6726 int err; 6727 6728 file = kzalloc(sizeof(*file), GFP_NOIO); 6729 if (!file) 6730 return -ENOMEM; 6731 6732 err = 0; 6733 spin_lock(&mddev->lock); 6734 /* bitmap enabled */ 6735 if (mddev->bitmap_info.file) { 6736 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6737 sizeof(file->pathname)); 6738 if (IS_ERR(ptr)) 6739 err = PTR_ERR(ptr); 6740 else 6741 memmove(file->pathname, ptr, 6742 sizeof(file->pathname)-(ptr-file->pathname)); 6743 } 6744 spin_unlock(&mddev->lock); 6745 6746 if (err == 0 && 6747 copy_to_user(arg, file, sizeof(*file))) 6748 err = -EFAULT; 6749 6750 kfree(file); 6751 return err; 6752 } 6753 get_disk_info(struct mddev * mddev,void __user * arg)6754 static int get_disk_info(struct mddev *mddev, void __user * arg) 6755 { 6756 mdu_disk_info_t info; 6757 struct md_rdev *rdev; 6758 6759 if (copy_from_user(&info, arg, sizeof(info))) 6760 return -EFAULT; 6761 6762 rcu_read_lock(); 6763 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6764 if (rdev) { 6765 info.major = MAJOR(rdev->bdev->bd_dev); 6766 info.minor = MINOR(rdev->bdev->bd_dev); 6767 info.raid_disk = rdev->raid_disk; 
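		/*
		 * Translate the internal rdev flags into the MD_DISK_* state
		 * bits exposed through the mdu_disk_info_t ioctl ABI.
		 */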
6768 info.state = 0; 6769 if (test_bit(Faulty, &rdev->flags)) 6770 info.state |= (1<<MD_DISK_FAULTY); 6771 else if (test_bit(In_sync, &rdev->flags)) { 6772 info.state |= (1<<MD_DISK_ACTIVE); 6773 info.state |= (1<<MD_DISK_SYNC); 6774 } 6775 if (test_bit(Journal, &rdev->flags)) 6776 info.state |= (1<<MD_DISK_JOURNAL); 6777 if (test_bit(WriteMostly, &rdev->flags)) 6778 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6779 if (test_bit(FailFast, &rdev->flags)) 6780 info.state |= (1<<MD_DISK_FAILFAST); 6781 } else { 6782 info.major = info.minor = 0; 6783 info.raid_disk = -1; 6784 info.state = (1<<MD_DISK_REMOVED); 6785 } 6786 rcu_read_unlock(); 6787 6788 if (copy_to_user(arg, &info, sizeof(info))) 6789 return -EFAULT; 6790 6791 return 0; 6792 } 6793 md_add_new_disk(struct mddev * mddev,struct mdu_disk_info_s * info)6794 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6795 { 6796 struct md_rdev *rdev; 6797 dev_t dev = MKDEV(info->major,info->minor); 6798 6799 if (mddev_is_clustered(mddev) && 6800 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6801 pr_warn("%s: Cannot add to clustered mddev.\n", 6802 mdname(mddev)); 6803 return -EINVAL; 6804 } 6805 6806 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6807 return -EOVERFLOW; 6808 6809 if (!mddev->raid_disks) { 6810 int err; 6811 /* expecting a device which has a superblock */ 6812 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6813 if (IS_ERR(rdev)) { 6814 pr_warn("md: md_import_device returned %ld\n", 6815 PTR_ERR(rdev)); 6816 return PTR_ERR(rdev); 6817 } 6818 if (!list_empty(&mddev->disks)) { 6819 struct md_rdev *rdev0 6820 = list_entry(mddev->disks.next, 6821 struct md_rdev, same_set); 6822 err = super_types[mddev->major_version] 6823 .load_super(rdev, rdev0, mddev->minor_version); 6824 if (err < 0) { 6825 pr_warn("md: %pg has different UUID to %pg\n", 6826 rdev->bdev, 6827 rdev0->bdev); 6828 export_rdev(rdev, mddev); 6829 return -EINVAL; 6830 } 6831 } 6832 err = bind_rdev_to_array(rdev, mddev); 6833 if (err) 6834 export_rdev(rdev, mddev); 6835 return err; 6836 } 6837 6838 /* 6839 * md_add_new_disk can be used once the array is assembled 6840 * to add "hot spares". They must already have a superblock 6841 * written 6842 */ 6843 if (mddev->pers) { 6844 int err; 6845 if (!mddev->pers->hot_add_disk) { 6846 pr_warn("%s: personality does not support diskops!\n", 6847 mdname(mddev)); 6848 return -EINVAL; 6849 } 6850 if (mddev->persistent) 6851 rdev = md_import_device(dev, mddev->major_version, 6852 mddev->minor_version); 6853 else 6854 rdev = md_import_device(dev, -1, -1); 6855 if (IS_ERR(rdev)) { 6856 pr_warn("md: md_import_device returned %ld\n", 6857 PTR_ERR(rdev)); 6858 return PTR_ERR(rdev); 6859 } 6860 /* set saved_raid_disk if appropriate */ 6861 if (!mddev->persistent) { 6862 if (info->state & (1<<MD_DISK_SYNC) && 6863 info->raid_disk < mddev->raid_disks) { 6864 rdev->raid_disk = info->raid_disk; 6865 clear_bit(Bitmap_sync, &rdev->flags); 6866 } else 6867 rdev->raid_disk = -1; 6868 rdev->saved_raid_disk = rdev->raid_disk; 6869 } else 6870 super_types[mddev->major_version]. 6871 validate_super(mddev, NULL/*freshest*/, rdev); 6872 if ((info->state & (1<<MD_DISK_SYNC)) && 6873 rdev->raid_disk != info->raid_disk) { 6874 /* This was a hot-add request, but events doesn't 6875 * match, so reject it. 
6876 */ 6877 export_rdev(rdev, mddev); 6878 return -EINVAL; 6879 } 6880 6881 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6882 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6883 set_bit(WriteMostly, &rdev->flags); 6884 else 6885 clear_bit(WriteMostly, &rdev->flags); 6886 if (info->state & (1<<MD_DISK_FAILFAST)) 6887 set_bit(FailFast, &rdev->flags); 6888 else 6889 clear_bit(FailFast, &rdev->flags); 6890 6891 if (info->state & (1<<MD_DISK_JOURNAL)) { 6892 struct md_rdev *rdev2; 6893 bool has_journal = false; 6894 6895 /* make sure no existing journal disk */ 6896 rdev_for_each(rdev2, mddev) { 6897 if (test_bit(Journal, &rdev2->flags)) { 6898 has_journal = true; 6899 break; 6900 } 6901 } 6902 if (has_journal || mddev->bitmap) { 6903 export_rdev(rdev, mddev); 6904 return -EBUSY; 6905 } 6906 set_bit(Journal, &rdev->flags); 6907 } 6908 /* 6909 * check whether the device shows up in other nodes 6910 */ 6911 if (mddev_is_clustered(mddev)) { 6912 if (info->state & (1 << MD_DISK_CANDIDATE)) 6913 set_bit(Candidate, &rdev->flags); 6914 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6915 /* --add initiated by this node */ 6916 err = md_cluster_ops->add_new_disk(mddev, rdev); 6917 if (err) { 6918 export_rdev(rdev, mddev); 6919 return err; 6920 } 6921 } 6922 } 6923 6924 rdev->raid_disk = -1; 6925 err = bind_rdev_to_array(rdev, mddev); 6926 6927 if (err) 6928 export_rdev(rdev, mddev); 6929 6930 if (mddev_is_clustered(mddev)) { 6931 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6932 if (!err) { 6933 err = md_cluster_ops->new_disk_ack(mddev, 6934 err == 0); 6935 if (err) 6936 md_kick_rdev_from_array(rdev); 6937 } 6938 } else { 6939 if (err) 6940 md_cluster_ops->add_new_disk_cancel(mddev); 6941 else 6942 err = add_bound_rdev(rdev); 6943 } 6944 6945 } else if (!err) 6946 err = add_bound_rdev(rdev); 6947 6948 return err; 6949 } 6950 6951 /* otherwise, md_add_new_disk is only allowed 6952 * for major_version==0 superblocks 6953 */ 6954 if (mddev->major_version != 0) { 6955 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6956 return -EINVAL; 6957 } 6958 6959 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6960 int err; 6961 rdev = md_import_device(dev, -1, 0); 6962 if (IS_ERR(rdev)) { 6963 pr_warn("md: error, md_import_device() returned %ld\n", 6964 PTR_ERR(rdev)); 6965 return PTR_ERR(rdev); 6966 } 6967 rdev->desc_nr = info->number; 6968 if (info->raid_disk < mddev->raid_disks) 6969 rdev->raid_disk = info->raid_disk; 6970 else 6971 rdev->raid_disk = -1; 6972 6973 if (rdev->raid_disk < mddev->raid_disks) 6974 if (info->state & (1<<MD_DISK_SYNC)) 6975 set_bit(In_sync, &rdev->flags); 6976 6977 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6978 set_bit(WriteMostly, &rdev->flags); 6979 if (info->state & (1<<MD_DISK_FAILFAST)) 6980 set_bit(FailFast, &rdev->flags); 6981 6982 if (!mddev->persistent) { 6983 pr_debug("md: nonpersistent superblock ...\n"); 6984 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6985 } else 6986 rdev->sb_start = calc_dev_sboffset(rdev); 6987 rdev->sectors = rdev->sb_start; 6988 6989 err = bind_rdev_to_array(rdev, mddev); 6990 if (err) { 6991 export_rdev(rdev, mddev); 6992 return err; 6993 } 6994 } 6995 6996 return 0; 6997 } 6998 hot_remove_disk(struct mddev * mddev,dev_t dev)6999 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7000 { 7001 struct md_rdev *rdev; 7002 7003 if (!mddev->pers) 7004 return -ENODEV; 7005 7006 rdev = find_rdev(mddev, dev); 7007 if (!rdev) 7008 return -ENXIO; 7009 7010 if (rdev->raid_disk < 0) 7011 goto kick_rdev; 7012 7013 
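	/*
	 * The device still holds a raid slot: clear Blocked and let
	 * remove_and_add_spares() try to detach it; if it is still active
	 * afterwards, fail the removal with -EBUSY below.
	 */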
clear_bit(Blocked, &rdev->flags); 7014 remove_and_add_spares(mddev, rdev); 7015 7016 if (rdev->raid_disk >= 0) 7017 goto busy; 7018 7019 kick_rdev: 7020 if (mddev_is_clustered(mddev)) { 7021 if (md_cluster_ops->remove_disk(mddev, rdev)) 7022 goto busy; 7023 } 7024 7025 md_kick_rdev_from_array(rdev); 7026 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7027 if (mddev->thread) 7028 md_wakeup_thread(mddev->thread); 7029 else 7030 md_update_sb(mddev, 1); 7031 md_new_event(); 7032 7033 return 0; 7034 busy: 7035 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7036 rdev->bdev, mdname(mddev)); 7037 return -EBUSY; 7038 } 7039 hot_add_disk(struct mddev * mddev,dev_t dev)7040 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7041 { 7042 int err; 7043 struct md_rdev *rdev; 7044 7045 if (!mddev->pers) 7046 return -ENODEV; 7047 7048 if (mddev->major_version != 0) { 7049 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7050 mdname(mddev)); 7051 return -EINVAL; 7052 } 7053 if (!mddev->pers->hot_add_disk) { 7054 pr_warn("%s: personality does not support diskops!\n", 7055 mdname(mddev)); 7056 return -EINVAL; 7057 } 7058 7059 rdev = md_import_device(dev, -1, 0); 7060 if (IS_ERR(rdev)) { 7061 pr_warn("md: error, md_import_device() returned %ld\n", 7062 PTR_ERR(rdev)); 7063 return -EINVAL; 7064 } 7065 7066 if (mddev->persistent) 7067 rdev->sb_start = calc_dev_sboffset(rdev); 7068 else 7069 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7070 7071 rdev->sectors = rdev->sb_start; 7072 7073 if (test_bit(Faulty, &rdev->flags)) { 7074 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7075 rdev->bdev, mdname(mddev)); 7076 err = -EINVAL; 7077 goto abort_export; 7078 } 7079 7080 clear_bit(In_sync, &rdev->flags); 7081 rdev->desc_nr = -1; 7082 rdev->saved_raid_disk = -1; 7083 err = bind_rdev_to_array(rdev, mddev); 7084 if (err) 7085 goto abort_export; 7086 7087 /* 7088 * The rest should better be atomic, we can have disk failures 7089 * noticed in interrupt contexts ... 7090 */ 7091 7092 rdev->raid_disk = -1; 7093 7094 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7095 if (!mddev->thread) 7096 md_update_sb(mddev, 1); 7097 /* 7098 * If the new disk does not support REQ_NOWAIT, 7099 * disable on the whole MD. 7100 */ 7101 if (!bdev_nowait(rdev->bdev)) { 7102 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7103 mdname(mddev), rdev->bdev); 7104 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7105 } 7106 /* 7107 * Kick recovery, maybe this spare has to be added to the 7108 * array immediately. 7109 */ 7110 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7111 md_wakeup_thread(mddev->thread); 7112 md_new_event(); 7113 return 0; 7114 7115 abort_export: 7116 export_rdev(rdev, mddev); 7117 return err; 7118 } 7119 set_bitmap_file(struct mddev * mddev,int fd)7120 static int set_bitmap_file(struct mddev *mddev, int fd) 7121 { 7122 int err = 0; 7123 7124 if (mddev->pers) { 7125 if (!mddev->pers->quiesce || !mddev->thread) 7126 return -EBUSY; 7127 if (mddev->recovery || mddev->sync_thread) 7128 return -EBUSY; 7129 /* we should be able to change the bitmap.. 
*/ 7130 } 7131 7132 if (fd >= 0) { 7133 struct inode *inode; 7134 struct file *f; 7135 7136 if (mddev->bitmap || mddev->bitmap_info.file) 7137 return -EEXIST; /* cannot add when bitmap is present */ 7138 7139 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7140 pr_warn("%s: bitmap files not supported by this kernel\n", 7141 mdname(mddev)); 7142 return -EINVAL; 7143 } 7144 pr_warn("%s: using deprecated bitmap file support\n", 7145 mdname(mddev)); 7146 7147 f = fget(fd); 7148 7149 if (f == NULL) { 7150 pr_warn("%s: error: failed to get bitmap file\n", 7151 mdname(mddev)); 7152 return -EBADF; 7153 } 7154 7155 inode = f->f_mapping->host; 7156 if (!S_ISREG(inode->i_mode)) { 7157 pr_warn("%s: error: bitmap file must be a regular file\n", 7158 mdname(mddev)); 7159 err = -EBADF; 7160 } else if (!(f->f_mode & FMODE_WRITE)) { 7161 pr_warn("%s: error: bitmap file must open for write\n", 7162 mdname(mddev)); 7163 err = -EBADF; 7164 } else if (atomic_read(&inode->i_writecount) != 1) { 7165 pr_warn("%s: error: bitmap file is already in use\n", 7166 mdname(mddev)); 7167 err = -EBUSY; 7168 } 7169 if (err) { 7170 fput(f); 7171 return err; 7172 } 7173 mddev->bitmap_info.file = f; 7174 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7175 } else if (mddev->bitmap == NULL) 7176 return -ENOENT; /* cannot remove what isn't there */ 7177 err = 0; 7178 if (mddev->pers) { 7179 if (fd >= 0) { 7180 struct bitmap *bitmap; 7181 7182 bitmap = md_bitmap_create(mddev, -1); 7183 mddev_suspend(mddev); 7184 if (!IS_ERR(bitmap)) { 7185 mddev->bitmap = bitmap; 7186 err = md_bitmap_load(mddev); 7187 } else 7188 err = PTR_ERR(bitmap); 7189 if (err) { 7190 md_bitmap_destroy(mddev); 7191 fd = -1; 7192 } 7193 mddev_resume(mddev); 7194 } else if (fd < 0) { 7195 mddev_suspend(mddev); 7196 md_bitmap_destroy(mddev); 7197 mddev_resume(mddev); 7198 } 7199 } 7200 if (fd < 0) { 7201 struct file *f = mddev->bitmap_info.file; 7202 if (f) { 7203 spin_lock(&mddev->lock); 7204 mddev->bitmap_info.file = NULL; 7205 spin_unlock(&mddev->lock); 7206 fput(f); 7207 } 7208 } 7209 7210 return err; 7211 } 7212 7213 /* 7214 * md_set_array_info is used two different ways 7215 * The original usage is when creating a new array. 7216 * In this usage, raid_disks is > 0 and it together with 7217 * level, size, not_persistent,layout,chunksize determine the 7218 * shape of the array. 7219 * This will always create an array with a type-0.90.0 superblock. 7220 * The newer usage is when assembling an array. 7221 * In this case raid_disks will be 0, and the major_version field is 7222 * use to determine which style super-blocks are to be found on the devices. 7223 * The minor and patch _version numbers are also kept incase the 7224 * super_block handler wishes to interpret them. 7225 */ md_set_array_info(struct mddev * mddev,struct mdu_array_info_s * info)7226 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7227 { 7228 if (info->raid_disks == 0) { 7229 /* just setting version number for superblock loading */ 7230 if (info->major_version < 0 || 7231 info->major_version >= ARRAY_SIZE(super_types) || 7232 super_types[info->major_version].name == NULL) { 7233 /* maybe try to auto-load a module? 
*/ 7234 pr_warn("md: superblock version %d not known\n", 7235 info->major_version); 7236 return -EINVAL; 7237 } 7238 mddev->major_version = info->major_version; 7239 mddev->minor_version = info->minor_version; 7240 mddev->patch_version = info->patch_version; 7241 mddev->persistent = !info->not_persistent; 7242 /* ensure mddev_put doesn't delete this now that there 7243 * is some minimal configuration. 7244 */ 7245 mddev->ctime = ktime_get_real_seconds(); 7246 return 0; 7247 } 7248 mddev->major_version = MD_MAJOR_VERSION; 7249 mddev->minor_version = MD_MINOR_VERSION; 7250 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7251 mddev->ctime = ktime_get_real_seconds(); 7252 7253 mddev->level = info->level; 7254 mddev->clevel[0] = 0; 7255 mddev->dev_sectors = 2 * (sector_t)info->size; 7256 mddev->raid_disks = info->raid_disks; 7257 /* don't set md_minor, it is determined by which /dev/md* was 7258 * openned 7259 */ 7260 if (info->state & (1<<MD_SB_CLEAN)) 7261 mddev->recovery_cp = MaxSector; 7262 else 7263 mddev->recovery_cp = 0; 7264 mddev->persistent = ! info->not_persistent; 7265 mddev->external = 0; 7266 7267 mddev->layout = info->layout; 7268 if (mddev->level == 0) 7269 /* Cannot trust RAID0 layout info here */ 7270 mddev->layout = -1; 7271 mddev->chunk_sectors = info->chunk_size >> 9; 7272 7273 if (mddev->persistent) { 7274 mddev->max_disks = MD_SB_DISKS; 7275 mddev->flags = 0; 7276 mddev->sb_flags = 0; 7277 } 7278 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7279 7280 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7281 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7282 mddev->bitmap_info.offset = 0; 7283 7284 mddev->reshape_position = MaxSector; 7285 7286 /* 7287 * Generate a 128 bit UUID 7288 */ 7289 get_random_bytes(mddev->uuid, 16); 7290 7291 mddev->new_level = mddev->level; 7292 mddev->new_chunk_sectors = mddev->chunk_sectors; 7293 mddev->new_layout = mddev->layout; 7294 mddev->delta_disks = 0; 7295 mddev->reshape_backwards = 0; 7296 7297 return 0; 7298 } 7299 md_set_array_sectors(struct mddev * mddev,sector_t array_sectors)7300 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7301 { 7302 lockdep_assert_held(&mddev->reconfig_mutex); 7303 7304 if (mddev->external_size) 7305 return; 7306 7307 mddev->array_sectors = array_sectors; 7308 } 7309 EXPORT_SYMBOL(md_set_array_sectors); 7310 update_size(struct mddev * mddev,sector_t num_sectors)7311 static int update_size(struct mddev *mddev, sector_t num_sectors) 7312 { 7313 struct md_rdev *rdev; 7314 int rv; 7315 int fit = (num_sectors == 0); 7316 sector_t old_dev_sectors = mddev->dev_sectors; 7317 7318 if (mddev->pers->resize == NULL) 7319 return -EINVAL; 7320 /* The "num_sectors" is the number of sectors of each device that 7321 * is used. This can only make sense for arrays with redundancy. 7322 * linear and raid0 always use whatever space is available. We can only 7323 * consider changing this number if no resync or reconstruction is 7324 * happening, and if the new size is acceptable. It must fit before the 7325 * sb_start or, if that is <data_offset, it must fit before the size 7326 * of each device. If num_sectors is zero, we find the largest size 7327 * that fits. 
7328 */ 7329 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7330 mddev->sync_thread) 7331 return -EBUSY; 7332 if (!md_is_rdwr(mddev)) 7333 return -EROFS; 7334 7335 rdev_for_each(rdev, mddev) { 7336 sector_t avail = rdev->sectors; 7337 7338 if (fit && (num_sectors == 0 || num_sectors > avail)) 7339 num_sectors = avail; 7340 if (avail < num_sectors) 7341 return -ENOSPC; 7342 } 7343 rv = mddev->pers->resize(mddev, num_sectors); 7344 if (!rv) { 7345 if (mddev_is_clustered(mddev)) 7346 md_cluster_ops->update_size(mddev, old_dev_sectors); 7347 else if (mddev->queue) { 7348 set_capacity_and_notify(mddev->gendisk, 7349 mddev->array_sectors); 7350 } 7351 } 7352 return rv; 7353 } 7354 update_raid_disks(struct mddev * mddev,int raid_disks)7355 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7356 { 7357 int rv; 7358 struct md_rdev *rdev; 7359 /* change the number of raid disks */ 7360 if (mddev->pers->check_reshape == NULL) 7361 return -EINVAL; 7362 if (!md_is_rdwr(mddev)) 7363 return -EROFS; 7364 if (raid_disks <= 0 || 7365 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7366 return -EINVAL; 7367 if (mddev->sync_thread || 7368 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7369 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7370 mddev->reshape_position != MaxSector) 7371 return -EBUSY; 7372 7373 rdev_for_each(rdev, mddev) { 7374 if (mddev->raid_disks < raid_disks && 7375 rdev->data_offset < rdev->new_data_offset) 7376 return -EINVAL; 7377 if (mddev->raid_disks > raid_disks && 7378 rdev->data_offset > rdev->new_data_offset) 7379 return -EINVAL; 7380 } 7381 7382 mddev->delta_disks = raid_disks - mddev->raid_disks; 7383 if (mddev->delta_disks < 0) 7384 mddev->reshape_backwards = 1; 7385 else if (mddev->delta_disks > 0) 7386 mddev->reshape_backwards = 0; 7387 7388 rv = mddev->pers->check_reshape(mddev); 7389 if (rv < 0) { 7390 mddev->delta_disks = 0; 7391 mddev->reshape_backwards = 0; 7392 } 7393 return rv; 7394 } 7395 7396 /* 7397 * update_array_info is used to change the configuration of an 7398 * on-line array. 7399 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7400 * fields in the info are checked against the array. 7401 * Any differences that cannot be handled will cause an error. 7402 * Normally, only one change can be managed at a time. 
7403 */ update_array_info(struct mddev * mddev,mdu_array_info_t * info)7404 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7405 { 7406 int rv = 0; 7407 int cnt = 0; 7408 int state = 0; 7409 7410 /* calculate expected state,ignoring low bits */ 7411 if (mddev->bitmap && mddev->bitmap_info.offset) 7412 state |= (1 << MD_SB_BITMAP_PRESENT); 7413 7414 if (mddev->major_version != info->major_version || 7415 mddev->minor_version != info->minor_version || 7416 /* mddev->patch_version != info->patch_version || */ 7417 mddev->ctime != info->ctime || 7418 mddev->level != info->level || 7419 /* mddev->layout != info->layout || */ 7420 mddev->persistent != !info->not_persistent || 7421 mddev->chunk_sectors != info->chunk_size >> 9 || 7422 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7423 ((state^info->state) & 0xfffffe00) 7424 ) 7425 return -EINVAL; 7426 /* Check there is only one change */ 7427 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7428 cnt++; 7429 if (mddev->raid_disks != info->raid_disks) 7430 cnt++; 7431 if (mddev->layout != info->layout) 7432 cnt++; 7433 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7434 cnt++; 7435 if (cnt == 0) 7436 return 0; 7437 if (cnt > 1) 7438 return -EINVAL; 7439 7440 if (mddev->layout != info->layout) { 7441 /* Change layout 7442 * we don't need to do anything at the md level, the 7443 * personality will take care of it all. 7444 */ 7445 if (mddev->pers->check_reshape == NULL) 7446 return -EINVAL; 7447 else { 7448 mddev->new_layout = info->layout; 7449 rv = mddev->pers->check_reshape(mddev); 7450 if (rv) 7451 mddev->new_layout = mddev->layout; 7452 return rv; 7453 } 7454 } 7455 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7456 rv = update_size(mddev, (sector_t)info->size * 2); 7457 7458 if (mddev->raid_disks != info->raid_disks) 7459 rv = update_raid_disks(mddev, info->raid_disks); 7460 7461 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7462 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7463 rv = -EINVAL; 7464 goto err; 7465 } 7466 if (mddev->recovery || mddev->sync_thread) { 7467 rv = -EBUSY; 7468 goto err; 7469 } 7470 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7471 struct bitmap *bitmap; 7472 /* add the bitmap */ 7473 if (mddev->bitmap) { 7474 rv = -EEXIST; 7475 goto err; 7476 } 7477 if (mddev->bitmap_info.default_offset == 0) { 7478 rv = -EINVAL; 7479 goto err; 7480 } 7481 mddev->bitmap_info.offset = 7482 mddev->bitmap_info.default_offset; 7483 mddev->bitmap_info.space = 7484 mddev->bitmap_info.default_space; 7485 bitmap = md_bitmap_create(mddev, -1); 7486 mddev_suspend(mddev); 7487 if (!IS_ERR(bitmap)) { 7488 mddev->bitmap = bitmap; 7489 rv = md_bitmap_load(mddev); 7490 } else 7491 rv = PTR_ERR(bitmap); 7492 if (rv) 7493 md_bitmap_destroy(mddev); 7494 mddev_resume(mddev); 7495 } else { 7496 /* remove the bitmap */ 7497 if (!mddev->bitmap) { 7498 rv = -ENOENT; 7499 goto err; 7500 } 7501 if (mddev->bitmap->storage.file) { 7502 rv = -EINVAL; 7503 goto err; 7504 } 7505 if (mddev->bitmap_info.nodes) { 7506 /* hold PW on all the bitmap lock */ 7507 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7508 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7509 rv = -EPERM; 7510 md_cluster_ops->unlock_all_bitmaps(mddev); 7511 goto err; 7512 } 7513 7514 mddev->bitmap_info.nodes = 0; 7515 md_cluster_ops->leave(mddev); 7516 module_put(md_cluster_mod); 7517 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 
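				/*
				 * After leaving the cluster, the in-memory bitmap is
				 * destroyed below under mddev_suspend() and the offset
				 * cleared so the superblock no longer advertises one.
				 */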
7518 } 7519 mddev_suspend(mddev); 7520 md_bitmap_destroy(mddev); 7521 mddev_resume(mddev); 7522 mddev->bitmap_info.offset = 0; 7523 } 7524 } 7525 md_update_sb(mddev, 1); 7526 return rv; 7527 err: 7528 return rv; 7529 } 7530 set_disk_faulty(struct mddev * mddev,dev_t dev)7531 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7532 { 7533 struct md_rdev *rdev; 7534 int err = 0; 7535 7536 if (mddev->pers == NULL) 7537 return -ENODEV; 7538 7539 rcu_read_lock(); 7540 rdev = md_find_rdev_rcu(mddev, dev); 7541 if (!rdev) 7542 err = -ENODEV; 7543 else { 7544 md_error(mddev, rdev); 7545 if (test_bit(MD_BROKEN, &mddev->flags)) 7546 err = -EBUSY; 7547 } 7548 rcu_read_unlock(); 7549 return err; 7550 } 7551 7552 /* 7553 * We have a problem here : there is no easy way to give a CHS 7554 * virtual geometry. We currently pretend that we have a 2 heads 7555 * 4 sectors (with a BIG number of cylinders...). This drives 7556 * dosfs just mad... ;-) 7557 */ md_getgeo(struct block_device * bdev,struct hd_geometry * geo)7558 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7559 { 7560 struct mddev *mddev = bdev->bd_disk->private_data; 7561 7562 geo->heads = 2; 7563 geo->sectors = 4; 7564 geo->cylinders = mddev->array_sectors / 8; 7565 return 0; 7566 } 7567 md_ioctl_valid(unsigned int cmd)7568 static inline bool md_ioctl_valid(unsigned int cmd) 7569 { 7570 switch (cmd) { 7571 case ADD_NEW_DISK: 7572 case GET_ARRAY_INFO: 7573 case GET_BITMAP_FILE: 7574 case GET_DISK_INFO: 7575 case HOT_ADD_DISK: 7576 case HOT_REMOVE_DISK: 7577 case RAID_VERSION: 7578 case RESTART_ARRAY_RW: 7579 case RUN_ARRAY: 7580 case SET_ARRAY_INFO: 7581 case SET_BITMAP_FILE: 7582 case SET_DISK_FAULTY: 7583 case STOP_ARRAY: 7584 case STOP_ARRAY_RO: 7585 case CLUSTERED_DISK_NACK: 7586 return true; 7587 default: 7588 return false; 7589 } 7590 } 7591 __md_set_array_info(struct mddev * mddev,void __user * argp)7592 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7593 { 7594 mdu_array_info_t info; 7595 int err; 7596 7597 if (!argp) 7598 memset(&info, 0, sizeof(info)); 7599 else if (copy_from_user(&info, argp, sizeof(info))) 7600 return -EFAULT; 7601 7602 if (mddev->pers) { 7603 err = update_array_info(mddev, &info); 7604 if (err) 7605 pr_warn("md: couldn't update array info. %d\n", err); 7606 return err; 7607 } 7608 7609 if (!list_empty(&mddev->disks)) { 7610 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7611 return -EBUSY; 7612 } 7613 7614 if (mddev->raid_disks) { 7615 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7616 return -EBUSY; 7617 } 7618 7619 err = md_set_array_info(mddev, &info); 7620 if (err) 7621 pr_warn("md: couldn't set array info. 
%d\n", err); 7622 7623 return err; 7624 } 7625 md_ioctl(struct block_device * bdev,blk_mode_t mode,unsigned int cmd,unsigned long arg)7626 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7627 unsigned int cmd, unsigned long arg) 7628 { 7629 int err = 0; 7630 void __user *argp = (void __user *)arg; 7631 struct mddev *mddev = NULL; 7632 7633 if (!md_ioctl_valid(cmd)) 7634 return -ENOTTY; 7635 7636 switch (cmd) { 7637 case RAID_VERSION: 7638 case GET_ARRAY_INFO: 7639 case GET_DISK_INFO: 7640 break; 7641 default: 7642 if (!capable(CAP_SYS_ADMIN)) 7643 return -EACCES; 7644 } 7645 7646 /* 7647 * Commands dealing with the RAID driver but not any 7648 * particular array: 7649 */ 7650 switch (cmd) { 7651 case RAID_VERSION: 7652 err = get_version(argp); 7653 goto out; 7654 default:; 7655 } 7656 7657 /* 7658 * Commands creating/starting a new array: 7659 */ 7660 7661 mddev = bdev->bd_disk->private_data; 7662 7663 /* Some actions do not requires the mutex */ 7664 switch (cmd) { 7665 case GET_ARRAY_INFO: 7666 if (!mddev->raid_disks && !mddev->external) 7667 err = -ENODEV; 7668 else 7669 err = get_array_info(mddev, argp); 7670 goto out; 7671 7672 case GET_DISK_INFO: 7673 if (!mddev->raid_disks && !mddev->external) 7674 err = -ENODEV; 7675 else 7676 err = get_disk_info(mddev, argp); 7677 goto out; 7678 7679 case SET_DISK_FAULTY: 7680 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7681 goto out; 7682 7683 case GET_BITMAP_FILE: 7684 err = get_bitmap_file(mddev, argp); 7685 goto out; 7686 7687 } 7688 7689 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7690 /* Need to flush page cache, and ensure no-one else opens 7691 * and writes 7692 */ 7693 mutex_lock(&mddev->open_mutex); 7694 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7695 mutex_unlock(&mddev->open_mutex); 7696 err = -EBUSY; 7697 goto out; 7698 } 7699 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7700 mutex_unlock(&mddev->open_mutex); 7701 err = -EBUSY; 7702 goto out; 7703 } 7704 mutex_unlock(&mddev->open_mutex); 7705 sync_blockdev(bdev); 7706 } 7707 err = mddev_lock(mddev); 7708 if (err) { 7709 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7710 err, cmd); 7711 goto out; 7712 } 7713 7714 if (cmd == SET_ARRAY_INFO) { 7715 err = __md_set_array_info(mddev, argp); 7716 goto unlock; 7717 } 7718 7719 /* 7720 * Commands querying/configuring an existing array: 7721 */ 7722 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7723 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7724 if ((!mddev->raid_disks && !mddev->external) 7725 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7726 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7727 && cmd != GET_BITMAP_FILE) { 7728 err = -ENODEV; 7729 goto unlock; 7730 } 7731 7732 /* 7733 * Commands even a read-only array can execute: 7734 */ 7735 switch (cmd) { 7736 case RESTART_ARRAY_RW: 7737 err = restart_array(mddev); 7738 goto unlock; 7739 7740 case STOP_ARRAY: 7741 err = do_md_stop(mddev, 0, bdev); 7742 goto unlock; 7743 7744 case STOP_ARRAY_RO: 7745 err = md_set_readonly(mddev, bdev); 7746 goto unlock; 7747 7748 case HOT_REMOVE_DISK: 7749 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7750 goto unlock; 7751 7752 case ADD_NEW_DISK: 7753 /* We can support ADD_NEW_DISK on read-only arrays 7754 * only if we are re-adding a preexisting device. 7755 * So require mddev->pers and MD_DISK_SYNC. 
7756 */ 7757 if (mddev->pers) { 7758 mdu_disk_info_t info; 7759 if (copy_from_user(&info, argp, sizeof(info))) 7760 err = -EFAULT; 7761 else if (!(info.state & (1<<MD_DISK_SYNC))) 7762 /* Need to clear read-only for this */ 7763 break; 7764 else 7765 err = md_add_new_disk(mddev, &info); 7766 goto unlock; 7767 } 7768 break; 7769 } 7770 7771 /* 7772 * The remaining ioctls are changing the state of the 7773 * superblock, so we do not allow them on read-only arrays. 7774 */ 7775 if (!md_is_rdwr(mddev) && mddev->pers) { 7776 if (mddev->ro != MD_AUTO_READ) { 7777 err = -EROFS; 7778 goto unlock; 7779 } 7780 mddev->ro = MD_RDWR; 7781 sysfs_notify_dirent_safe(mddev->sysfs_state); 7782 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7783 /* mddev_unlock will wake thread */ 7784 /* If a device failed while we were read-only, we 7785 * need to make sure the metadata is updated now. 7786 */ 7787 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7788 mddev_unlock(mddev); 7789 wait_event(mddev->sb_wait, 7790 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7791 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7792 mddev_lock_nointr(mddev); 7793 } 7794 } 7795 7796 switch (cmd) { 7797 case ADD_NEW_DISK: 7798 { 7799 mdu_disk_info_t info; 7800 if (copy_from_user(&info, argp, sizeof(info))) 7801 err = -EFAULT; 7802 else 7803 err = md_add_new_disk(mddev, &info); 7804 goto unlock; 7805 } 7806 7807 case CLUSTERED_DISK_NACK: 7808 if (mddev_is_clustered(mddev)) 7809 md_cluster_ops->new_disk_ack(mddev, false); 7810 else 7811 err = -EINVAL; 7812 goto unlock; 7813 7814 case HOT_ADD_DISK: 7815 err = hot_add_disk(mddev, new_decode_dev(arg)); 7816 goto unlock; 7817 7818 case RUN_ARRAY: 7819 err = do_md_run(mddev); 7820 goto unlock; 7821 7822 case SET_BITMAP_FILE: 7823 err = set_bitmap_file(mddev, (int)arg); 7824 goto unlock; 7825 7826 default: 7827 err = -EINVAL; 7828 goto unlock; 7829 } 7830 7831 unlock: 7832 if (mddev->hold_active == UNTIL_IOCTL && 7833 err != -EINVAL) 7834 mddev->hold_active = 0; 7835 mddev_unlock(mddev); 7836 out: 7837 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7838 clear_bit(MD_CLOSING, &mddev->flags); 7839 return err; 7840 } 7841 #ifdef CONFIG_COMPAT md_compat_ioctl(struct block_device * bdev,blk_mode_t mode,unsigned int cmd,unsigned long arg)7842 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7843 unsigned int cmd, unsigned long arg) 7844 { 7845 switch (cmd) { 7846 case HOT_REMOVE_DISK: 7847 case HOT_ADD_DISK: 7848 case SET_DISK_FAULTY: 7849 case SET_BITMAP_FILE: 7850 /* These take in integer arg, do not convert */ 7851 break; 7852 default: 7853 arg = (unsigned long)compat_ptr(arg); 7854 break; 7855 } 7856 7857 return md_ioctl(bdev, mode, cmd, arg); 7858 } 7859 #endif /* CONFIG_COMPAT */ 7860 md_set_read_only(struct block_device * bdev,bool ro)7861 static int md_set_read_only(struct block_device *bdev, bool ro) 7862 { 7863 struct mddev *mddev = bdev->bd_disk->private_data; 7864 int err; 7865 7866 err = mddev_lock(mddev); 7867 if (err) 7868 return err; 7869 7870 if (!mddev->raid_disks && !mddev->external) { 7871 err = -ENODEV; 7872 goto out_unlock; 7873 } 7874 7875 /* 7876 * Transitioning to read-auto need only happen for arrays that call 7877 * md_write_start and which are not ready for writes yet. 
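	 * Clearing the block-device read-only flag therefore only moves such
	 * an array to MD_AUTO_READ here; the first write, which goes through
	 * md_write_start(), completes the switch to MD_RDWR.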
7878 */ 7879 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7880 err = restart_array(mddev); 7881 if (err) 7882 goto out_unlock; 7883 mddev->ro = MD_AUTO_READ; 7884 } 7885 7886 out_unlock: 7887 mddev_unlock(mddev); 7888 return err; 7889 } 7890 md_open(struct gendisk * disk,blk_mode_t mode)7891 static int md_open(struct gendisk *disk, blk_mode_t mode) 7892 { 7893 struct mddev *mddev; 7894 int err; 7895 7896 spin_lock(&all_mddevs_lock); 7897 mddev = mddev_get(disk->private_data); 7898 spin_unlock(&all_mddevs_lock); 7899 if (!mddev) 7900 return -ENODEV; 7901 7902 err = mutex_lock_interruptible(&mddev->open_mutex); 7903 if (err) 7904 goto out; 7905 7906 err = -ENODEV; 7907 if (test_bit(MD_CLOSING, &mddev->flags)) 7908 goto out_unlock; 7909 7910 atomic_inc(&mddev->openers); 7911 mutex_unlock(&mddev->open_mutex); 7912 7913 disk_check_media_change(disk); 7914 return 0; 7915 7916 out_unlock: 7917 mutex_unlock(&mddev->open_mutex); 7918 out: 7919 mddev_put(mddev); 7920 return err; 7921 } 7922 md_release(struct gendisk * disk)7923 static void md_release(struct gendisk *disk) 7924 { 7925 struct mddev *mddev = disk->private_data; 7926 7927 BUG_ON(!mddev); 7928 atomic_dec(&mddev->openers); 7929 mddev_put(mddev); 7930 } 7931 md_check_events(struct gendisk * disk,unsigned int clearing)7932 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7933 { 7934 struct mddev *mddev = disk->private_data; 7935 unsigned int ret = 0; 7936 7937 if (mddev->changed) 7938 ret = DISK_EVENT_MEDIA_CHANGE; 7939 mddev->changed = 0; 7940 return ret; 7941 } 7942 md_free_disk(struct gendisk * disk)7943 static void md_free_disk(struct gendisk *disk) 7944 { 7945 struct mddev *mddev = disk->private_data; 7946 7947 percpu_ref_exit(&mddev->writes_pending); 7948 mddev_free(mddev); 7949 } 7950 7951 const struct block_device_operations md_fops = 7952 { 7953 .owner = THIS_MODULE, 7954 .submit_bio = md_submit_bio, 7955 .open = md_open, 7956 .release = md_release, 7957 .ioctl = md_ioctl, 7958 #ifdef CONFIG_COMPAT 7959 .compat_ioctl = md_compat_ioctl, 7960 #endif 7961 .getgeo = md_getgeo, 7962 .check_events = md_check_events, 7963 .set_read_only = md_set_read_only, 7964 .free_disk = md_free_disk, 7965 }; 7966 md_thread(void * arg)7967 static int md_thread(void *arg) 7968 { 7969 struct md_thread *thread = arg; 7970 7971 /* 7972 * md_thread is a 'system-thread', it's priority should be very 7973 * high. We avoid resource deadlocks individually in each 7974 * raid personality. (RAID5 does preallocation) We also use RR and 7975 * the very same RT priority as kswapd, thus we will never get 7976 * into a priority inversion deadlock. 7977 * 7978 * we definitely have to have equal or higher priority than 7979 * bdflush, otherwise bdflush will deadlock if there are too 7980 * many dirty RAID5 blocks. 7981 */ 7982 7983 allow_signal(SIGKILL); 7984 while (!kthread_should_stop()) { 7985 7986 /* We need to wait INTERRUPTIBLE so that 7987 * we don't add to the load-average. 
7988 * That means we need to be sure no signals are 7989 * pending 7990 */ 7991 if (signal_pending(current)) 7992 flush_signals(current); 7993 7994 wait_event_interruptible_timeout 7995 (thread->wqueue, 7996 test_bit(THREAD_WAKEUP, &thread->flags) 7997 || kthread_should_stop() || kthread_should_park(), 7998 thread->timeout); 7999 8000 clear_bit(THREAD_WAKEUP, &thread->flags); 8001 if (kthread_should_park()) 8002 kthread_parkme(); 8003 if (!kthread_should_stop()) 8004 thread->run(thread); 8005 } 8006 8007 return 0; 8008 } 8009 md_wakeup_thread_directly(struct md_thread __rcu * thread)8010 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8011 { 8012 struct md_thread *t; 8013 8014 rcu_read_lock(); 8015 t = rcu_dereference(thread); 8016 if (t) 8017 wake_up_process(t->tsk); 8018 rcu_read_unlock(); 8019 } 8020 md_wakeup_thread(struct md_thread __rcu * thread)8021 void md_wakeup_thread(struct md_thread __rcu *thread) 8022 { 8023 struct md_thread *t; 8024 8025 rcu_read_lock(); 8026 t = rcu_dereference(thread); 8027 if (t) { 8028 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8029 set_bit(THREAD_WAKEUP, &t->flags); 8030 wake_up(&t->wqueue); 8031 } 8032 rcu_read_unlock(); 8033 } 8034 EXPORT_SYMBOL(md_wakeup_thread); 8035 md_register_thread(void (* run)(struct md_thread *),struct mddev * mddev,const char * name)8036 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8037 struct mddev *mddev, const char *name) 8038 { 8039 struct md_thread *thread; 8040 8041 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8042 if (!thread) 8043 return NULL; 8044 8045 init_waitqueue_head(&thread->wqueue); 8046 8047 thread->run = run; 8048 thread->mddev = mddev; 8049 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8050 thread->tsk = kthread_run(md_thread, thread, 8051 "%s_%s", 8052 mdname(thread->mddev), 8053 name); 8054 if (IS_ERR(thread->tsk)) { 8055 kfree(thread); 8056 return NULL; 8057 } 8058 return thread; 8059 } 8060 EXPORT_SYMBOL(md_register_thread); 8061 md_unregister_thread(struct mddev * mddev,struct md_thread __rcu ** threadp)8062 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8063 { 8064 struct md_thread *thread = rcu_dereference_protected(*threadp, 8065 lockdep_is_held(&mddev->reconfig_mutex)); 8066 8067 if (!thread) 8068 return; 8069 8070 rcu_assign_pointer(*threadp, NULL); 8071 synchronize_rcu(); 8072 8073 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8074 kthread_stop(thread->tsk); 8075 kfree(thread); 8076 } 8077 EXPORT_SYMBOL(md_unregister_thread); 8078 md_error(struct mddev * mddev,struct md_rdev * rdev)8079 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8080 { 8081 if (!rdev || test_bit(Faulty, &rdev->flags)) 8082 return; 8083 8084 if (!mddev->pers || !mddev->pers->error_handler) 8085 return; 8086 mddev->pers->error_handler(mddev, rdev); 8087 8088 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8089 return; 8090 8091 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8092 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8093 sysfs_notify_dirent_safe(rdev->sysfs_state); 8094 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8095 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8096 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8097 md_wakeup_thread(mddev->thread); 8098 } 8099 if (mddev->event_work.func) 8100 queue_work(md_misc_wq, &mddev->event_work); 8101 md_new_event(); 8102 } 8103 EXPORT_SYMBOL(md_error); 8104 8105 /* seq_file implementation /proc/mdstat */ 8106 
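/*
 * Illustrative example of the /proc/mdstat output that the seq_file code
 * below produces (device names and all numbers are made up):
 *
 *	Personalities : [raid1] [raid6] [raid5] [raid4]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048512 blocks [2/2] [UU]
 *	      [====>...............]  resync = 23.4% (245760/1048512) finish=0.6min speed=20480K/sec
 *	unused devices: <none>
 */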
status_unused(struct seq_file * seq)8107 static void status_unused(struct seq_file *seq) 8108 { 8109 int i = 0; 8110 struct md_rdev *rdev; 8111 8112 seq_printf(seq, "unused devices: "); 8113 8114 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8115 i++; 8116 seq_printf(seq, "%pg ", rdev->bdev); 8117 } 8118 if (!i) 8119 seq_printf(seq, "<none>"); 8120 8121 seq_printf(seq, "\n"); 8122 } 8123 status_personalities(struct seq_file * seq)8124 static void status_personalities(struct seq_file *seq) 8125 { 8126 struct md_personality *pers; 8127 8128 seq_puts(seq, "Personalities : "); 8129 spin_lock(&pers_lock); 8130 list_for_each_entry(pers, &pers_list, list) 8131 seq_printf(seq, "[%s] ", pers->name); 8132 8133 spin_unlock(&pers_lock); 8134 seq_puts(seq, "\n"); 8135 } 8136 status_resync(struct seq_file * seq,struct mddev * mddev)8137 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8138 { 8139 sector_t max_sectors, resync, res; 8140 unsigned long dt, db = 0; 8141 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8142 int scale, recovery_active; 8143 unsigned int per_milli; 8144 8145 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8146 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8147 max_sectors = mddev->resync_max_sectors; 8148 else 8149 max_sectors = mddev->dev_sectors; 8150 8151 resync = mddev->curr_resync; 8152 if (resync < MD_RESYNC_ACTIVE) { 8153 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8154 /* Still cleaning up */ 8155 resync = max_sectors; 8156 } else if (resync > max_sectors) { 8157 resync = max_sectors; 8158 } else { 8159 res = atomic_read(&mddev->recovery_active); 8160 /* 8161 * Resync has started, but the subtraction has overflowed or 8162 * yielded one of the special values. Force it to active to 8163 * ensure the status reports an active resync. 8164 */ 8165 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8166 resync = MD_RESYNC_ACTIVE; 8167 else 8168 resync -= res; 8169 } 8170 8171 if (resync == MD_RESYNC_NONE) { 8172 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8173 struct md_rdev *rdev; 8174 8175 rdev_for_each(rdev, mddev) 8176 if (rdev->raid_disk >= 0 && 8177 !test_bit(Faulty, &rdev->flags) && 8178 rdev->recovery_offset != MaxSector && 8179 rdev->recovery_offset) { 8180 seq_printf(seq, "\trecover=REMOTE"); 8181 return 1; 8182 } 8183 if (mddev->reshape_position != MaxSector) 8184 seq_printf(seq, "\treshape=REMOTE"); 8185 else 8186 seq_printf(seq, "\tresync=REMOTE"); 8187 return 1; 8188 } 8189 if (mddev->recovery_cp < MaxSector) { 8190 seq_printf(seq, "\tresync=PENDING"); 8191 return 1; 8192 } 8193 return 0; 8194 } 8195 if (resync < MD_RESYNC_ACTIVE) { 8196 seq_printf(seq, "\tresync=DELAYED"); 8197 return 1; 8198 } 8199 8200 WARN_ON(max_sectors == 0); 8201 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8202 * in a sector_t, and (max_sectors>>scale) will fit in a 8203 * u32, as those are the requirements for sector_div. 
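	 * (1000 < 2^10, so shifting right by at least 10 before the multiply
	 * keeps the product no larger than the original sector count, and
	 * the loop below raises 'scale' further for very large arrays.)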
8204 * Thus 'scale' must be at least 10 8205 */ 8206 scale = 10; 8207 if (sizeof(sector_t) > sizeof(unsigned long)) { 8208 while ( max_sectors/2 > (1ULL<<(scale+32))) 8209 scale++; 8210 } 8211 res = (resync>>scale)*1000; 8212 sector_div(res, (u32)((max_sectors>>scale)+1)); 8213 8214 per_milli = res; 8215 { 8216 int i, x = per_milli/50, y = 20-x; 8217 seq_printf(seq, "["); 8218 for (i = 0; i < x; i++) 8219 seq_printf(seq, "="); 8220 seq_printf(seq, ">"); 8221 for (i = 0; i < y; i++) 8222 seq_printf(seq, "."); 8223 seq_printf(seq, "] "); 8224 } 8225 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8226 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8227 "reshape" : 8228 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8229 "check" : 8230 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 8231 "resync" : "recovery"))), 8232 per_milli/10, per_milli % 10, 8233 (unsigned long long) resync/2, 8234 (unsigned long long) max_sectors/2); 8235 8236 /* 8237 * dt: time from mark until now 8238 * db: blocks written from mark until now 8239 * rt: remaining time 8240 * 8241 * rt is a sector_t, which is always 64bit now. We are keeping 8242 * the original algorithm, but it is not really necessary. 8243 * 8244 * Original algorithm: 8245 * So we divide before multiply in case it is 32bit and close 8246 * to the limit. 8247 * We scale the divisor (db) by 32 to avoid losing precision 8248 * near the end of resync when the number of remaining sectors 8249 * is close to 'db'. 8250 * We then divide rt by 32 after multiplying by db to compensate. 8251 * The '+1' avoids division by zero if db is very small. 8252 */ 8253 dt = ((jiffies - mddev->resync_mark) / HZ); 8254 if (!dt) dt++; 8255 8256 curr_mark_cnt = mddev->curr_mark_cnt; 8257 recovery_active = atomic_read(&mddev->recovery_active); 8258 resync_mark_cnt = mddev->resync_mark_cnt; 8259 8260 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8261 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8262 8263 rt = max_sectors - resync; /* number of remaining sectors */ 8264 rt = div64_u64(rt, db/32+1); 8265 rt *= dt; 8266 rt >>= 5; 8267 8268 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8269 ((unsigned long)rt % 60)/6); 8270 8271 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8272 return 1; 8273 } 8274 md_seq_start(struct seq_file * seq,loff_t * pos)8275 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8276 __acquires(&all_mddevs_lock) 8277 { 8278 seq->poll_event = atomic_read(&md_event_count); 8279 spin_lock(&all_mddevs_lock); 8280 8281 return seq_list_start_head(&all_mddevs, *pos); 8282 } 8283 md_seq_next(struct seq_file * seq,void * v,loff_t * pos)8284 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8285 { 8286 return seq_list_next(v, &all_mddevs, pos); 8287 } 8288 md_seq_stop(struct seq_file * seq,void * v)8289 static void md_seq_stop(struct seq_file *seq, void *v) 8290 __releases(&all_mddevs_lock) 8291 { 8292 spin_unlock(&all_mddevs_lock); 8293 } 8294 md_bitmap_status(struct seq_file * seq,struct mddev * mddev)8295 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8296 { 8297 struct md_bitmap_stats stats; 8298 unsigned long used_pages; 8299 unsigned long chunk_kb; 8300 int err; 8301 8302 err = md_bitmap_get_stats(mddev->bitmap, &stats); 8303 if (err) 8304 return; 8305 8306 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8307 used_pages = stats.pages - stats.missing_pages; 8308 8309 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8310 used_pages, stats.pages, used_pages 
<< (PAGE_SHIFT - 10), 8311 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8312 chunk_kb ? "KB" : "B"); 8313 8314 if (stats.file) { 8315 seq_puts(seq, ", file: "); 8316 seq_file_path(seq, stats.file, " \t\n"); 8317 } 8318 8319 seq_putc(seq, '\n'); 8320 } 8321 md_seq_show(struct seq_file * seq,void * v)8322 static int md_seq_show(struct seq_file *seq, void *v) 8323 { 8324 struct mddev *mddev; 8325 sector_t sectors; 8326 struct md_rdev *rdev; 8327 8328 if (v == &all_mddevs) { 8329 status_personalities(seq); 8330 if (list_empty(&all_mddevs)) 8331 status_unused(seq); 8332 return 0; 8333 } 8334 8335 mddev = list_entry(v, struct mddev, all_mddevs); 8336 if (!mddev_get(mddev)) 8337 return 0; 8338 8339 spin_unlock(&all_mddevs_lock); 8340 8341 /* prevent bitmap to be freed after checking */ 8342 mutex_lock(&mddev->bitmap_info.mutex); 8343 8344 spin_lock(&mddev->lock); 8345 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8346 seq_printf(seq, "%s : %sactive", mdname(mddev), 8347 mddev->pers ? "" : "in"); 8348 if (mddev->pers) { 8349 if (mddev->ro == MD_RDONLY) 8350 seq_printf(seq, " (read-only)"); 8351 if (mddev->ro == MD_AUTO_READ) 8352 seq_printf(seq, " (auto-read-only)"); 8353 seq_printf(seq, " %s", mddev->pers->name); 8354 } 8355 8356 sectors = 0; 8357 rcu_read_lock(); 8358 rdev_for_each_rcu(rdev, mddev) { 8359 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8360 8361 if (test_bit(WriteMostly, &rdev->flags)) 8362 seq_printf(seq, "(W)"); 8363 if (test_bit(Journal, &rdev->flags)) 8364 seq_printf(seq, "(J)"); 8365 if (test_bit(Faulty, &rdev->flags)) { 8366 seq_printf(seq, "(F)"); 8367 continue; 8368 } 8369 if (rdev->raid_disk < 0) 8370 seq_printf(seq, "(S)"); /* spare */ 8371 if (test_bit(Replacement, &rdev->flags)) 8372 seq_printf(seq, "(R)"); 8373 sectors += rdev->sectors; 8374 } 8375 rcu_read_unlock(); 8376 8377 if (!list_empty(&mddev->disks)) { 8378 if (mddev->pers) 8379 seq_printf(seq, "\n %llu blocks", 8380 (unsigned long long) 8381 mddev->array_sectors / 2); 8382 else 8383 seq_printf(seq, "\n %llu blocks", 8384 (unsigned long long)sectors / 2); 8385 } 8386 if (mddev->persistent) { 8387 if (mddev->major_version != 0 || 8388 mddev->minor_version != 90) { 8389 seq_printf(seq," super %d.%d", 8390 mddev->major_version, 8391 mddev->minor_version); 8392 } 8393 } else if (mddev->external) 8394 seq_printf(seq, " super external:%s", 8395 mddev->metadata_type); 8396 else 8397 seq_printf(seq, " super non-persistent"); 8398 8399 if (mddev->pers) { 8400 mddev->pers->status(seq, mddev); 8401 seq_printf(seq, "\n "); 8402 if (mddev->pers->sync_request) { 8403 if (status_resync(seq, mddev)) 8404 seq_printf(seq, "\n "); 8405 } 8406 } else 8407 seq_printf(seq, "\n "); 8408 8409 md_bitmap_status(seq, mddev); 8410 8411 seq_printf(seq, "\n"); 8412 } 8413 spin_unlock(&mddev->lock); 8414 mutex_unlock(&mddev->bitmap_info.mutex); 8415 spin_lock(&all_mddevs_lock); 8416 8417 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8418 status_unused(seq); 8419 8420 if (atomic_dec_and_test(&mddev->active)) 8421 __mddev_put(mddev); 8422 8423 return 0; 8424 } 8425 8426 static const struct seq_operations md_seq_ops = { 8427 .start = md_seq_start, 8428 .next = md_seq_next, 8429 .stop = md_seq_stop, 8430 .show = md_seq_show, 8431 }; 8432 md_seq_open(struct inode * inode,struct file * file)8433 static int md_seq_open(struct inode *inode, struct file *file) 8434 { 8435 struct seq_file *seq; 8436 int error; 8437 8438 error = seq_open(file, &md_seq_ops); 8439 if (error) 8440 return 
error; 8441 8442 seq = file->private_data; 8443 seq->poll_event = atomic_read(&md_event_count); 8444 return error; 8445 } 8446 8447 static int md_unloading; mdstat_poll(struct file * filp,poll_table * wait)8448 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8449 { 8450 struct seq_file *seq = filp->private_data; 8451 __poll_t mask; 8452 8453 if (md_unloading) 8454 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8455 poll_wait(filp, &md_event_waiters, wait); 8456 8457 /* always allow read */ 8458 mask = EPOLLIN | EPOLLRDNORM; 8459 8460 if (seq->poll_event != atomic_read(&md_event_count)) 8461 mask |= EPOLLERR | EPOLLPRI; 8462 return mask; 8463 } 8464 8465 static const struct proc_ops mdstat_proc_ops = { 8466 .proc_open = md_seq_open, 8467 .proc_read = seq_read, 8468 .proc_lseek = seq_lseek, 8469 .proc_release = seq_release, 8470 .proc_poll = mdstat_poll, 8471 }; 8472 register_md_personality(struct md_personality * p)8473 int register_md_personality(struct md_personality *p) 8474 { 8475 pr_debug("md: %s personality registered for level %d\n", 8476 p->name, p->level); 8477 spin_lock(&pers_lock); 8478 list_add_tail(&p->list, &pers_list); 8479 spin_unlock(&pers_lock); 8480 return 0; 8481 } 8482 EXPORT_SYMBOL(register_md_personality); 8483 unregister_md_personality(struct md_personality * p)8484 int unregister_md_personality(struct md_personality *p) 8485 { 8486 pr_debug("md: %s personality unregistered\n", p->name); 8487 spin_lock(&pers_lock); 8488 list_del_init(&p->list); 8489 spin_unlock(&pers_lock); 8490 return 0; 8491 } 8492 EXPORT_SYMBOL(unregister_md_personality); 8493 register_md_cluster_operations(struct md_cluster_operations * ops,struct module * module)8494 int register_md_cluster_operations(struct md_cluster_operations *ops, 8495 struct module *module) 8496 { 8497 int ret = 0; 8498 spin_lock(&pers_lock); 8499 if (md_cluster_ops != NULL) 8500 ret = -EALREADY; 8501 else { 8502 md_cluster_ops = ops; 8503 md_cluster_mod = module; 8504 } 8505 spin_unlock(&pers_lock); 8506 return ret; 8507 } 8508 EXPORT_SYMBOL(register_md_cluster_operations); 8509 unregister_md_cluster_operations(void)8510 int unregister_md_cluster_operations(void) 8511 { 8512 spin_lock(&pers_lock); 8513 md_cluster_ops = NULL; 8514 spin_unlock(&pers_lock); 8515 return 0; 8516 } 8517 EXPORT_SYMBOL(unregister_md_cluster_operations); 8518 md_setup_cluster(struct mddev * mddev,int nodes)8519 int md_setup_cluster(struct mddev *mddev, int nodes) 8520 { 8521 int ret; 8522 if (!md_cluster_ops) 8523 request_module("md-cluster"); 8524 spin_lock(&pers_lock); 8525 /* ensure module won't be unloaded */ 8526 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8527 pr_warn("can't find md-cluster module or get its reference.\n"); 8528 spin_unlock(&pers_lock); 8529 return -ENOENT; 8530 } 8531 spin_unlock(&pers_lock); 8532 8533 ret = md_cluster_ops->join(mddev, nodes); 8534 if (!ret) 8535 mddev->safemode_delay = 0; 8536 return ret; 8537 } 8538 md_cluster_stop(struct mddev * mddev)8539 void md_cluster_stop(struct mddev *mddev) 8540 { 8541 if (!md_cluster_ops) 8542 return; 8543 md_cluster_ops->leave(mddev); 8544 module_put(md_cluster_mod); 8545 } 8546 is_mddev_idle(struct mddev * mddev,int init)8547 static int is_mddev_idle(struct mddev *mddev, int init) 8548 { 8549 struct md_rdev *rdev; 8550 int idle; 8551 int curr_events; 8552 8553 idle = 1; 8554 rcu_read_lock(); 8555 rdev_for_each_rcu(rdev, mddev) { 8556 struct gendisk *disk = rdev->bdev->bd_disk; 8557 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 
			atomic_read(&disk->sync_io);
		/* sync IO will cause sync_io to increase before the disk_stats
		 * as sync_io is counted when a request starts, and
		 * disk_stats is counted when it completes.
		 * So resync activity will cause curr_events to be smaller than
		 * when there was no such activity.
		 * non-sync IO will cause disk_stat to increase without
		 * increasing sync_io so curr_events will (eventually)
		 * be larger than it was before.  Once it becomes
		 * substantially larger, the test below will cause
		 * the array to appear non-idle, and resync will slow
		 * down.
		 * If there is a lot of outstanding resync activity when
		 * we set last_events to curr_events, then all that activity
		 * completing might cause the array to appear non-idle
		 * and resync will be slowed down even though there might
		 * not have been non-resync activity.  This will only
		 * happen once though.  'last_events' will soon reflect
		 * the state where there are few or no outstanding
		 * resync requests, and further resync activity will
		 * always make curr_events less than last_events.
		 *
		 */
		if (init || curr_events - rdev->last_events > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	rcu_read_unlock();
	return idle;
}

void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	/* another "blocks" (512 byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
 */
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return true;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* need to switch to read/write */
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
		return true;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   is_md_suspended(mddev));
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
}
EXPORT_SYMBOL(md_write_start);

/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts.  Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
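 *
 * An illustrative sketch of the pattern in a personality that splits
 * one incoming request into two parts:
 *
 *	if (!md_write_start(mddev, bio))
 *		return;				(array suspended, do not proceed)
 *	...issue the first part...
 *	md_write_inc(mddev, bio);		(account the second part)
 *	...each part calls md_write_end() when it completes...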
8667 */ md_write_inc(struct mddev * mddev,struct bio * bi)8668 void md_write_inc(struct mddev *mddev, struct bio *bi) 8669 { 8670 if (bio_data_dir(bi) != WRITE) 8671 return; 8672 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8673 percpu_ref_get(&mddev->writes_pending); 8674 } 8675 EXPORT_SYMBOL(md_write_inc); 8676 md_write_end(struct mddev * mddev)8677 void md_write_end(struct mddev *mddev) 8678 { 8679 percpu_ref_put(&mddev->writes_pending); 8680 8681 if (mddev->safemode == 2) 8682 md_wakeup_thread(mddev->thread); 8683 else if (mddev->safemode_delay) 8684 /* The roundup() ensures this only performs locking once 8685 * every ->safemode_delay jiffies 8686 */ 8687 mod_timer(&mddev->safemode_timer, 8688 roundup(jiffies, mddev->safemode_delay) + 8689 mddev->safemode_delay); 8690 } 8691 8692 EXPORT_SYMBOL(md_write_end); 8693 8694 /* This is used by raid0 and raid10 */ md_submit_discard_bio(struct mddev * mddev,struct md_rdev * rdev,struct bio * bio,sector_t start,sector_t size)8695 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8696 struct bio *bio, sector_t start, sector_t size) 8697 { 8698 struct bio *discard_bio = NULL; 8699 8700 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8701 &discard_bio) || !discard_bio) 8702 return; 8703 8704 bio_chain(discard_bio, bio); 8705 bio_clone_blkg_association(discard_bio, bio); 8706 if (mddev->gendisk) 8707 trace_block_bio_remap(discard_bio, 8708 disk_devt(mddev->gendisk), 8709 bio->bi_iter.bi_sector); 8710 submit_bio_noacct(discard_bio); 8711 } 8712 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8713 md_bitmap_start(struct mddev * mddev,struct md_io_clone * md_io_clone)8714 static void md_bitmap_start(struct mddev *mddev, 8715 struct md_io_clone *md_io_clone) 8716 { 8717 if (mddev->pers->bitmap_sector) 8718 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 8719 &md_io_clone->sectors); 8720 8721 md_bitmap_startwrite(mddev->bitmap, md_io_clone->offset, 8722 md_io_clone->sectors); 8723 } 8724 md_bitmap_end(struct mddev * mddev,struct md_io_clone * md_io_clone)8725 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 8726 { 8727 md_bitmap_endwrite(mddev->bitmap, md_io_clone->offset, 8728 md_io_clone->sectors); 8729 } 8730 md_end_clone_io(struct bio * bio)8731 static void md_end_clone_io(struct bio *bio) 8732 { 8733 struct md_io_clone *md_io_clone = bio->bi_private; 8734 struct bio *orig_bio = md_io_clone->orig_bio; 8735 struct mddev *mddev = md_io_clone->mddev; 8736 8737 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8738 md_bitmap_end(mddev, md_io_clone); 8739 8740 if (bio->bi_status && !orig_bio->bi_status) 8741 orig_bio->bi_status = bio->bi_status; 8742 8743 if (md_io_clone->start_time) 8744 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8745 8746 bio_put(bio); 8747 bio_endio(orig_bio); 8748 percpu_ref_put(&mddev->active_io); 8749 } 8750 md_clone_bio(struct mddev * mddev,struct bio ** bio)8751 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8752 { 8753 struct block_device *bdev = (*bio)->bi_bdev; 8754 struct md_io_clone *md_io_clone; 8755 struct bio *clone = 8756 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8757 8758 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8759 md_io_clone->orig_bio = *bio; 8760 md_io_clone->mddev = mddev; 8761 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8762 md_io_clone->start_time = bio_start_io_acct(*bio); 8763 8764 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { 8765 md_io_clone->offset = 
						(*bio)->bi_iter.bi_sector;
		md_io_clone->sectors = bio_sectors(*bio);
		md_bitmap_start(mddev, md_io_clone);
	}

	clone->bi_end_io = md_end_clone_io;
	clone->bi_private = md_io_clone;
	*bio = clone;
}

void md_account_bio(struct mddev *mddev, struct bio **bio)
{
	percpu_ref_get(&mddev->active_io);
	md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio);

/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
void md_allow_write(struct mddev *mddev)
{
	if (!mddev->pers)
		return;
	if (!md_is_rdwr(mddev))
		return;
	if (!mddev->pers->sync_request)
		return;

	spin_lock(&mddev->lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
		spin_unlock(&mddev->lock);
		md_update_sb(mddev, 0);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	} else
		spin_unlock(&mddev->lock);
}
EXPORT_SYMBOL_GPL(md_allow_write);

#define SYNC_MARKS	10
#define SYNC_MARK_STEP	(3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0, window;
	sector_t max_sectors, j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark, m;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	char *desc, *action = NULL;
	struct blk_plug plug;
	int ret;

	/* just in case the thread restarts...
*/ 8837 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8838 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8839 return; 8840 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8841 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8842 return; 8843 } 8844 8845 if (mddev_is_clustered(mddev)) { 8846 ret = md_cluster_ops->resync_start(mddev); 8847 if (ret) 8848 goto skip; 8849 8850 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8851 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8852 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8853 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8854 && ((unsigned long long)mddev->curr_resync_completed 8855 < (unsigned long long)mddev->resync_max_sectors)) 8856 goto skip; 8857 } 8858 8859 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8860 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8861 desc = "data-check"; 8862 action = "check"; 8863 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8864 desc = "requested-resync"; 8865 action = "repair"; 8866 } else 8867 desc = "resync"; 8868 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8869 desc = "reshape"; 8870 else 8871 desc = "recovery"; 8872 8873 mddev->last_sync_action = action ?: desc; 8874 8875 /* 8876 * Before starting a resync we must have set curr_resync to 8877 * 2, and then checked that every "conflicting" array has curr_resync 8878 * less than ours. When we find one that is the same or higher 8879 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8880 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8881 * This will mean we have to start checking from the beginning again. 8882 * 8883 */ 8884 8885 do { 8886 int mddev2_minor = -1; 8887 mddev->curr_resync = MD_RESYNC_DELAYED; 8888 8889 try_again: 8890 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8891 goto skip; 8892 spin_lock(&all_mddevs_lock); 8893 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8894 if (test_bit(MD_DELETED, &mddev2->flags)) 8895 continue; 8896 if (mddev2 == mddev) 8897 continue; 8898 if (!mddev->parallel_resync 8899 && mddev2->curr_resync 8900 && match_mddev_units(mddev, mddev2)) { 8901 DEFINE_WAIT(wq); 8902 if (mddev < mddev2 && 8903 mddev->curr_resync == MD_RESYNC_DELAYED) { 8904 /* arbitrarily yield */ 8905 mddev->curr_resync = MD_RESYNC_YIELDED; 8906 wake_up(&resync_wait); 8907 } 8908 if (mddev > mddev2 && 8909 mddev->curr_resync == MD_RESYNC_YIELDED) 8910 /* no need to wait here, we can wait the next 8911 * time 'round when curr_resync == 2 8912 */ 8913 continue; 8914 /* We need to wait 'interruptible' so as not to 8915 * contribute to the load average, and not to 8916 * be caught by 'softlockup' 8917 */ 8918 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8919 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8920 mddev2->curr_resync >= mddev->curr_resync) { 8921 if (mddev2_minor != mddev2->md_minor) { 8922 mddev2_minor = mddev2->md_minor; 8923 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8924 desc, mdname(mddev), 8925 mdname(mddev2)); 8926 } 8927 spin_unlock(&all_mddevs_lock); 8928 8929 if (signal_pending(current)) 8930 flush_signals(current); 8931 schedule(); 8932 finish_wait(&resync_wait, &wq); 8933 goto try_again; 8934 } 8935 finish_wait(&resync_wait, &wq); 8936 } 8937 } 8938 spin_unlock(&all_mddevs_lock); 8939 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8940 8941 j = 0; 8942 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8943 
/* resync follows the size requested by the personality, 8944 * which defaults to physical size, but can be virtual size 8945 */ 8946 max_sectors = mddev->resync_max_sectors; 8947 atomic64_set(&mddev->resync_mismatches, 0); 8948 /* we don't use the checkpoint if there's a bitmap */ 8949 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8950 j = mddev->resync_min; 8951 else if (!mddev->bitmap) 8952 j = mddev->recovery_cp; 8953 8954 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8955 max_sectors = mddev->resync_max_sectors; 8956 /* 8957 * If the original node aborts reshaping then we continue the 8958 * reshaping, so set j again to avoid restart reshape from the 8959 * first beginning 8960 */ 8961 if (mddev_is_clustered(mddev) && 8962 mddev->reshape_position != MaxSector) 8963 j = mddev->reshape_position; 8964 } else { 8965 /* recovery follows the physical size of devices */ 8966 max_sectors = mddev->dev_sectors; 8967 j = MaxSector; 8968 rcu_read_lock(); 8969 rdev_for_each_rcu(rdev, mddev) 8970 if (rdev->raid_disk >= 0 && 8971 !test_bit(Journal, &rdev->flags) && 8972 !test_bit(Faulty, &rdev->flags) && 8973 !test_bit(In_sync, &rdev->flags) && 8974 rdev->recovery_offset < j) 8975 j = rdev->recovery_offset; 8976 rcu_read_unlock(); 8977 8978 /* If there is a bitmap, we need to make sure all 8979 * writes that started before we added a spare 8980 * complete before we start doing a recovery. 8981 * Otherwise the write might complete and (via 8982 * bitmap_endwrite) set a bit in the bitmap after the 8983 * recovery has checked that bit and skipped that 8984 * region. 8985 */ 8986 if (mddev->bitmap) { 8987 mddev->pers->quiesce(mddev, 1); 8988 mddev->pers->quiesce(mddev, 0); 8989 } 8990 } 8991 8992 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8993 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8994 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8995 speed_max(mddev), desc); 8996 8997 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8998 8999 io_sectors = 0; 9000 for (m = 0; m < SYNC_MARKS; m++) { 9001 mark[m] = jiffies; 9002 mark_cnt[m] = io_sectors; 9003 } 9004 last_mark = 0; 9005 mddev->resync_mark = mark[last_mark]; 9006 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9007 9008 /* 9009 * Tune reconstruction: 9010 */ 9011 window = 32 * (PAGE_SIZE / 512); 9012 pr_debug("md: using %dk window, over a total of %lluk.\n", 9013 window/2, (unsigned long long)max_sectors/2); 9014 9015 atomic_set(&mddev->recovery_active, 0); 9016 last_check = 0; 9017 9018 if (j >= MD_RESYNC_ACTIVE) { 9019 pr_debug("md: resuming %s of %s from checkpoint.\n", 9020 desc, mdname(mddev)); 9021 mddev->curr_resync = j; 9022 } else 9023 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9024 mddev->curr_resync_completed = j; 9025 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9026 md_new_event(); 9027 update_time = jiffies; 9028 9029 blk_start_plug(&plug); 9030 while (j < max_sectors) { 9031 sector_t sectors; 9032 9033 skipped = 0; 9034 9035 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9036 ((mddev->curr_resync > mddev->curr_resync_completed && 9037 (mddev->curr_resync - mddev->curr_resync_completed) 9038 > (max_sectors >> 4)) || 9039 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9040 (j - mddev->curr_resync_completed)*2 9041 >= mddev->resync_max - mddev->curr_resync_completed || 9042 mddev->curr_resync_completed > mddev->resync_max 9043 )) { 9044 /* time to 
update curr_resync_completed */ 9045 wait_event(mddev->recovery_wait, 9046 atomic_read(&mddev->recovery_active) == 0); 9047 mddev->curr_resync_completed = j; 9048 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9049 j > mddev->recovery_cp) 9050 mddev->recovery_cp = j; 9051 update_time = jiffies; 9052 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9053 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9054 } 9055 9056 while (j >= mddev->resync_max && 9057 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9058 /* As this condition is controlled by user-space, 9059 * we can block indefinitely, so use '_interruptible' 9060 * to avoid triggering warnings. 9061 */ 9062 flush_signals(current); /* just in case */ 9063 wait_event_interruptible(mddev->recovery_wait, 9064 mddev->resync_max > j 9065 || test_bit(MD_RECOVERY_INTR, 9066 &mddev->recovery)); 9067 } 9068 9069 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9070 break; 9071 9072 sectors = mddev->pers->sync_request(mddev, j, &skipped); 9073 if (sectors == 0) { 9074 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9075 break; 9076 } 9077 9078 if (!skipped) { /* actual IO requested */ 9079 io_sectors += sectors; 9080 atomic_add(sectors, &mddev->recovery_active); 9081 } 9082 9083 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9084 break; 9085 9086 j += sectors; 9087 if (j > max_sectors) 9088 /* when skipping, extra large numbers can be returned. */ 9089 j = max_sectors; 9090 if (j >= MD_RESYNC_ACTIVE) 9091 mddev->curr_resync = j; 9092 mddev->curr_mark_cnt = io_sectors; 9093 if (last_check == 0) 9094 /* this is the earliest that rebuild will be 9095 * visible in /proc/mdstat 9096 */ 9097 md_new_event(); 9098 9099 if (last_check + window > io_sectors || j == max_sectors) 9100 continue; 9101 9102 last_check = io_sectors; 9103 repeat: 9104 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9105 /* step marks */ 9106 int next = (last_mark+1) % SYNC_MARKS; 9107 9108 mddev->resync_mark = mark[next]; 9109 mddev->resync_mark_cnt = mark_cnt[next]; 9110 mark[next] = jiffies; 9111 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9112 last_mark = next; 9113 } 9114 9115 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9116 break; 9117 9118 /* 9119 * this loop exits only if either when we are slower than 9120 * the 'hard' speed limit, or the system was IO-idle for 9121 * a jiffy. 9122 * the system might be non-idle CPU-wise, but we only care 9123 * about not overloading the IO subsystem. (things like an 9124 * e2fsck being done on the RAID array should execute fast) 9125 */ 9126 cond_resched(); 9127 9128 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9129 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9130 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9131 9132 if (currspeed > speed_min(mddev)) { 9133 if (currspeed > speed_max(mddev)) { 9134 msleep(500); 9135 goto repeat; 9136 } 9137 if (!is_mddev_idle(mddev, 0)) { 9138 /* 9139 * Give other IO more of a chance. 9140 * The faster the devices, the less we wait. 9141 */ 9142 wait_event(mddev->recovery_wait, 9143 !atomic_read(&mddev->recovery_active)); 9144 } 9145 } 9146 } 9147 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9148 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9149 ? 
"interrupted" : "done"); 9150 /* 9151 * this also signals 'finished resyncing' to md_stop 9152 */ 9153 blk_finish_plug(&plug); 9154 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9155 9156 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9157 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9158 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9159 mddev->curr_resync_completed = mddev->curr_resync; 9160 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9161 } 9162 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9163 9164 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9165 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9166 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9167 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9168 if (mddev->curr_resync >= mddev->recovery_cp) { 9169 pr_debug("md: checkpointing %s of %s.\n", 9170 desc, mdname(mddev)); 9171 if (test_bit(MD_RECOVERY_ERROR, 9172 &mddev->recovery)) 9173 mddev->recovery_cp = 9174 mddev->curr_resync_completed; 9175 else 9176 mddev->recovery_cp = 9177 mddev->curr_resync; 9178 } 9179 } else 9180 mddev->recovery_cp = MaxSector; 9181 } else { 9182 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9183 mddev->curr_resync = MaxSector; 9184 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9185 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9186 rcu_read_lock(); 9187 rdev_for_each_rcu(rdev, mddev) 9188 if (rdev->raid_disk >= 0 && 9189 mddev->delta_disks >= 0 && 9190 !test_bit(Journal, &rdev->flags) && 9191 !test_bit(Faulty, &rdev->flags) && 9192 !test_bit(In_sync, &rdev->flags) && 9193 rdev->recovery_offset < mddev->curr_resync) 9194 rdev->recovery_offset = mddev->curr_resync; 9195 rcu_read_unlock(); 9196 } 9197 } 9198 } 9199 skip: 9200 /* set CHANGE_PENDING here since maybe another update is needed, 9201 * so other nodes are informed. It should be harmless for normal 9202 * raid */ 9203 set_mask_bits(&mddev->sb_flags, 0, 9204 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9205 9206 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9207 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9208 mddev->delta_disks > 0 && 9209 mddev->pers->finish_reshape && 9210 mddev->pers->size && 9211 mddev->queue) { 9212 mddev_lock_nointr(mddev); 9213 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9214 mddev_unlock(mddev); 9215 if (!mddev_is_clustered(mddev)) 9216 set_capacity_and_notify(mddev->gendisk, 9217 mddev->array_sectors); 9218 } 9219 9220 spin_lock(&mddev->lock); 9221 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9222 /* We completed so min/max setting can be forgotten if used. 
*/ 9223 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9224 mddev->resync_min = 0; 9225 mddev->resync_max = MaxSector; 9226 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9227 mddev->resync_min = mddev->curr_resync_completed; 9228 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9229 mddev->curr_resync = MD_RESYNC_NONE; 9230 spin_unlock(&mddev->lock); 9231 9232 wake_up(&resync_wait); 9233 wake_up(&mddev->sb_wait); 9234 md_wakeup_thread(mddev->thread); 9235 return; 9236 } 9237 EXPORT_SYMBOL_GPL(md_do_sync); 9238 remove_and_add_spares(struct mddev * mddev,struct md_rdev * this)9239 static int remove_and_add_spares(struct mddev *mddev, 9240 struct md_rdev *this) 9241 { 9242 struct md_rdev *rdev; 9243 int spares = 0; 9244 int removed = 0; 9245 bool remove_some = false; 9246 9247 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9248 /* Mustn't remove devices when resync thread is running */ 9249 return 0; 9250 9251 rdev_for_each(rdev, mddev) { 9252 if ((this == NULL || rdev == this) && 9253 rdev->raid_disk >= 0 && 9254 !test_bit(Blocked, &rdev->flags) && 9255 test_bit(Faulty, &rdev->flags) && 9256 atomic_read(&rdev->nr_pending)==0) { 9257 /* Faulty non-Blocked devices with nr_pending == 0 9258 * never get nr_pending incremented, 9259 * never get Faulty cleared, and never get Blocked set. 9260 * So we can synchronize_rcu now rather than once per device 9261 */ 9262 remove_some = true; 9263 set_bit(RemoveSynchronized, &rdev->flags); 9264 } 9265 } 9266 9267 if (remove_some) 9268 synchronize_rcu(); 9269 rdev_for_each(rdev, mddev) { 9270 if ((this == NULL || rdev == this) && 9271 rdev->raid_disk >= 0 && 9272 !test_bit(Blocked, &rdev->flags) && 9273 ((test_bit(RemoveSynchronized, &rdev->flags) || 9274 (!test_bit(In_sync, &rdev->flags) && 9275 !test_bit(Journal, &rdev->flags))) && 9276 atomic_read(&rdev->nr_pending)==0)) { 9277 if (mddev->pers->hot_remove_disk( 9278 mddev, rdev) == 0) { 9279 sysfs_unlink_rdev(mddev, rdev); 9280 rdev->saved_raid_disk = rdev->raid_disk; 9281 rdev->raid_disk = -1; 9282 removed++; 9283 } 9284 } 9285 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 9286 clear_bit(RemoveSynchronized, &rdev->flags); 9287 } 9288 9289 if (removed && mddev->kobj.sd) 9290 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9291 9292 if (this && removed) 9293 goto no_add; 9294 9295 rdev_for_each(rdev, mddev) { 9296 if (this && this != rdev) 9297 continue; 9298 if (test_bit(Candidate, &rdev->flags)) 9299 continue; 9300 if (rdev->raid_disk >= 0 && 9301 !test_bit(In_sync, &rdev->flags) && 9302 !test_bit(Journal, &rdev->flags) && 9303 !test_bit(Faulty, &rdev->flags)) 9304 spares++; 9305 if (rdev->raid_disk >= 0) 9306 continue; 9307 if (test_bit(Faulty, &rdev->flags)) 9308 continue; 9309 if (!test_bit(Journal, &rdev->flags)) { 9310 if (!md_is_rdwr(mddev) && 9311 !(rdev->saved_raid_disk >= 0 && 9312 !test_bit(Bitmap_sync, &rdev->flags))) 9313 continue; 9314 9315 rdev->recovery_offset = 0; 9316 } 9317 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9318 /* failure here is OK */ 9319 sysfs_link_rdev(mddev, rdev); 9320 if (!test_bit(Journal, &rdev->flags)) 9321 spares++; 9322 md_new_event(); 9323 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9324 } 9325 } 9326 no_add: 9327 if (removed) 9328 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9329 return spares; 9330 } 9331 md_start_sync(struct work_struct * ws)9332 static void md_start_sync(struct work_struct *ws) 9333 { 9334 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9335 9336 
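	/*
	 * Register the "resync" thread that will run md_do_sync(); if that
	 * fails, clear the recovery bits again below so the array does not
	 * appear to have a sync in progress.
	 */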
	rcu_assign_pointer(mddev->sync_thread,
			   md_register_thread(md_do_sync, mddev, "resync"));
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ If the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If the array is degraded, try to add spare devices.
 *  6/ If the array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (is_md_suspended(mddev))
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (!md_is_rdwr(mddev) &&
	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if ( !
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (!md_is_rdwr(mddev)) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev, *tmp;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each_safe(rdev, tmp, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
				    rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		if (try_set_sync && !mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		/*
		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
		 * still set.
		 */
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
			if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
				/* resync/recovery still happening */
				clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				goto unlock;
			}

			if (WARN_ON_ONCE(!mddev->sync_thread))
				goto unlock;

			md_reap_sync_thread(mddev);
			goto unlock;
		}

		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */
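		/*
		 * Decide which action to start: prefer a pending reshape,
		 * then recovery onto newly added spares, then a resync if
		 * the array is not known to be clean.
		 */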

		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				md_bitmap_write_all(mddev->bitmap);
			}
			queue_work(md_misc_wq, &mddev->sync_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);

void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(mddev, &mddev->sync_thread);
	atomic_inc(&mddev->sync_seq);

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If the array is no longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by personality module when reshape completes. */
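	/*
	 * Fold the difference between the old and new data offsets into
	 * each rdev's usable size, then commit the new offset.
	 */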
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	struct mddev *mddev = rdev->mddev;
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev, *n;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* wake up mddev->thread here, so the array can
				 * perform resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks, it is time to check the reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * reshape is happening in the remote node, we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* reshape is just done in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

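	/*
	 * The imported devices now sit on pending_raid_disks; let
	 * autorun_devices() group them by superblock and start the arrays.
	 */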
	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);