1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * raid10.c : Multiple Devices driver for Linux 4 * 5 * Copyright (C) 2000-2004 Neil Brown 6 * 7 * RAID-10 support for md. 8 * 9 * Base on code in raid1.c. See raid1.c for further copyright information. 10 */ 11 12 #include <linux/slab.h> 13 #include <linux/delay.h> 14 #include <linux/blkdev.h> 15 #include <linux/module.h> 16 #include <linux/seq_file.h> 17 #include <linux/ratelimit.h> 18 #include <linux/kthread.h> 19 #include <linux/raid/md_p.h> 20 #include <trace/events/block.h> 21 #include "md.h" 22 #include "raid10.h" 23 #include "raid0.h" 24 #include "md-bitmap.h" 25 26 /* 27 * RAID10 provides a combination of RAID0 and RAID1 functionality. 28 * The layout of data is defined by 29 * chunk_size 30 * raid_disks 31 * near_copies (stored in low byte of layout) 32 * far_copies (stored in second byte of layout) 33 * far_offset (stored in bit 16 of layout ) 34 * use_far_sets (stored in bit 17 of layout ) 35 * use_far_sets_bugfixed (stored in bit 18 of layout ) 36 * 37 * The data to be stored is divided into chunks using chunksize. Each device 38 * is divided into far_copies sections. In each section, chunks are laid out 39 * in a style similar to raid0, but near_copies copies of each chunk is stored 40 * (each on a different drive). The starting device for each section is offset 41 * near_copies from the starting device of the previous section. Thus there 42 * are (near_copies * far_copies) of each chunk, and each is on a different 43 * drive. near_copies and far_copies must be at least one, and their product 44 * is at most raid_disks. 45 * 46 * If far_offset is true, then the far_copies are handled a bit differently. 47 * The copies are still in different stripes, but instead of being very far 48 * apart on disk, there are adjacent stripes. 49 * 50 * The far and offset algorithms are handled slightly differently if 51 * 'use_far_sets' is true. 
In this case, the array's devices are grouped into 52 * sets that are (near_copies * far_copies) in size. The far copied stripes 53 * are still shifted by 'near_copies' devices, but this shifting stays confined 54 * to the set rather than the entire array. This is done to improve the number 55 * of device combinations that can fail without causing the array to fail. 56 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk 57 * on a device): 58 * A B C D A B C D E 59 * ... ... 60 * D A B C E A B C D 61 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): 62 * [A B] [C D] [A B] [C D E] 63 * |...| |...| |...| | ... | 64 * [B A] [D C] [B A] [E C D] 65 */ 66 67 static void allow_barrier(struct r10conf *conf); 68 static void lower_barrier(struct r10conf *conf); 69 static int _enough(struct r10conf *conf, int previous, int ignore); 70 static int enough(struct r10conf *conf, int ignore); 71 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 72 int *skipped); 73 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 74 static void end_reshape_write(struct bio *bio); 75 static void end_reshape(struct r10conf *conf); 76 77 #define raid10_log(md, fmt, args...) \ 78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) 79 80 #include "raid1-10.c" 81 82 /* 83 * for resync bio, r10bio pointer can be retrieved from the per-bio 84 * 'struct resync_pages'. 
85 */ 86 static inline struct r10bio *get_resync_r10bio(struct bio *bio) 87 { 88 return get_resync_pages(bio)->raid_bio; 89 } 90 91 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 92 { 93 struct r10conf *conf = data; 94 int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); 95 96 /* allocate a r10bio with room for raid_disks entries in the 97 * bios array */ 98 return kzalloc(size, gfp_flags); 99 } 100 101 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 102 /* amount of memory to reserve for resync requests */ 103 #define RESYNC_WINDOW (1024*1024) 104 /* maximum number of concurrent requests, memory permitting */ 105 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) 106 #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW) 107 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 108 109 /* 110 * When performing a resync, we need to read and compare, so 111 * we need as many pages are there are copies. 112 * When performing a recovery, we need 2 bios, one for read, 113 * one for write (we recover only one drive per r10buf) 114 * 115 */ 116 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 117 { 118 struct r10conf *conf = data; 119 struct r10bio *r10_bio; 120 struct bio *bio; 121 int j; 122 int nalloc, nalloc_rp; 123 struct resync_pages *rps; 124 125 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 126 if (!r10_bio) 127 return NULL; 128 129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 131 nalloc = conf->copies; /* resync */ 132 else 133 nalloc = 2; /* recovery */ 134 135 /* allocate once for all bios */ 136 if (!conf->have_replacement) 137 nalloc_rp = nalloc; 138 else 139 nalloc_rp = nalloc * 2; 140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags); 141 if (!rps) 142 goto out_free_r10bio; 143 144 /* 145 * Allocate bios. 
146 */ 147 for (j = nalloc ; j-- ; ) { 148 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 149 if (!bio) 150 goto out_free_bio; 151 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 152 r10_bio->devs[j].bio = bio; 153 if (!conf->have_replacement) 154 continue; 155 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 156 if (!bio) 157 goto out_free_bio; 158 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 159 r10_bio->devs[j].repl_bio = bio; 160 } 161 /* 162 * Allocate RESYNC_PAGES data pages and attach them 163 * where needed. 164 */ 165 for (j = 0; j < nalloc; j++) { 166 struct bio *rbio = r10_bio->devs[j].repl_bio; 167 struct resync_pages *rp, *rp_repl; 168 169 rp = &rps[j]; 170 if (rbio) 171 rp_repl = &rps[nalloc + j]; 172 173 bio = r10_bio->devs[j].bio; 174 175 if (!j || test_bit(MD_RECOVERY_SYNC, 176 &conf->mddev->recovery)) { 177 if (resync_alloc_pages(rp, gfp_flags)) 178 goto out_free_pages; 179 } else { 180 memcpy(rp, &rps[0], sizeof(*rp)); 181 resync_get_all_pages(rp); 182 } 183 184 rp->raid_bio = r10_bio; 185 bio->bi_private = rp; 186 if (rbio) { 187 memcpy(rp_repl, rp, sizeof(*rp)); 188 rbio->bi_private = rp_repl; 189 } 190 } 191 192 return r10_bio; 193 194 out_free_pages: 195 while (--j >= 0) 196 resync_free_pages(&rps[j]); 197 198 j = 0; 199 out_free_bio: 200 for ( ; j < nalloc; j++) { 201 if (r10_bio->devs[j].bio) 202 bio_uninit(r10_bio->devs[j].bio); 203 kfree(r10_bio->devs[j].bio); 204 if (r10_bio->devs[j].repl_bio) 205 bio_uninit(r10_bio->devs[j].repl_bio); 206 kfree(r10_bio->devs[j].repl_bio); 207 } 208 kfree(rps); 209 out_free_r10bio: 210 rbio_pool_free(r10_bio, conf); 211 return NULL; 212 } 213 214 static void r10buf_pool_free(void *__r10_bio, void *data) 215 { 216 struct r10conf *conf = data; 217 struct r10bio *r10bio = __r10_bio; 218 int j; 219 struct resync_pages *rp = NULL; 220 221 for (j = conf->copies; j--; ) { 222 struct bio *bio = r10bio->devs[j].bio; 223 224 if (bio) { 225 rp = get_resync_pages(bio); 226 resync_free_pages(rp); 227 
bio_uninit(bio); 228 kfree(bio); 229 } 230 231 bio = r10bio->devs[j].repl_bio; 232 if (bio) { 233 bio_uninit(bio); 234 kfree(bio); 235 } 236 } 237 238 /* resync pages array stored in the 1st bio's .bi_private */ 239 kfree(rp); 240 241 rbio_pool_free(r10bio, conf); 242 } 243 244 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 245 { 246 int i; 247 248 for (i = 0; i < conf->geo.raid_disks; i++) { 249 struct bio **bio = & r10_bio->devs[i].bio; 250 if (!BIO_SPECIAL(*bio)) 251 bio_put(*bio); 252 *bio = NULL; 253 bio = &r10_bio->devs[i].repl_bio; 254 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) 255 bio_put(*bio); 256 *bio = NULL; 257 } 258 } 259 260 static void free_r10bio(struct r10bio *r10_bio) 261 { 262 struct r10conf *conf = r10_bio->mddev->private; 263 264 put_all_bios(conf, r10_bio); 265 mempool_free(r10_bio, &conf->r10bio_pool); 266 } 267 268 static void put_buf(struct r10bio *r10_bio) 269 { 270 struct r10conf *conf = r10_bio->mddev->private; 271 272 mempool_free(r10_bio, &conf->r10buf_pool); 273 274 lower_barrier(conf); 275 } 276 277 static void reschedule_retry(struct r10bio *r10_bio) 278 { 279 unsigned long flags; 280 struct mddev *mddev = r10_bio->mddev; 281 struct r10conf *conf = mddev->private; 282 283 spin_lock_irqsave(&conf->device_lock, flags); 284 list_add(&r10_bio->retry_list, &conf->retry_list); 285 conf->nr_queued ++; 286 spin_unlock_irqrestore(&conf->device_lock, flags); 287 288 /* wake up frozen array... */ 289 wake_up(&conf->wait_barrier); 290 291 md_wakeup_thread(mddev->thread); 292 } 293 294 /* 295 * raid_end_bio_io() is called when we have finished servicing a mirrored 296 * operation and are ready to return a success/failure code to the buffer 297 * cache layer. 
298 */ 299 static void raid_end_bio_io(struct r10bio *r10_bio) 300 { 301 struct bio *bio = r10_bio->master_bio; 302 struct r10conf *conf = r10_bio->mddev->private; 303 304 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 305 bio->bi_status = BLK_STS_IOERR; 306 307 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 308 bio_end_io_acct(bio, r10_bio->start_time); 309 bio_endio(bio); 310 /* 311 * Wake up any possible resync thread that waits for the device 312 * to go idle. 313 */ 314 allow_barrier(conf); 315 316 free_r10bio(r10_bio); 317 } 318 319 /* 320 * Update disk head position estimator based on IRQ completion info. 321 */ 322 static inline void update_head_pos(int slot, struct r10bio *r10_bio) 323 { 324 struct r10conf *conf = r10_bio->mddev->private; 325 326 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 327 r10_bio->devs[slot].addr + (r10_bio->sectors); 328 } 329 330 /* 331 * Find the disk number which triggered given bio 332 */ 333 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 334 struct bio *bio, int *slotp, int *replp) 335 { 336 int slot; 337 int repl = 0; 338 339 for (slot = 0; slot < conf->geo.raid_disks; slot++) { 340 if (r10_bio->devs[slot].bio == bio) 341 break; 342 if (r10_bio->devs[slot].repl_bio == bio) { 343 repl = 1; 344 break; 345 } 346 } 347 348 update_head_pos(slot, r10_bio); 349 350 if (slotp) 351 *slotp = slot; 352 if (replp) 353 *replp = repl; 354 return r10_bio->devs[slot].devnum; 355 } 356 357 static void raid10_end_read_request(struct bio *bio) 358 { 359 int uptodate = !bio->bi_status; 360 struct r10bio *r10_bio = bio->bi_private; 361 int slot; 362 struct md_rdev *rdev; 363 struct r10conf *conf = r10_bio->mddev->private; 364 365 slot = r10_bio->read_slot; 366 rdev = r10_bio->devs[slot].rdev; 367 /* 368 * this branch is our 'one mirror IO has finished' event handler: 369 */ 370 update_head_pos(slot, r10_bio); 371 372 if (uptodate) { 373 /* 374 * Set R10BIO_Uptodate in our master bio, so that 375 * we 
will return a good error code to the higher 376 * levels even if IO on some other mirrored buffer fails. 377 * 378 * The 'master' represents the composite IO operation to 379 * user-side. So if something waits for IO, then it will 380 * wait for the 'master' bio. 381 */ 382 set_bit(R10BIO_Uptodate, &r10_bio->state); 383 } else { 384 /* If all other devices that store this block have 385 * failed, we want to return the error upwards rather 386 * than fail the last device. Here we redefine 387 * "uptodate" to mean "Don't want to retry" 388 */ 389 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), 390 rdev->raid_disk)) 391 uptodate = 1; 392 } 393 if (uptodate) { 394 raid_end_bio_io(r10_bio); 395 rdev_dec_pending(rdev, conf->mddev); 396 } else { 397 /* 398 * oops, read error - keep the refcount on the rdev 399 */ 400 pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n", 401 mdname(conf->mddev), 402 rdev->bdev, 403 (unsigned long long)r10_bio->sector); 404 set_bit(R10BIO_ReadError, &r10_bio->state); 405 reschedule_retry(r10_bio); 406 } 407 } 408 409 static void close_write(struct r10bio *r10_bio) 410 { 411 /* clear the bitmap if all writes complete successfully */ 412 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 413 r10_bio->sectors, 414 !test_bit(R10BIO_Degraded, &r10_bio->state), 415 0); 416 md_write_end(r10_bio->mddev); 417 } 418 419 static void one_write_done(struct r10bio *r10_bio) 420 { 421 if (atomic_dec_and_test(&r10_bio->remaining)) { 422 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 423 reschedule_retry(r10_bio); 424 else { 425 close_write(r10_bio); 426 if (test_bit(R10BIO_MadeGood, &r10_bio->state)) 427 reschedule_retry(r10_bio); 428 else 429 raid_end_bio_io(r10_bio); 430 } 431 } 432 } 433 434 static void raid10_end_write_request(struct bio *bio) 435 { 436 struct r10bio *r10_bio = bio->bi_private; 437 int dev; 438 int dec_rdev = 1; 439 struct r10conf *conf = r10_bio->mddev->private; 440 int slot, repl; 441 struct 
md_rdev *rdev = NULL; 442 struct bio *to_put = NULL; 443 bool discard_error; 444 445 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; 446 447 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 448 449 if (repl) 450 rdev = conf->mirrors[dev].replacement; 451 if (!rdev) { 452 smp_rmb(); 453 repl = 0; 454 rdev = conf->mirrors[dev].rdev; 455 } 456 /* 457 * this branch is our 'one mirror IO has finished' event handler: 458 */ 459 if (bio->bi_status && !discard_error) { 460 if (repl) 461 /* Never record new bad blocks to replacement, 462 * just fail it. 463 */ 464 md_error(rdev->mddev, rdev); 465 else { 466 set_bit(WriteErrorSeen, &rdev->flags); 467 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 468 set_bit(MD_RECOVERY_NEEDED, 469 &rdev->mddev->recovery); 470 471 dec_rdev = 0; 472 if (test_bit(FailFast, &rdev->flags) && 473 (bio->bi_opf & MD_FAILFAST)) { 474 md_error(rdev->mddev, rdev); 475 } 476 477 /* 478 * When the device is faulty, it is not necessary to 479 * handle write error. 480 */ 481 if (!test_bit(Faulty, &rdev->flags)) 482 set_bit(R10BIO_WriteError, &r10_bio->state); 483 else { 484 /* Fail the request */ 485 set_bit(R10BIO_Degraded, &r10_bio->state); 486 r10_bio->devs[slot].bio = NULL; 487 to_put = bio; 488 dec_rdev = 1; 489 } 490 } 491 } else { 492 /* 493 * Set R10BIO_Uptodate in our master bio, so that 494 * we will return a good error code for to the higher 495 * levels even if IO on some other mirrored buffer fails. 496 * 497 * The 'master' represents the composite IO operation to 498 * user-side. So if something waits for IO, then it will 499 * wait for the 'master' bio. 500 */ 501 sector_t first_bad; 502 int bad_sectors; 503 504 /* 505 * Do not set R10BIO_Uptodate if the current device is 506 * rebuilding or Faulty. 
This is because we cannot use 507 * such device for properly reading the data back (we could 508 * potentially use it, if the current write would have felt 509 * before rdev->recovery_offset, but for simplicity we don't 510 * check this here. 511 */ 512 if (test_bit(In_sync, &rdev->flags) && 513 !test_bit(Faulty, &rdev->flags)) 514 set_bit(R10BIO_Uptodate, &r10_bio->state); 515 516 /* Maybe we can clear some bad blocks. */ 517 if (is_badblock(rdev, 518 r10_bio->devs[slot].addr, 519 r10_bio->sectors, 520 &first_bad, &bad_sectors) && !discard_error) { 521 bio_put(bio); 522 if (repl) 523 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; 524 else 525 r10_bio->devs[slot].bio = IO_MADE_GOOD; 526 dec_rdev = 0; 527 set_bit(R10BIO_MadeGood, &r10_bio->state); 528 } 529 } 530 531 /* 532 * 533 * Let's see if all mirrored write operations have finished 534 * already. 535 */ 536 one_write_done(r10_bio); 537 if (dec_rdev) 538 rdev_dec_pending(rdev, conf->mddev); 539 if (to_put) 540 bio_put(to_put); 541 } 542 543 /* 544 * RAID10 layout manager 545 * As well as the chunksize and raid_disks count, there are two 546 * parameters: near_copies and far_copies. 547 * near_copies * far_copies must be <= raid_disks. 548 * Normally one of these will be 1. 549 * If both are 1, we get raid0. 550 * If near_copies == raid_disks, we get raid1. 551 * 552 * Chunks are laid out in raid0 style with near_copies copies of the 553 * first chunk, followed by near_copies copies of the next chunk and 554 * so on. 555 * If far_copies > 1, then after 1/far_copies of the array has been assigned 556 * as described above, we start again with a device offset of near_copies. 557 * So we effectively have another copy of the whole array further down all 558 * the drives, but with blocks on different drives. 559 * With this layout, and block is never stored twice on the one device. 560 * 561 * raid10_find_phys finds the sector offset of a given virtual sector 562 * on each device that it is on. 
563 * 564 * raid10_find_virt does the reverse mapping, from a device and a 565 * sector offset to a virtual address 566 */ 567 568 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 569 { 570 int n,f; 571 sector_t sector; 572 sector_t chunk; 573 sector_t stripe; 574 int dev; 575 int slot = 0; 576 int last_far_set_start, last_far_set_size; 577 578 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 579 last_far_set_start *= geo->far_set_size; 580 581 last_far_set_size = geo->far_set_size; 582 last_far_set_size += (geo->raid_disks % geo->far_set_size); 583 584 /* now calculate first sector/dev */ 585 chunk = r10bio->sector >> geo->chunk_shift; 586 sector = r10bio->sector & geo->chunk_mask; 587 588 chunk *= geo->near_copies; 589 stripe = chunk; 590 dev = sector_div(stripe, geo->raid_disks); 591 if (geo->far_offset) 592 stripe *= geo->far_copies; 593 594 sector += stripe << geo->chunk_shift; 595 596 /* and calculate all the others */ 597 for (n = 0; n < geo->near_copies; n++) { 598 int d = dev; 599 int set; 600 sector_t s = sector; 601 r10bio->devs[slot].devnum = d; 602 r10bio->devs[slot].addr = s; 603 slot++; 604 605 for (f = 1; f < geo->far_copies; f++) { 606 set = d / geo->far_set_size; 607 d += geo->near_copies; 608 609 if ((geo->raid_disks % geo->far_set_size) && 610 (d > last_far_set_start)) { 611 d -= last_far_set_start; 612 d %= last_far_set_size; 613 d += last_far_set_start; 614 } else { 615 d %= geo->far_set_size; 616 d += geo->far_set_size * set; 617 } 618 s += geo->stride; 619 r10bio->devs[slot].devnum = d; 620 r10bio->devs[slot].addr = s; 621 slot++; 622 } 623 dev++; 624 if (dev >= geo->raid_disks) { 625 dev = 0; 626 sector += (geo->chunk_mask + 1); 627 } 628 } 629 } 630 631 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 632 { 633 struct geom *geo = &conf->geo; 634 635 if (conf->reshape_progress != MaxSector && 636 ((r10bio->sector >= conf->reshape_progress) != 637 conf->mddev->reshape_backwards)) 
{ 638 set_bit(R10BIO_Previous, &r10bio->state); 639 geo = &conf->prev; 640 } else 641 clear_bit(R10BIO_Previous, &r10bio->state); 642 643 __raid10_find_phys(geo, r10bio); 644 } 645 646 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 647 { 648 sector_t offset, chunk, vchunk; 649 /* Never use conf->prev as this is only called during resync 650 * or recovery, so reshape isn't happening 651 */ 652 struct geom *geo = &conf->geo; 653 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; 654 int far_set_size = geo->far_set_size; 655 int last_far_set_start; 656 657 if (geo->raid_disks % geo->far_set_size) { 658 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 659 last_far_set_start *= geo->far_set_size; 660 661 if (dev >= last_far_set_start) { 662 far_set_size = geo->far_set_size; 663 far_set_size += (geo->raid_disks % geo->far_set_size); 664 far_set_start = last_far_set_start; 665 } 666 } 667 668 offset = sector & geo->chunk_mask; 669 if (geo->far_offset) { 670 int fc; 671 chunk = sector >> geo->chunk_shift; 672 fc = sector_div(chunk, geo->far_copies); 673 dev -= fc * geo->near_copies; 674 if (dev < far_set_start) 675 dev += far_set_size; 676 } else { 677 while (sector >= geo->stride) { 678 sector -= geo->stride; 679 if (dev < (geo->near_copies + far_set_start)) 680 dev += far_set_size - geo->near_copies; 681 else 682 dev -= geo->near_copies; 683 } 684 chunk = sector >> geo->chunk_shift; 685 } 686 vchunk = chunk * geo->raid_disks + dev; 687 sector_div(vchunk, geo->near_copies); 688 return (vchunk << geo->chunk_shift) + offset; 689 } 690 691 /* 692 * This routine returns the disk from which the requested read should 693 * be done. There is a per-array 'next expected sequential IO' sector 694 * number - if this matches on the next IO then we use the last disk. 
695 * There is also a per-disk 'last know head position' sector that is 696 * maintained from IRQ contexts, both the normal and the resync IO 697 * completion handlers update this position correctly. If there is no 698 * perfect sequential match then we pick the disk whose head is closest. 699 * 700 * If there are 2 mirrors in the same 2 devices, performance degrades 701 * because position is mirror, not device based. 702 * 703 * The rdev for the device selected will have nr_pending incremented. 704 */ 705 706 /* 707 * FIXME: possibly should rethink readbalancing and do it differently 708 * depending on near_copies / far_copies geometry. 709 */ 710 static struct md_rdev *read_balance(struct r10conf *conf, 711 struct r10bio *r10_bio, 712 int *max_sectors) 713 { 714 const sector_t this_sector = r10_bio->sector; 715 int disk, slot; 716 int sectors = r10_bio->sectors; 717 int best_good_sectors; 718 sector_t new_distance, best_dist; 719 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; 720 int do_balance; 721 int best_dist_slot, best_pending_slot; 722 bool has_nonrot_disk = false; 723 unsigned int min_pending; 724 struct geom *geo = &conf->geo; 725 726 raid10_find_phys(conf, r10_bio); 727 rcu_read_lock(); 728 best_dist_slot = -1; 729 min_pending = UINT_MAX; 730 best_dist_rdev = NULL; 731 best_pending_rdev = NULL; 732 best_dist = MaxSector; 733 best_good_sectors = 0; 734 do_balance = 1; 735 clear_bit(R10BIO_FailFast, &r10_bio->state); 736 /* 737 * Check if we can balance. We can balance on the whole 738 * device if no resync is going on (recovery is ok), or below 739 * the resync window. We take the first readable disk when 740 * above the resync window. 
741 */ 742 if ((conf->mddev->recovery_cp < MaxSector 743 && (this_sector + sectors >= conf->next_resync)) || 744 (mddev_is_clustered(conf->mddev) && 745 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 746 this_sector + sectors))) 747 do_balance = 0; 748 749 for (slot = 0; slot < conf->copies ; slot++) { 750 sector_t first_bad; 751 int bad_sectors; 752 sector_t dev_sector; 753 unsigned int pending; 754 bool nonrot; 755 756 if (r10_bio->devs[slot].bio == IO_BLOCKED) 757 continue; 758 disk = r10_bio->devs[slot].devnum; 759 rdev = rcu_dereference(conf->mirrors[disk].replacement); 760 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 761 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 762 rdev = rcu_dereference(conf->mirrors[disk].rdev); 763 if (rdev == NULL || 764 test_bit(Faulty, &rdev->flags)) 765 continue; 766 if (!test_bit(In_sync, &rdev->flags) && 767 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 768 continue; 769 770 dev_sector = r10_bio->devs[slot].addr; 771 if (is_badblock(rdev, dev_sector, sectors, 772 &first_bad, &bad_sectors)) { 773 if (best_dist < MaxSector) 774 /* Already have a better slot */ 775 continue; 776 if (first_bad <= dev_sector) { 777 /* Cannot read here. If this is the 778 * 'primary' device, then we must not read 779 * beyond 'bad_sectors' from another device. 
780 */ 781 bad_sectors -= (dev_sector - first_bad); 782 if (!do_balance && sectors > bad_sectors) 783 sectors = bad_sectors; 784 if (best_good_sectors > sectors) 785 best_good_sectors = sectors; 786 } else { 787 sector_t good_sectors = 788 first_bad - dev_sector; 789 if (good_sectors > best_good_sectors) { 790 best_good_sectors = good_sectors; 791 best_dist_slot = slot; 792 best_dist_rdev = rdev; 793 } 794 if (!do_balance) 795 /* Must read from here */ 796 break; 797 } 798 continue; 799 } else 800 best_good_sectors = sectors; 801 802 if (!do_balance) 803 break; 804 805 nonrot = bdev_nonrot(rdev->bdev); 806 has_nonrot_disk |= nonrot; 807 pending = atomic_read(&rdev->nr_pending); 808 if (min_pending > pending && nonrot) { 809 min_pending = pending; 810 best_pending_slot = slot; 811 best_pending_rdev = rdev; 812 } 813 814 if (best_dist_slot >= 0) 815 /* At least 2 disks to choose from so failfast is OK */ 816 set_bit(R10BIO_FailFast, &r10_bio->state); 817 /* This optimisation is debatable, and completely destroys 818 * sequential read speed for 'far copies' arrays. So only 819 * keep it for 'near' arrays, and review those later. 
820 */ 821 if (geo->near_copies > 1 && !pending) 822 new_distance = 0; 823 824 /* for far > 1 always use the lowest address */ 825 else if (geo->far_copies > 1) 826 new_distance = r10_bio->devs[slot].addr; 827 else 828 new_distance = abs(r10_bio->devs[slot].addr - 829 conf->mirrors[disk].head_position); 830 831 if (new_distance < best_dist) { 832 best_dist = new_distance; 833 best_dist_slot = slot; 834 best_dist_rdev = rdev; 835 } 836 } 837 if (slot >= conf->copies) { 838 if (has_nonrot_disk) { 839 slot = best_pending_slot; 840 rdev = best_pending_rdev; 841 } else { 842 slot = best_dist_slot; 843 rdev = best_dist_rdev; 844 } 845 } 846 847 if (slot >= 0) { 848 atomic_inc(&rdev->nr_pending); 849 r10_bio->read_slot = slot; 850 } else 851 rdev = NULL; 852 rcu_read_unlock(); 853 *max_sectors = best_good_sectors; 854 855 return rdev; 856 } 857 858 static void flush_pending_writes(struct r10conf *conf) 859 { 860 /* Any writes that have been queued but are awaiting 861 * bitmap updates get flushed here. 862 */ 863 spin_lock_irq(&conf->device_lock); 864 865 if (conf->pending_bio_list.head) { 866 struct blk_plug plug; 867 struct bio *bio; 868 869 bio = bio_list_get(&conf->pending_bio_list); 870 spin_unlock_irq(&conf->device_lock); 871 872 /* 873 * As this is called in a wait_event() loop (see freeze_array), 874 * current->state might be TASK_UNINTERRUPTIBLE which will 875 * cause a warning when we prepare to wait again. As it is 876 * rare that this path is taken, it is perfectly safe to force 877 * us to go around the wait_event() loop again, so the warning 878 * is a false-positive. 
Silence the warning by resetting 879 * thread state 880 */ 881 __set_current_state(TASK_RUNNING); 882 883 blk_start_plug(&plug); 884 /* flush any pending bitmap writes to disk 885 * before proceeding w/ I/O */ 886 md_bitmap_unplug(conf->mddev->bitmap); 887 wake_up(&conf->wait_barrier); 888 889 while (bio) { /* submit pending writes */ 890 struct bio *next = bio->bi_next; 891 struct md_rdev *rdev = (void*)bio->bi_bdev; 892 bio->bi_next = NULL; 893 bio_set_dev(bio, rdev->bdev); 894 if (test_bit(Faulty, &rdev->flags)) { 895 bio_io_error(bio); 896 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 897 !bdev_max_discard_sectors(bio->bi_bdev))) 898 /* Just ignore it */ 899 bio_endio(bio); 900 else 901 submit_bio_noacct(bio); 902 bio = next; 903 } 904 blk_finish_plug(&plug); 905 } else 906 spin_unlock_irq(&conf->device_lock); 907 } 908 909 /* Barriers.... 910 * Sometimes we need to suspend IO while we do something else, 911 * either some resync/recovery, or reconfigure the array. 912 * To do this we raise a 'barrier'. 913 * The 'barrier' is a counter that can be raised multiple times 914 * to count how many activities are happening which preclude 915 * normal IO. 916 * We can only raise the barrier if there is no pending IO. 917 * i.e. if nr_pending == 0. 918 * We choose only to raise the barrier if no-one is waiting for the 919 * barrier to go down. This means that as soon as an IO request 920 * is ready, no other operations which require a barrier will start 921 * until the IO request has had a chance. 922 * 923 * So: regular IO calls 'wait_barrier'. When that returns there 924 * is no backgroup IO happening, It must arrange to call 925 * allow_barrier when it has finished its IO. 926 * backgroup IO calls must call raise_barrier. Once that returns 927 * there is no normal IO happeing. It must arrange to call 928 * lower_barrier when the particular background IO completes. 
929 */ 930 931 static void raise_barrier(struct r10conf *conf, int force) 932 { 933 BUG_ON(force && !conf->barrier); 934 spin_lock_irq(&conf->resync_lock); 935 936 /* Wait until no block IO is waiting (unless 'force') */ 937 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 938 conf->resync_lock); 939 940 /* block any new IO from starting */ 941 conf->barrier++; 942 943 /* Now wait for all pending IO to complete */ 944 wait_event_lock_irq(conf->wait_barrier, 945 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, 946 conf->resync_lock); 947 948 spin_unlock_irq(&conf->resync_lock); 949 } 950 951 static void lower_barrier(struct r10conf *conf) 952 { 953 unsigned long flags; 954 spin_lock_irqsave(&conf->resync_lock, flags); 955 conf->barrier--; 956 spin_unlock_irqrestore(&conf->resync_lock, flags); 957 wake_up(&conf->wait_barrier); 958 } 959 960 static bool wait_barrier(struct r10conf *conf, bool nowait) 961 { 962 bool ret = true; 963 964 spin_lock_irq(&conf->resync_lock); 965 if (conf->barrier) { 966 struct bio_list *bio_list = current->bio_list; 967 conf->nr_waiting++; 968 /* Wait for the barrier to drop. 969 * However if there are already pending 970 * requests (preventing the barrier from 971 * rising completely), and the 972 * pre-process bio queue isn't empty, 973 * then don't wait, as we need to empty 974 * that queue to get the nr_pending 975 * count down. 
976 */ 977 /* Return false when nowait flag is set */ 978 if (nowait) { 979 ret = false; 980 } else { 981 raid10_log(conf->mddev, "wait barrier"); 982 wait_event_lock_irq(conf->wait_barrier, 983 !conf->barrier || 984 (atomic_read(&conf->nr_pending) && 985 bio_list && 986 (!bio_list_empty(&bio_list[0]) || 987 !bio_list_empty(&bio_list[1]))) || 988 /* move on if recovery thread is 989 * blocked by us 990 */ 991 (conf->mddev->thread->tsk == current && 992 test_bit(MD_RECOVERY_RUNNING, 993 &conf->mddev->recovery) && 994 conf->nr_queued > 0), 995 conf->resync_lock); 996 } 997 conf->nr_waiting--; 998 if (!conf->nr_waiting) 999 wake_up(&conf->wait_barrier); 1000 } 1001 /* Only increment nr_pending when we wait */ 1002 if (ret) 1003 atomic_inc(&conf->nr_pending); 1004 spin_unlock_irq(&conf->resync_lock); 1005 return ret; 1006 } 1007 1008 static void allow_barrier(struct r10conf *conf) 1009 { 1010 if ((atomic_dec_and_test(&conf->nr_pending)) || 1011 (conf->array_freeze_pending)) 1012 wake_up(&conf->wait_barrier); 1013 } 1014 1015 static void freeze_array(struct r10conf *conf, int extra) 1016 { 1017 /* stop syncio and normal IO and wait for everything to 1018 * go quiet. 1019 * We increment barrier and nr_waiting, and then 1020 * wait until nr_pending match nr_queued+extra 1021 * This is called in the context of one normal IO request 1022 * that has failed. Thus any sync request that might be pending 1023 * will be blocked by nr_pending, and we need to wait for 1024 * pending IO requests to complete or be queued for re-try. 1025 * Thus the number queued (nr_queued) plus this request (extra) 1026 * must match the number of pending IOs (nr_pending) before 1027 * we continue. 
1028 */ 1029 spin_lock_irq(&conf->resync_lock); 1030 conf->array_freeze_pending++; 1031 conf->barrier++; 1032 conf->nr_waiting++; 1033 wait_event_lock_irq_cmd(conf->wait_barrier, 1034 atomic_read(&conf->nr_pending) == conf->nr_queued+extra, 1035 conf->resync_lock, 1036 flush_pending_writes(conf)); 1037 1038 conf->array_freeze_pending--; 1039 spin_unlock_irq(&conf->resync_lock); 1040 } 1041 1042 static void unfreeze_array(struct r10conf *conf) 1043 { 1044 /* reverse the effect of the freeze */ 1045 spin_lock_irq(&conf->resync_lock); 1046 conf->barrier--; 1047 conf->nr_waiting--; 1048 wake_up(&conf->wait_barrier); 1049 spin_unlock_irq(&conf->resync_lock); 1050 } 1051 1052 static sector_t choose_data_offset(struct r10bio *r10_bio, 1053 struct md_rdev *rdev) 1054 { 1055 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || 1056 test_bit(R10BIO_Previous, &r10_bio->state)) 1057 return rdev->data_offset; 1058 else 1059 return rdev->new_data_offset; 1060 } 1061 1062 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) 1063 { 1064 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb); 1065 struct mddev *mddev = plug->cb.data; 1066 struct r10conf *conf = mddev->private; 1067 struct bio *bio; 1068 1069 if (from_schedule || current->bio_list) { 1070 spin_lock_irq(&conf->device_lock); 1071 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1072 spin_unlock_irq(&conf->device_lock); 1073 wake_up(&conf->wait_barrier); 1074 md_wakeup_thread(mddev->thread); 1075 kfree(plug); 1076 return; 1077 } 1078 1079 /* we aren't scheduling, so we can do the write-out directly. 
*/ 1080 bio = bio_list_get(&plug->pending); 1081 md_bitmap_unplug(mddev->bitmap); 1082 wake_up(&conf->wait_barrier); 1083 1084 while (bio) { /* submit pending writes */ 1085 struct bio *next = bio->bi_next; 1086 struct md_rdev *rdev = (void*)bio->bi_bdev; 1087 bio->bi_next = NULL; 1088 bio_set_dev(bio, rdev->bdev); 1089 if (test_bit(Faulty, &rdev->flags)) { 1090 bio_io_error(bio); 1091 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 1092 !bdev_max_discard_sectors(bio->bi_bdev))) 1093 /* Just ignore it */ 1094 bio_endio(bio); 1095 else 1096 submit_bio_noacct(bio); 1097 bio = next; 1098 } 1099 kfree(plug); 1100 } 1101 1102 /* 1103 * 1. Register the new request and wait if the reconstruction thread has put 1104 * up a bar for new requests. Continue immediately if no resync is active 1105 * currently. 1106 * 2. If IO spans the reshape position. Need to wait for reshape to pass. 1107 */ 1108 static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, 1109 struct bio *bio, sector_t sectors) 1110 { 1111 /* Bail out if REQ_NOWAIT is set for the bio */ 1112 if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { 1113 bio_wouldblock_error(bio); 1114 return false; 1115 } 1116 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1117 bio->bi_iter.bi_sector < conf->reshape_progress && 1118 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { 1119 allow_barrier(conf); 1120 if (bio->bi_opf & REQ_NOWAIT) { 1121 bio_wouldblock_error(bio); 1122 return false; 1123 } 1124 raid10_log(conf->mddev, "wait reshape"); 1125 wait_event(conf->wait_barrier, 1126 conf->reshape_progress <= bio->bi_iter.bi_sector || 1127 conf->reshape_progress >= bio->bi_iter.bi_sector + 1128 sectors); 1129 wait_barrier(conf, false); 1130 } 1131 return true; 1132 } 1133 1134 static void raid10_read_request(struct mddev *mddev, struct bio *bio, 1135 struct r10bio *r10_bio) 1136 { 1137 struct r10conf *conf = mddev->private; 1138 struct bio *read_bio; 1139 const enum req_op op = 
bio_op(bio); 1140 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1141 int max_sectors; 1142 struct md_rdev *rdev; 1143 char b[BDEVNAME_SIZE]; 1144 int slot = r10_bio->read_slot; 1145 struct md_rdev *err_rdev = NULL; 1146 gfp_t gfp = GFP_NOIO; 1147 1148 if (slot >= 0 && r10_bio->devs[slot].rdev) { 1149 /* 1150 * This is an error retry, but we cannot 1151 * safely dereference the rdev in the r10_bio, 1152 * we must use the one in conf. 1153 * If it has already been disconnected (unlikely) 1154 * we lose the device name in error messages. 1155 */ 1156 int disk; 1157 /* 1158 * As we are blocking raid10, it is a little safer to 1159 * use __GFP_HIGH. 1160 */ 1161 gfp = GFP_NOIO | __GFP_HIGH; 1162 1163 rcu_read_lock(); 1164 disk = r10_bio->devs[slot].devnum; 1165 err_rdev = rcu_dereference(conf->mirrors[disk].rdev); 1166 if (err_rdev) 1167 snprintf(b, sizeof(b), "%pg", err_rdev->bdev); 1168 else { 1169 strcpy(b, "???"); 1170 /* This never gets dereferenced */ 1171 err_rdev = r10_bio->devs[slot].rdev; 1172 } 1173 rcu_read_unlock(); 1174 } 1175 1176 if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) 1177 return; 1178 rdev = read_balance(conf, r10_bio, &max_sectors); 1179 if (!rdev) { 1180 if (err_rdev) { 1181 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", 1182 mdname(mddev), b, 1183 (unsigned long long)r10_bio->sector); 1184 } 1185 raid_end_bio_io(r10_bio); 1186 return; 1187 } 1188 if (err_rdev) 1189 pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", 1190 mdname(mddev), 1191 rdev->bdev, 1192 (unsigned long long)r10_bio->sector); 1193 if (max_sectors < bio_sectors(bio)) { 1194 struct bio *split = bio_split(bio, max_sectors, 1195 gfp, &conf->bio_split); 1196 bio_chain(split, bio); 1197 allow_barrier(conf); 1198 submit_bio_noacct(bio); 1199 wait_barrier(conf, false); 1200 bio = split; 1201 r10_bio->master_bio = bio; 1202 r10_bio->sectors = max_sectors; 1203 } 1204 slot = 
r10_bio->read_slot; 1205 1206 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1207 r10_bio->start_time = bio_start_io_acct(bio); 1208 read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 1209 1210 r10_bio->devs[slot].bio = read_bio; 1211 r10_bio->devs[slot].rdev = rdev; 1212 1213 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + 1214 choose_data_offset(r10_bio, rdev); 1215 read_bio->bi_end_io = raid10_end_read_request; 1216 bio_set_op_attrs(read_bio, op, do_sync); 1217 if (test_bit(FailFast, &rdev->flags) && 1218 test_bit(R10BIO_FailFast, &r10_bio->state)) 1219 read_bio->bi_opf |= MD_FAILFAST; 1220 read_bio->bi_private = r10_bio; 1221 1222 if (mddev->gendisk) 1223 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1224 r10_bio->sector); 1225 submit_bio_noacct(read_bio); 1226 return; 1227 } 1228 1229 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, 1230 struct bio *bio, bool replacement, 1231 int n_copy) 1232 { 1233 const enum req_op op = bio_op(bio); 1234 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1235 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1236 unsigned long flags; 1237 struct blk_plug_cb *cb; 1238 struct raid1_plug_cb *plug = NULL; 1239 struct r10conf *conf = mddev->private; 1240 struct md_rdev *rdev; 1241 int devnum = r10_bio->devs[n_copy].devnum; 1242 struct bio *mbio; 1243 1244 if (replacement) { 1245 rdev = conf->mirrors[devnum].replacement; 1246 if (rdev == NULL) { 1247 /* Replacement just got moved to main 'rdev' */ 1248 smp_mb(); 1249 rdev = conf->mirrors[devnum].rdev; 1250 } 1251 } else 1252 rdev = conf->mirrors[devnum].rdev; 1253 1254 mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); 1255 if (replacement) 1256 r10_bio->devs[n_copy].repl_bio = mbio; 1257 else 1258 r10_bio->devs[n_copy].bio = mbio; 1259 1260 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1261 choose_data_offset(r10_bio, rdev)); 1262 mbio->bi_end_io = raid10_end_write_request; 1263 
bio_set_op_attrs(mbio, op, do_sync | do_fua); 1264 if (!replacement && test_bit(FailFast, 1265 &conf->mirrors[devnum].rdev->flags) 1266 && enough(conf, devnum)) 1267 mbio->bi_opf |= MD_FAILFAST; 1268 mbio->bi_private = r10_bio; 1269 1270 if (conf->mddev->gendisk) 1271 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 1272 r10_bio->sector); 1273 /* flush_pending_writes() needs access to the rdev so...*/ 1274 mbio->bi_bdev = (void *)rdev; 1275 1276 atomic_inc(&r10_bio->remaining); 1277 1278 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1279 if (cb) 1280 plug = container_of(cb, struct raid1_plug_cb, cb); 1281 else 1282 plug = NULL; 1283 if (plug) { 1284 bio_list_add(&plug->pending, mbio); 1285 } else { 1286 spin_lock_irqsave(&conf->device_lock, flags); 1287 bio_list_add(&conf->pending_bio_list, mbio); 1288 spin_unlock_irqrestore(&conf->device_lock, flags); 1289 md_wakeup_thread(mddev->thread); 1290 } 1291 } 1292 1293 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1294 { 1295 int i; 1296 struct r10conf *conf = mddev->private; 1297 struct md_rdev *blocked_rdev; 1298 1299 retry_wait: 1300 blocked_rdev = NULL; 1301 rcu_read_lock(); 1302 for (i = 0; i < conf->copies; i++) { 1303 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1304 struct md_rdev *rrdev = rcu_dereference( 1305 conf->mirrors[i].replacement); 1306 if (rdev == rrdev) 1307 rrdev = NULL; 1308 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1309 atomic_inc(&rdev->nr_pending); 1310 blocked_rdev = rdev; 1311 break; 1312 } 1313 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1314 atomic_inc(&rrdev->nr_pending); 1315 blocked_rdev = rrdev; 1316 break; 1317 } 1318 1319 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1320 sector_t first_bad; 1321 sector_t dev_sector = r10_bio->devs[i].addr; 1322 int bad_sectors; 1323 int is_bad; 1324 1325 /* 1326 * Discard request doesn't care the write result 1327 * so it doesn't need to wait 
blocked disk here. 1328 */ 1329 if (!r10_bio->sectors) 1330 continue; 1331 1332 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 1333 &first_bad, &bad_sectors); 1334 if (is_bad < 0) { 1335 /* 1336 * Mustn't write here until the bad block 1337 * is acknowledged 1338 */ 1339 atomic_inc(&rdev->nr_pending); 1340 set_bit(BlockedBadBlocks, &rdev->flags); 1341 blocked_rdev = rdev; 1342 break; 1343 } 1344 } 1345 } 1346 rcu_read_unlock(); 1347 1348 if (unlikely(blocked_rdev)) { 1349 /* Have to wait for this device to get unblocked, then retry */ 1350 allow_barrier(conf); 1351 raid10_log(conf->mddev, "%s wait rdev %d blocked", 1352 __func__, blocked_rdev->raid_disk); 1353 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1354 wait_barrier(conf, false); 1355 goto retry_wait; 1356 } 1357 } 1358 1359 static void raid10_write_request(struct mddev *mddev, struct bio *bio, 1360 struct r10bio *r10_bio) 1361 { 1362 struct r10conf *conf = mddev->private; 1363 int i; 1364 sector_t sectors; 1365 int max_sectors; 1366 1367 if ((mddev_is_clustered(mddev) && 1368 md_cluster_ops->area_resyncing(mddev, WRITE, 1369 bio->bi_iter.bi_sector, 1370 bio_end_sector(bio)))) { 1371 DEFINE_WAIT(w); 1372 /* Bail out if REQ_NOWAIT is set for the bio */ 1373 if (bio->bi_opf & REQ_NOWAIT) { 1374 bio_wouldblock_error(bio); 1375 return; 1376 } 1377 for (;;) { 1378 prepare_to_wait(&conf->wait_barrier, 1379 &w, TASK_IDLE); 1380 if (!md_cluster_ops->area_resyncing(mddev, WRITE, 1381 bio->bi_iter.bi_sector, bio_end_sector(bio))) 1382 break; 1383 schedule(); 1384 } 1385 finish_wait(&conf->wait_barrier, &w); 1386 } 1387 1388 sectors = r10_bio->sectors; 1389 if (!regular_request_wait(mddev, conf, bio, sectors)) 1390 return; 1391 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1392 (mddev->reshape_backwards 1393 ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && 1394 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) 1395 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && 1396 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1397 /* Need to update reshape_position in metadata */ 1398 mddev->reshape_position = conf->reshape_progress; 1399 set_mask_bits(&mddev->sb_flags, 0, 1400 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 1401 md_wakeup_thread(mddev->thread); 1402 if (bio->bi_opf & REQ_NOWAIT) { 1403 allow_barrier(conf); 1404 bio_wouldblock_error(bio); 1405 return; 1406 } 1407 raid10_log(conf->mddev, "wait reshape metadata"); 1408 wait_event(mddev->sb_wait, 1409 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 1410 1411 conf->reshape_safe = mddev->reshape_position; 1412 } 1413 1414 /* first select target devices under rcu_lock and 1415 * inc refcount on their rdev. Record them by setting 1416 * bios[x] to bio 1417 * If there are known/acknowledged bad blocks on any device 1418 * on which we have seen a write error, we want to avoid 1419 * writing to those blocks. This potentially requires several 1420 * writes to write around the bad blocks. Each set of writes 1421 * gets its own r10_bio with a set of bios attached. 
1422 */ 1423 1424 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1425 raid10_find_phys(conf, r10_bio); 1426 1427 wait_blocked_dev(mddev, r10_bio); 1428 1429 rcu_read_lock(); 1430 max_sectors = r10_bio->sectors; 1431 1432 for (i = 0; i < conf->copies; i++) { 1433 int d = r10_bio->devs[i].devnum; 1434 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1435 struct md_rdev *rrdev = rcu_dereference( 1436 conf->mirrors[d].replacement); 1437 if (rdev == rrdev) 1438 rrdev = NULL; 1439 if (rdev && (test_bit(Faulty, &rdev->flags))) 1440 rdev = NULL; 1441 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1442 rrdev = NULL; 1443 1444 r10_bio->devs[i].bio = NULL; 1445 r10_bio->devs[i].repl_bio = NULL; 1446 1447 if (!rdev && !rrdev) { 1448 set_bit(R10BIO_Degraded, &r10_bio->state); 1449 continue; 1450 } 1451 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1452 sector_t first_bad; 1453 sector_t dev_sector = r10_bio->devs[i].addr; 1454 int bad_sectors; 1455 int is_bad; 1456 1457 is_bad = is_badblock(rdev, dev_sector, max_sectors, 1458 &first_bad, &bad_sectors); 1459 if (is_bad && first_bad <= dev_sector) { 1460 /* Cannot write here at all */ 1461 bad_sectors -= (dev_sector - first_bad); 1462 if (bad_sectors < max_sectors) 1463 /* Mustn't write more than bad_sectors 1464 * to other devices yet 1465 */ 1466 max_sectors = bad_sectors; 1467 /* We don't set R10BIO_Degraded as that 1468 * only applies if the disk is missing, 1469 * so it might be re-added, and we want to 1470 * know to recover this chunk. 1471 * In this case the device is here, and the 1472 * fact that this chunk is not in-sync is 1473 * recorded in the bad block log. 
1474 */ 1475 continue; 1476 } 1477 if (is_bad) { 1478 int good_sectors = first_bad - dev_sector; 1479 if (good_sectors < max_sectors) 1480 max_sectors = good_sectors; 1481 } 1482 } 1483 if (rdev) { 1484 r10_bio->devs[i].bio = bio; 1485 atomic_inc(&rdev->nr_pending); 1486 } 1487 if (rrdev) { 1488 r10_bio->devs[i].repl_bio = bio; 1489 atomic_inc(&rrdev->nr_pending); 1490 } 1491 } 1492 rcu_read_unlock(); 1493 1494 if (max_sectors < r10_bio->sectors) 1495 r10_bio->sectors = max_sectors; 1496 1497 if (r10_bio->sectors < bio_sectors(bio)) { 1498 struct bio *split = bio_split(bio, r10_bio->sectors, 1499 GFP_NOIO, &conf->bio_split); 1500 bio_chain(split, bio); 1501 allow_barrier(conf); 1502 submit_bio_noacct(bio); 1503 wait_barrier(conf, false); 1504 bio = split; 1505 r10_bio->master_bio = bio; 1506 } 1507 1508 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1509 r10_bio->start_time = bio_start_io_acct(bio); 1510 atomic_set(&r10_bio->remaining, 1); 1511 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1512 1513 for (i = 0; i < conf->copies; i++) { 1514 if (r10_bio->devs[i].bio) 1515 raid10_write_one_disk(mddev, r10_bio, bio, false, i); 1516 if (r10_bio->devs[i].repl_bio) 1517 raid10_write_one_disk(mddev, r10_bio, bio, true, i); 1518 } 1519 one_write_done(r10_bio); 1520 } 1521 1522 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) 1523 { 1524 struct r10conf *conf = mddev->private; 1525 struct r10bio *r10_bio; 1526 1527 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1528 1529 r10_bio->master_bio = bio; 1530 r10_bio->sectors = sectors; 1531 1532 r10_bio->mddev = mddev; 1533 r10_bio->sector = bio->bi_iter.bi_sector; 1534 r10_bio->state = 0; 1535 r10_bio->read_slot = -1; 1536 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 1537 conf->geo.raid_disks); 1538 1539 if (bio_data_dir(bio) == READ) 1540 raid10_read_request(mddev, bio, r10_bio); 1541 else 1542 raid10_write_request(mddev, bio, r10_bio); 1543 } 1544 
1545 static void raid_end_discard_bio(struct r10bio *r10bio) 1546 { 1547 struct r10conf *conf = r10bio->mddev->private; 1548 struct r10bio *first_r10bio; 1549 1550 while (atomic_dec_and_test(&r10bio->remaining)) { 1551 1552 allow_barrier(conf); 1553 1554 if (!test_bit(R10BIO_Discard, &r10bio->state)) { 1555 first_r10bio = (struct r10bio *)r10bio->master_bio; 1556 free_r10bio(r10bio); 1557 r10bio = first_r10bio; 1558 } else { 1559 md_write_end(r10bio->mddev); 1560 bio_endio(r10bio->master_bio); 1561 free_r10bio(r10bio); 1562 break; 1563 } 1564 } 1565 } 1566 1567 static void raid10_end_discard_request(struct bio *bio) 1568 { 1569 struct r10bio *r10_bio = bio->bi_private; 1570 struct r10conf *conf = r10_bio->mddev->private; 1571 struct md_rdev *rdev = NULL; 1572 int dev; 1573 int slot, repl; 1574 1575 /* 1576 * We don't care the return value of discard bio 1577 */ 1578 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 1579 set_bit(R10BIO_Uptodate, &r10_bio->state); 1580 1581 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1582 if (repl) 1583 rdev = conf->mirrors[dev].replacement; 1584 if (!rdev) { 1585 /* 1586 * raid10_remove_disk uses smp_mb to make sure rdev is set to 1587 * replacement before setting replacement to NULL. It can read 1588 * rdev first without barrier protect even replacment is NULL 1589 */ 1590 smp_rmb(); 1591 rdev = conf->mirrors[dev].rdev; 1592 } 1593 1594 raid_end_discard_bio(r10_bio); 1595 rdev_dec_pending(rdev, conf->mddev); 1596 } 1597 1598 /* 1599 * There are some limitations to handle discard bio 1600 * 1st, the discard size is bigger than stripe_size*2. 
/*
 * Handle a discard bio by sending one large discard to each member device
 * instead of going through the regular chunk-by-chunk write path.
 *
 * There are some limitations:
 * 1st, the discard size must be at least stripe_size * 2.
 * 2nd, if a reshape is running, the discard is not handled here and the
 * caller falls back to the old way.
 *
 * Returns 0 when the bio has been taken care of (including the case where a
 * REQ_NOWAIT bio was failed with bio_wouldblock_error()), or -EAGAIN when the
 * caller must handle the bio itself.
 */
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{
	struct r10conf *conf = mddev->private;
	struct geom *geo = &conf->geo;
	int far_copies = geo->far_copies;
	bool first_copy = true;
	struct r10bio *r10_bio, *first_r10bio;
	struct bio *split;
	int disk;
	sector_t chunk;
	unsigned int stripe_size;
	unsigned int stripe_data_disks;
	sector_t split_size;
	sector_t bio_start, bio_end;
	sector_t first_stripe_index, last_stripe_index;
	sector_t start_disk_offset;
	unsigned int start_disk_index;
	sector_t end_disk_offset;
	unsigned int end_disk_index;
	unsigned int remainder;

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return -EAGAIN;

	/* This path always sleeps on the barrier, so NOWAIT cannot be honoured. */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
		bio_wouldblock_error(bio);
		return 0;
	}
	wait_barrier(conf, false);

	/*
	 * Check reshape again, in case a reshape started after the first
	 * MD_RECOVERY_RESHAPE check and before wait_barrier
	 */
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		goto out;

	if (geo->near_copies)
		stripe_data_disks = geo->raid_disks / geo->near_copies +
					geo->raid_disks % geo->near_copies;
	else
		stripe_data_disks = geo->raid_disks;

	stripe_size = stripe_data_disks << geo->chunk_shift;

	bio_start = bio->bi_iter.bi_sector;
	bio_end = bio_end_sector(bio);

	/*
	 * A discard bio may be smaller than the stripe size, or it may cross
	 * a stripe boundary while covering more than one stripe.  For the far
	 * offset layout, if the discard region is not aligned to the stripe
	 * size, there would be holes when the discard is submitted to the
	 * member disks.  For simplicity, only handle discard bios whose
	 * region is at least stripe_size * 2.
	 */
	if (bio_sectors(bio) < stripe_size*2)
		goto out;

	/*
	 * Keep the bio aligned with the stripe size: split off an unaligned
	 * head and an unaligned tail and resubmit them separately.
	 */
	div_u64_rem(bio_start, stripe_size, &remainder);
	if (remainder) {
		split_size = stripe_size - remainder;
		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
		bio_chain(split, bio);
		allow_barrier(conf);
		/* Resend the first split part */
		submit_bio_noacct(split);
		wait_barrier(conf, false);
	}
	div_u64_rem(bio_end, stripe_size, &remainder);
	if (remainder) {
		split_size = bio_sectors(bio) - remainder;
		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
		bio_chain(split, bio);
		allow_barrier(conf);
		/* Resend the second split part */
		submit_bio_noacct(bio);
		bio = split;
		wait_barrier(conf, false);
	}

	/* Recompute the range: the splits above may have changed the bio. */
	bio_start = bio->bi_iter.bi_sector;
	bio_end = bio_end_sector(bio);

	/*
	 * Raid10 uses the chunk as the unit to store data, similar to raid0.
	 * One stripe contains the chunks from all member disks (one chunk
	 * from each disk at the same HBA address).  For layout details, see
	 * 'man md 4'.
	 */
	chunk = bio_start >> geo->chunk_shift;
	chunk *= geo->near_copies;
	first_stripe_index = chunk;
	/* sector_div() leaves the quotient in place and returns the remainder. */
	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
	if (geo->far_offset)
		first_stripe_index *= geo->far_copies;
	start_disk_offset = (bio_start & geo->chunk_mask) +
				(first_stripe_index << geo->chunk_shift);

	chunk = bio_end >> geo->chunk_shift;
	chunk *= geo->near_copies;
	last_stripe_index = chunk;
	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
	if (geo->far_offset)
		last_stripe_index *= geo->far_copies;
	end_disk_offset = (bio_end & geo->chunk_mask) +
				(last_stripe_index << geo->chunk_shift);

retry_discard:
	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
	r10_bio->mddev = mddev;
	r10_bio->state = 0;
	/* sectors == 0 marks this r10_bio as a discard for wait_blocked_dev(). */
	r10_bio->sectors = 0;
	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
	wait_blocked_dev(mddev, r10_bio);

	/*
	 * For the far layout more than one r10bio is needed to cover all
	 * regions.  Inspired by raid10_sync_request, the first r10bio's
	 * master_bio records the discard bio, and every other r10bio's
	 * master_bio records the first r10bio.  The first r10bio is released
	 * only after all the others have finished, and the discard bio
	 * completes only when the first r10bio finishes.
	 */
	if (first_copy) {
		r10_bio->master_bio = bio;
		set_bit(R10BIO_Discard, &r10_bio->state);
		first_copy = false;
		first_r10bio = r10_bio;
	} else
		r10_bio->master_bio = (struct bio *)first_r10bio;

	/*
	 * First select the target devices under rcu_lock and take a
	 * reference (nr_pending) on each rdev.  Record them by setting
	 * devs[x].bio / devs[x].repl_bio to bio.
	 */
	rcu_read_lock();
	for (disk = 0; disk < geo->raid_disks; disk++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
		struct md_rdev *rrdev = rcu_dereference(
			conf->mirrors[disk].replacement);

		r10_bio->devs[disk].bio = NULL;
		r10_bio->devs[disk].repl_bio = NULL;

		if (rdev && (test_bit(Faulty, &rdev->flags)))
			rdev = NULL;
		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
			rrdev = NULL;
		if (!rdev && !rrdev)
			continue;

		if (rdev) {
			r10_bio->devs[disk].bio = bio;
			atomic_inc(&rdev->nr_pending);
		}
		if (rrdev) {
			r10_bio->devs[disk].repl_bio = bio;
			atomic_inc(&rrdev->nr_pending);
		}
	}
	rcu_read_unlock();

	/* Hold one reference of our own until all clones have been issued. */
	atomic_set(&r10_bio->remaining, 1);
	for (disk = 0; disk < geo->raid_disks; disk++) {
		sector_t dev_start, dev_end;
		struct bio *mbio, *rbio = NULL;

		/*
		 * Now calculate the start and end address for each disk.
		 * The space between dev_start and dev_end is the discard
		 * region.
		 *
		 * For dev_start there are three cases:
		 * 1st, the disk is before start_disk: think of it as being in
		 * the next stripe, so dev_start is the start address of the
		 * next stripe.
		 * 2nd, the disk is after start_disk: it is in the same stripe
		 * as the first disk.
		 * 3rd, the disk is the first disk itself: start_disk_offset
		 * can be used directly.
		 */
		if (disk < start_disk_index)
			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
		else if (disk > start_disk_index)
			dev_start = first_stripe_index * mddev->chunk_sectors;
		else
			dev_start = start_disk_offset;

		if (disk < end_disk_index)
			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
		else if (disk > end_disk_index)
			dev_end = last_stripe_index * mddev->chunk_sectors;
		else
			dev_end = end_disk_offset;

		/*
		 * Only discard bios of at least one stripe are handled, so
		 * dev_end > dev_start always holds.
		 * The rcu lock is not needed to get the rdev here: its
		 * nr_pending was already raised in the first loop.
		 */
		if (r10_bio->devs[disk].bio) {
			struct md_rdev *rdev = conf->mirrors[disk].rdev;
			mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
					       &mddev->bio_set);
			mbio->bi_end_io = raid10_end_discard_request;
			mbio->bi_private = r10_bio;
			r10_bio->devs[disk].bio = mbio;
			r10_bio->devs[disk].devnum = disk;
			atomic_inc(&r10_bio->remaining);
			md_submit_discard_bio(mddev, rdev, mbio,
					dev_start + choose_data_offset(r10_bio, rdev),
					dev_end - dev_start);
			bio_endio(mbio);
		}
		if (r10_bio->devs[disk].repl_bio) {
			struct md_rdev *rrdev = conf->mirrors[disk].replacement;
			rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
					       &mddev->bio_set);
			rbio->bi_end_io = raid10_end_discard_request;
			rbio->bi_private = r10_bio;
			r10_bio->devs[disk].repl_bio = rbio;
			r10_bio->devs[disk].devnum = disk;
			atomic_inc(&r10_bio->remaining);
			md_submit_discard_bio(mddev, rrdev, rbio,
					dev_start + choose_data_offset(r10_bio, rrdev),
					dev_end - dev_start);
			bio_endio(rbio);
		}
	}

	/*
	 * Far (non-offset) layout: repeat for the next far copy, shifted by
	 * geo->stride, with a fresh r10bio chained to the first one.
	 */
	if (!geo->far_offset && --far_copies) {
		first_stripe_index += geo->stride >> geo->chunk_shift;
		start_disk_offset += geo->stride;
		last_stripe_index += geo->stride >> geo->chunk_shift;
		end_disk_offset += geo->stride;
		atomic_inc(&first_r10bio->remaining);
		raid_end_discard_bio(r10_bio);
		wait_barrier(conf, false);
		goto retry_discard;
	}

	raid_end_discard_bio(r10_bio);

	return 0;
out:
	allow_barrier(conf);
	return -EAGAIN;
}
(!geo->far_offset && --far_copies) { 1837 first_stripe_index += geo->stride >> geo->chunk_shift; 1838 start_disk_offset += geo->stride; 1839 last_stripe_index += geo->stride >> geo->chunk_shift; 1840 end_disk_offset += geo->stride; 1841 atomic_inc(&first_r10bio->remaining); 1842 raid_end_discard_bio(r10_bio); 1843 wait_barrier(conf, false); 1844 goto retry_discard; 1845 } 1846 1847 raid_end_discard_bio(r10_bio); 1848 1849 return 0; 1850 out: 1851 allow_barrier(conf); 1852 return -EAGAIN; 1853 } 1854 1855 static bool raid10_make_request(struct mddev *mddev, struct bio *bio) 1856 { 1857 struct r10conf *conf = mddev->private; 1858 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1859 int chunk_sects = chunk_mask + 1; 1860 int sectors = bio_sectors(bio); 1861 1862 if (unlikely(bio->bi_opf & REQ_PREFLUSH) 1863 && md_flush_request(mddev, bio)) 1864 return true; 1865 1866 if (!md_write_start(mddev, bio)) 1867 return false; 1868 1869 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1870 if (!raid10_handle_discard(mddev, bio)) 1871 return true; 1872 1873 /* 1874 * If this request crosses a chunk boundary, we need to split 1875 * it. 
1876 */ 1877 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + 1878 sectors > chunk_sects 1879 && (conf->geo.near_copies < conf->geo.raid_disks 1880 || conf->prev.near_copies < 1881 conf->prev.raid_disks))) 1882 sectors = chunk_sects - 1883 (bio->bi_iter.bi_sector & 1884 (chunk_sects - 1)); 1885 __make_request(mddev, bio, sectors); 1886 1887 /* In case raid10d snuck in to freeze_array */ 1888 wake_up(&conf->wait_barrier); 1889 return true; 1890 } 1891 1892 static void raid10_status(struct seq_file *seq, struct mddev *mddev) 1893 { 1894 struct r10conf *conf = mddev->private; 1895 int i; 1896 1897 if (conf->geo.near_copies < conf->geo.raid_disks) 1898 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1899 if (conf->geo.near_copies > 1) 1900 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1901 if (conf->geo.far_copies > 1) { 1902 if (conf->geo.far_offset) 1903 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1904 else 1905 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1906 if (conf->geo.far_set_size != conf->geo.raid_disks) 1907 seq_printf(seq, " %d devices per set", conf->geo.far_set_size); 1908 } 1909 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1910 conf->geo.raid_disks - mddev->degraded); 1911 rcu_read_lock(); 1912 for (i = 0; i < conf->geo.raid_disks; i++) { 1913 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1914 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1915 } 1916 rcu_read_unlock(); 1917 seq_printf(seq, "]"); 1918 } 1919 1920 /* check if there are enough drives for 1921 * every block to appear on atleast one. 1922 * Don't consider the device numbered 'ignore' 1923 * as we might be about to remove it. 
1924 */ 1925 static int _enough(struct r10conf *conf, int previous, int ignore) 1926 { 1927 int first = 0; 1928 int has_enough = 0; 1929 int disks, ncopies; 1930 if (previous) { 1931 disks = conf->prev.raid_disks; 1932 ncopies = conf->prev.near_copies; 1933 } else { 1934 disks = conf->geo.raid_disks; 1935 ncopies = conf->geo.near_copies; 1936 } 1937 1938 rcu_read_lock(); 1939 do { 1940 int n = conf->copies; 1941 int cnt = 0; 1942 int this = first; 1943 while (n--) { 1944 struct md_rdev *rdev; 1945 if (this != ignore && 1946 (rdev = rcu_dereference(conf->mirrors[this].rdev)) && 1947 test_bit(In_sync, &rdev->flags)) 1948 cnt++; 1949 this = (this+1) % disks; 1950 } 1951 if (cnt == 0) 1952 goto out; 1953 first = (first + ncopies) % disks; 1954 } while (first != 0); 1955 has_enough = 1; 1956 out: 1957 rcu_read_unlock(); 1958 return has_enough; 1959 } 1960 1961 static int enough(struct r10conf *conf, int ignore) 1962 { 1963 /* when calling 'enough', both 'prev' and 'geo' must 1964 * be stable. 1965 * This is ensured if ->reconfig_mutex or ->device_lock 1966 * is held. 1967 */ 1968 return _enough(conf, 0, ignore) && 1969 _enough(conf, 1, ignore); 1970 } 1971 1972 /** 1973 * raid10_error() - RAID10 error handler. 1974 * @mddev: affected md device. 1975 * @rdev: member device to fail. 1976 * 1977 * The routine acknowledges &rdev failure and determines new @mddev state. 1978 * If it failed, then: 1979 * - &MD_BROKEN flag is set in &mddev->flags. 1980 * Otherwise, it must be degraded: 1981 * - recovery is interrupted. 1982 * - &mddev->degraded is bumped. 1983 1984 * @rdev is marked as &Faulty excluding case when array is failed and 1985 * &mddev->fail_last_dev is off. 
 */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	unsigned long flags;

	/* All state changes below are made under conf->device_lock. */
	spin_lock_irqsave(&conf->device_lock, flags);

	/*
	 * An In_sync device for which enough() reports false when it is
	 * ignored: flag the array as broken, and only go on to fail the
	 * device when fail_last_dev is set.
	 */
	if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
		set_bit(MD_BROKEN, &mddev->flags);

		if (!mddev->fail_last_dev) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			return;
		}
	}
	/* Count the device as missing only if it was In_sync until now. */
	if (test_and_clear_bit(In_sync, &rdev->flags))
		mddev->degraded++;

	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	spin_unlock_irqrestore(&conf->device_lock, flags);
	pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
		"md/raid10:%s: Operation continuing on %d devices.\n",
		mdname(mddev), rdev->bdev,
		mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

/*
 * Dump the per-slot device state of @conf via pr_debug().
 * Tolerates a NULL @conf.
 */
static void print_conf(struct r10conf *conf)
{
	int i;
	struct md_rdev *rdev;

	pr_debug("RAID10 conf printout:\n");
	if (!conf) {
		pr_debug("(!conf)\n");
		return;
	}
	pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
		 conf->geo.raid_disks);

	/* This is only called with ->reconfig_mutex held, so
	 * rcu protection of rdev is not needed */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		rdev = conf->mirrors[i].rdev;
		if (rdev)
			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
				 i, !test_bit(In_sync, &rdev->flags),
				 !test_bit(Faulty, &rdev->flags),
				 rdev->bdev);
	}
}

/*
 * Pass through the barrier once (wait_barrier() immediately followed by
 * allow_barrier()) and then tear down the resync buffer pool.
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf, false);
	allow_barrier(conf);

	mempool_exit(&conf->r10buf_pool);
}

/*
 * Mark every fully recovered (recovery_offset == MaxSector), non-Faulty
 * device In_sync and reduce mddev->degraded by the number of slots that
 * became active.
 *
 * Returns that number.
 */
static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active */
			/*
			 * Only count the slot when the original device was
			 * absent or was not In_sync itself.
			 */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && tmp->rdev->recovery_offset == MaxSector
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}

/*
 * Attach @rdev to the array, either in an empty slot or as the
 * replacement of a device flagged WantReplacement.
 *
 * Returns 0 on success, -EBUSY while the array is not in sync,
 * -EINVAL / -ENXIO when the early checks fail, or -EEXIST when no
 * suitable slot was found.
 */
static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror;
	int first = 0;
	int last = conf->geo.raid_disks - 1;

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
		return -EINVAL;

	if (md_integrity_add_rdev(rdev, mddev))
		return -ENXIO;

	/* A preset raid_disk restricts the search to that single slot. */
	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/* Start the search at the previously used slot when it is free. */
	if (rdev->saved_raid_disk >= first &&
	    rdev->saved_raid_disk < conf->geo.raid_disks &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		struct raid10_info *p = &conf->mirrors[mirror];
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			/*
			 * Occupied slot: usable only as a replacement, and
			 * only if none is attached yet.
			 */
			if (!test_bit(WantReplacement, &p->rdev->flags) ||
			    p->replacement != NULL)
				continue;
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = mirror;
			err = 0;
			if (mddev->gendisk)
				disk_stack_limits(mddev->gendisk, rdev->bdev,
						  rdev->data_offset << 9);
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}

		if (mddev->gendisk)
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);

		p->head_position = 0;
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		/* Landing in a different slot than before needs a full sync. */
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		rcu_assign_pointer(p->rdev, rdev);
		break;
	}

	print_conf(conf);
	return err;
}

/*
 * Detach @rdev from its slot. If the slot also holds a replacement and
 * the primary was the one removed, the replacement is promoted.
 *
 * Returns 0 when @rdev is not part of the configuration, -EBUSY when it
 * cannot be removed right now, otherwise the result of
 * md_integrity_register().
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p;

	print_conf(conf);
	if (unlikely(number >= mddev->raid_disks))
		return 0;
	p = conf->mirrors + number;
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
		/* Re-check nr_pending after synchronize_rcu(). */
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			*rdevp = rdev;
			goto abort;
		}
	}
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither -- if they are careful.
			   */
		p->replacement = NULL;
	}

	clear_bit(WantReplacement, &rdev->flags);
	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}

/*
 * Common completion handling for resync, recovery and reshape reads.
 * @d indexes conf->mirrors and identifies the device that was read.
 */
static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
	struct r10conf *conf = r10_bio->mddev->private;

	if (!bio->bi_status)
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}

/* bi_end_io for resync/recovery reads: look up the device, then finish. */
static void end_sync_read(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct r10conf *conf = r10_bio->mddev->private;
	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	__end_sync_read(r10_bio, bio, d);
}

/* bi_end_io for reshape reads; the device index is taken from read_slot. */
static void end_reshape_read(struct bio *bio)
{
	/* reshape read bio isn't allocated from r10buf_pool */
	struct r10bio *r10_bio = bio->bi_private;

	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
}

/*
 * Drop one count from r10_bio->remaining. Each r10_bio whose count
 * reaches zero is either handed to reschedule_retry() (MadeGood or
 * WriteError set) or released with put_buf(), and the walk continues
 * with the r10_bio linked through ->master_bio. md_done_sync() is called
 * once the r10_bio with a NULL ->master_bio has completed.
 */
static void end_sync_request(struct r10bio *r10_bio)
{
	struct mddev *mddev = r10_bio->mddev;

	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			/* read ->sectors before the r10_bio may be released */
			sector_t s = r10_bio->sectors;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			md_done_sync(mddev, s, 1);
			break;
		} else {
			/* read the link before the r10_bio may be released */
			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
}

/*
 * bi_end_io for resync/recovery writes. Records a write error (or fails
 * a replacement device outright), notes when a write succeeded over a
 * known bad block, and then completes the sync request.
 */
static void end_sync_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	sector_t first_bad;
	int bad_sectors;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	else
		rdev = conf->mirrors[d].rdev;

	if (bio->bi_status) {
		if (repl)
			md_error(mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else if (is_badblock(rdev,
			     r10_bio->devs[slot].addr,
			     r10_bio->sectors,
			     &first_bad, &bad_sectors))
		set_bit(R10BIO_MadeGood, &r10_bio->state);

	rdev_dec_pending(rdev, mddev);

	end_sync_request(r10_bio);
}

/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 *
 * The first slot whose read completed without error supplies the
 * reference data; the other slots that were read are compared against
 * it and rewritten when they differ or could not be read.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;
	struct page **tpages, **fpages;

	/* Hold one count ourselves; it is dropped at 'done'. */
	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (!r10_bio->devs[i].bio->bi_status)
			break;

	/* No copy could be read: nothing to compare or write. */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;
	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
	fbio->bi_iter.bi_idx = 0;
	fpages = get_resync_pages(fbio)->pages;

	/* Number of pages needed to hold r10_bio->sectors, rounded up. */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* now find blocks with errors */
	for (i=0 ; i < conf->copies ; i++) {
		int j, d;
		struct md_rdev *rdev;
		struct resync_pages *rp;

		tbio = r10_bio->devs[i].bio;

		/* Only consider slots that were set up as sync reads. */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;

		tpages = get_resync_pages(tbio)->pages;
		d = r10_bio->devs[i].devnum;
		rdev = conf->mirrors[d].rdev;
		if (!r10_bio->devs[i].bio->bi_status) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
			int sectors = r10_bio->sectors;
			for (j = 0; j < vcnt; j++) {
				int len = PAGE_SIZE;
				/* the last page may be only partly used */
				if (sectors < (len / 512))
					len = sectors * 512;
				if (memcmp(page_address(fpages[j]),
					   page_address(tpages[j]),
					   len))
					break;
				sectors -= len/512;
			}
			/* every page matched: this copy is already in sync */
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		} else if (test_bit(FailFast, &rdev->flags)) {
			/* Just give up on this device */
			md_error(rdev->mddev, rdev);
			continue;
		}
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		/* fetch rp before bio_reset(); bi_private is re-set from it below */
		rp = get_resync_pages(tbio);
		bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE);

		md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);

		rp->raid_bio = r10_bio;
		tbio->bi_private = rp;
		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
		tbio->bi_end_io = end_sync_write;

		bio_copy_data(tbio, fbio);

		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

		if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
			tbio->bi_opf |= MD_FAILFAST;
		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
		submit_bio_noacct(tbio);
	}

	/* Now write out to any replacement devices
	 * that are active
	 */
	for (i = 0; i < conf->copies; i++) {
		int d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * Copy the reference data unless this slot's primary bio is
		 * being rewritten above or is the reference bio itself.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			bio_copy_data(tbio, fbio);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(tbio));
		submit_bio_noacct(tbio);
	}

done:
	/* Drop our own count; finish here if no write is outstanding. */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
2477 * We recover all non-is_sync drives by finding the virtual address of 2478 * each, and then choose a working drive that also has that virt address. 2479 * There is a separate r10_bio for each non-in_sync drive. 2480 * Only the first two slots are in use. The first for reading, 2481 * The second for writing. 2482 * 2483 */ 2484 static void fix_recovery_read_error(struct r10bio *r10_bio) 2485 { 2486 /* We got a read error during recovery. 2487 * We repeat the read in smaller page-sized sections. 2488 * If a read succeeds, write it to the new device or record 2489 * a bad block if we cannot. 2490 * If a read fails, record a bad block on both old and 2491 * new devices. 2492 */ 2493 struct mddev *mddev = r10_bio->mddev; 2494 struct r10conf *conf = mddev->private; 2495 struct bio *bio = r10_bio->devs[0].bio; 2496 sector_t sect = 0; 2497 int sectors = r10_bio->sectors; 2498 int idx = 0; 2499 int dr = r10_bio->devs[0].devnum; 2500 int dw = r10_bio->devs[1].devnum; 2501 struct page **pages = get_resync_pages(bio)->pages; 2502 2503 while (sectors) { 2504 int s = sectors; 2505 struct md_rdev *rdev; 2506 sector_t addr; 2507 int ok; 2508 2509 if (s > (PAGE_SIZE>>9)) 2510 s = PAGE_SIZE >> 9; 2511 2512 rdev = conf->mirrors[dr].rdev; 2513 addr = r10_bio->devs[0].addr + sect, 2514 ok = sync_page_io(rdev, 2515 addr, 2516 s << 9, 2517 pages[idx], 2518 REQ_OP_READ, false); 2519 if (ok) { 2520 rdev = conf->mirrors[dw].rdev; 2521 addr = r10_bio->devs[1].addr + sect; 2522 ok = sync_page_io(rdev, 2523 addr, 2524 s << 9, 2525 pages[idx], 2526 REQ_OP_WRITE, false); 2527 if (!ok) { 2528 set_bit(WriteErrorSeen, &rdev->flags); 2529 if (!test_and_set_bit(WantReplacement, 2530 &rdev->flags)) 2531 set_bit(MD_RECOVERY_NEEDED, 2532 &rdev->mddev->recovery); 2533 } 2534 } 2535 if (!ok) { 2536 /* We don't worry if we cannot set a bad block - 2537 * it really is bad so there is no loss in not 2538 * recording it yet 2539 */ 2540 rdev_set_badblocks(rdev, addr, s, 0); 2541 2542 if (rdev != 
conf->mirrors[dw].rdev) { 2543 /* need bad block on destination too */ 2544 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2545 addr = r10_bio->devs[1].addr + sect; 2546 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2547 if (!ok) { 2548 /* just abort the recovery */ 2549 pr_notice("md/raid10:%s: recovery aborted due to read error\n", 2550 mdname(mddev)); 2551 2552 conf->mirrors[dw].recovery_disabled 2553 = mddev->recovery_disabled; 2554 set_bit(MD_RECOVERY_INTR, 2555 &mddev->recovery); 2556 break; 2557 } 2558 } 2559 } 2560 2561 sectors -= s; 2562 sect += s; 2563 idx++; 2564 } 2565 } 2566 2567 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2568 { 2569 struct r10conf *conf = mddev->private; 2570 int d; 2571 struct bio *wbio, *wbio2; 2572 2573 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2574 fix_recovery_read_error(r10_bio); 2575 end_sync_request(r10_bio); 2576 return; 2577 } 2578 2579 /* 2580 * share the pages with the first bio 2581 * and submit the write request 2582 */ 2583 d = r10_bio->devs[1].devnum; 2584 wbio = r10_bio->devs[1].bio; 2585 wbio2 = r10_bio->devs[1].repl_bio; 2586 /* Need to test wbio2->bi_end_io before we call 2587 * submit_bio_noacct as if the former is NULL, 2588 * the latter is free to free wbio2. 2589 */ 2590 if (wbio2 && !wbio2->bi_end_io) 2591 wbio2 = NULL; 2592 if (wbio->bi_end_io) { 2593 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2594 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2595 submit_bio_noacct(wbio); 2596 } 2597 if (wbio2) { 2598 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2599 md_sync_acct(conf->mirrors[d].replacement->bdev, 2600 bio_sectors(wbio2)); 2601 submit_bio_noacct(wbio2); 2602 } 2603 } 2604 2605 /* 2606 * Used by fix_read_error() to decay the per rdev read_errors. 2607 * We halve the read error count for every hour that has elapsed 2608 * since the last recorded read error. 
2609 * 2610 */ 2611 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2612 { 2613 long cur_time_mon; 2614 unsigned long hours_since_last; 2615 unsigned int read_errors = atomic_read(&rdev->read_errors); 2616 2617 cur_time_mon = ktime_get_seconds(); 2618 2619 if (rdev->last_read_error == 0) { 2620 /* first time we've seen a read error */ 2621 rdev->last_read_error = cur_time_mon; 2622 return; 2623 } 2624 2625 hours_since_last = (long)(cur_time_mon - 2626 rdev->last_read_error) / 3600; 2627 2628 rdev->last_read_error = cur_time_mon; 2629 2630 /* 2631 * if hours_since_last is > the number of bits in read_errors 2632 * just set read errors to 0. We do this to avoid 2633 * overflowing the shift of read_errors by hours_since_last. 2634 */ 2635 if (hours_since_last >= 8 * sizeof(read_errors)) 2636 atomic_set(&rdev->read_errors, 0); 2637 else 2638 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2639 } 2640 2641 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2642 int sectors, struct page *page, enum req_op op) 2643 { 2644 sector_t first_bad; 2645 int bad_sectors; 2646 2647 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2648 && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) 2649 return -1; 2650 if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) 2651 /* success */ 2652 return 1; 2653 if (op == REQ_OP_WRITE) { 2654 set_bit(WriteErrorSeen, &rdev->flags); 2655 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2656 set_bit(MD_RECOVERY_NEEDED, 2657 &rdev->mddev->recovery); 2658 } 2659 /* need to record an error - either for the block or the device */ 2660 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2661 md_error(rdev->mddev, rdev); 2662 return 0; 2663 } 2664 2665 /* 2666 * This is a kernel thread which: 2667 * 2668 * 1. Retries failed read operations on working mirrors. 2669 * 2. Updates the raid superblock when problems encounter. 2670 * 3. 
Performs writes following reads for array synchronising. 2671 */ 2672 2673 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2674 { 2675 int sect = 0; /* Offset from r10_bio->sector */ 2676 int sectors = r10_bio->sectors; 2677 struct md_rdev *rdev; 2678 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2679 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2680 2681 /* still own a reference to this rdev, so it cannot 2682 * have been cleared recently. 2683 */ 2684 rdev = conf->mirrors[d].rdev; 2685 2686 if (test_bit(Faulty, &rdev->flags)) 2687 /* drive has already been failed, just ignore any 2688 more fix_read_error() attempts */ 2689 return; 2690 2691 check_decay_read_errors(mddev, rdev); 2692 atomic_inc(&rdev->read_errors); 2693 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2694 pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", 2695 mdname(mddev), rdev->bdev, 2696 atomic_read(&rdev->read_errors), max_read_errors); 2697 pr_notice("md/raid10:%s: %pg: Failing raid device\n", 2698 mdname(mddev), rdev->bdev); 2699 md_error(mddev, rdev); 2700 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2701 return; 2702 } 2703 2704 while(sectors) { 2705 int s = sectors; 2706 int sl = r10_bio->read_slot; 2707 int success = 0; 2708 int start; 2709 2710 if (s > (PAGE_SIZE>>9)) 2711 s = PAGE_SIZE >> 9; 2712 2713 rcu_read_lock(); 2714 do { 2715 sector_t first_bad; 2716 int bad_sectors; 2717 2718 d = r10_bio->devs[sl].devnum; 2719 rdev = rcu_dereference(conf->mirrors[d].rdev); 2720 if (rdev && 2721 test_bit(In_sync, &rdev->flags) && 2722 !test_bit(Faulty, &rdev->flags) && 2723 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2724 &first_bad, &bad_sectors) == 0) { 2725 atomic_inc(&rdev->nr_pending); 2726 rcu_read_unlock(); 2727 success = sync_page_io(rdev, 2728 r10_bio->devs[sl].addr + 2729 sect, 2730 s<<9, 2731 conf->tmppage, 2732 REQ_OP_READ, false); 2733 
rdev_dec_pending(rdev, mddev); 2734 rcu_read_lock(); 2735 if (success) 2736 break; 2737 } 2738 sl++; 2739 if (sl == conf->copies) 2740 sl = 0; 2741 } while (!success && sl != r10_bio->read_slot); 2742 rcu_read_unlock(); 2743 2744 if (!success) { 2745 /* Cannot read from anywhere, just mark the block 2746 * as bad on the first device to discourage future 2747 * reads. 2748 */ 2749 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2750 rdev = conf->mirrors[dn].rdev; 2751 2752 if (!rdev_set_badblocks( 2753 rdev, 2754 r10_bio->devs[r10_bio->read_slot].addr 2755 + sect, 2756 s, 0)) { 2757 md_error(mddev, rdev); 2758 r10_bio->devs[r10_bio->read_slot].bio 2759 = IO_BLOCKED; 2760 } 2761 break; 2762 } 2763 2764 start = sl; 2765 /* write it back and re-read */ 2766 rcu_read_lock(); 2767 while (sl != r10_bio->read_slot) { 2768 if (sl==0) 2769 sl = conf->copies; 2770 sl--; 2771 d = r10_bio->devs[sl].devnum; 2772 rdev = rcu_dereference(conf->mirrors[d].rdev); 2773 if (!rdev || 2774 test_bit(Faulty, &rdev->flags) || 2775 !test_bit(In_sync, &rdev->flags)) 2776 continue; 2777 2778 atomic_inc(&rdev->nr_pending); 2779 rcu_read_unlock(); 2780 if (r10_sync_page_io(rdev, 2781 r10_bio->devs[sl].addr + 2782 sect, 2783 s, conf->tmppage, REQ_OP_WRITE) 2784 == 0) { 2785 /* Well, this device is dead */ 2786 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", 2787 mdname(mddev), s, 2788 (unsigned long long)( 2789 sect + 2790 choose_data_offset(r10_bio, 2791 rdev)), 2792 rdev->bdev); 2793 pr_notice("md/raid10:%s: %pg: failing drive\n", 2794 mdname(mddev), 2795 rdev->bdev); 2796 } 2797 rdev_dec_pending(rdev, mddev); 2798 rcu_read_lock(); 2799 } 2800 sl = start; 2801 while (sl != r10_bio->read_slot) { 2802 if (sl==0) 2803 sl = conf->copies; 2804 sl--; 2805 d = r10_bio->devs[sl].devnum; 2806 rdev = rcu_dereference(conf->mirrors[d].rdev); 2807 if (!rdev || 2808 test_bit(Faulty, &rdev->flags) || 2809 !test_bit(In_sync, &rdev->flags)) 2810 continue; 2811 2812 
atomic_inc(&rdev->nr_pending); 2813 rcu_read_unlock(); 2814 switch (r10_sync_page_io(rdev, 2815 r10_bio->devs[sl].addr + 2816 sect, 2817 s, conf->tmppage, REQ_OP_READ)) { 2818 case 0: 2819 /* Well, this device is dead */ 2820 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", 2821 mdname(mddev), s, 2822 (unsigned long long)( 2823 sect + 2824 choose_data_offset(r10_bio, rdev)), 2825 rdev->bdev); 2826 pr_notice("md/raid10:%s: %pg: failing drive\n", 2827 mdname(mddev), 2828 rdev->bdev); 2829 break; 2830 case 1: 2831 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", 2832 mdname(mddev), s, 2833 (unsigned long long)( 2834 sect + 2835 choose_data_offset(r10_bio, rdev)), 2836 rdev->bdev); 2837 atomic_add(s, &rdev->corrected_errors); 2838 } 2839 2840 rdev_dec_pending(rdev, mddev); 2841 rcu_read_lock(); 2842 } 2843 rcu_read_unlock(); 2844 2845 sectors -= s; 2846 sect += s; 2847 } 2848 } 2849 2850 static int narrow_write_error(struct r10bio *r10_bio, int i) 2851 { 2852 struct bio *bio = r10_bio->master_bio; 2853 struct mddev *mddev = r10_bio->mddev; 2854 struct r10conf *conf = mddev->private; 2855 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2856 /* bio has the data to be written to slot 'i' where 2857 * we just recently had a write error. 2858 * We repeatedly clone the bio and trim down to one block, 2859 * then try the write. Where the write fails we record 2860 * a bad block. 2861 * It is conceivable that the bio doesn't exactly align with 2862 * blocks. We must handle this. 2863 * 2864 * We currently own a reference to the rdev. 
2865 */ 2866 2867 int block_sectors; 2868 sector_t sector; 2869 int sectors; 2870 int sect_to_write = r10_bio->sectors; 2871 int ok = 1; 2872 2873 if (rdev->badblocks.shift < 0) 2874 return 0; 2875 2876 block_sectors = roundup(1 << rdev->badblocks.shift, 2877 bdev_logical_block_size(rdev->bdev) >> 9); 2878 sector = r10_bio->sector; 2879 sectors = ((r10_bio->sector + block_sectors) 2880 & ~(sector_t)(block_sectors - 1)) 2881 - sector; 2882 2883 while (sect_to_write) { 2884 struct bio *wbio; 2885 sector_t wsector; 2886 if (sectors > sect_to_write) 2887 sectors = sect_to_write; 2888 /* Write at 'sector' for 'sectors' */ 2889 wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 2890 &mddev->bio_set); 2891 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); 2892 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); 2893 wbio->bi_iter.bi_sector = wsector + 2894 choose_data_offset(r10_bio, rdev); 2895 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2896 2897 if (submit_bio_wait(wbio) < 0) 2898 /* Failure! */ 2899 ok = rdev_set_badblocks(rdev, wsector, 2900 sectors, 0) 2901 && ok; 2902 2903 bio_put(wbio); 2904 sect_to_write -= sectors; 2905 sector += sectors; 2906 sectors = block_sectors; 2907 } 2908 return ok; 2909 } 2910 2911 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2912 { 2913 int slot = r10_bio->read_slot; 2914 struct bio *bio; 2915 struct r10conf *conf = mddev->private; 2916 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2917 2918 /* we got a read error. Maybe the drive is bad. Maybe just 2919 * the block and we can fix it. 2920 * We freeze all other IO, and try reading the block from 2921 * other devices. When we find one, we re-write 2922 * and check it that fixes the read error. 2923 * This is all done synchronously while the array is 2924 * frozen. 
2925 */ 2926 bio = r10_bio->devs[slot].bio; 2927 bio_put(bio); 2928 r10_bio->devs[slot].bio = NULL; 2929 2930 if (mddev->ro) 2931 r10_bio->devs[slot].bio = IO_BLOCKED; 2932 else if (!test_bit(FailFast, &rdev->flags)) { 2933 freeze_array(conf, 1); 2934 fix_read_error(conf, mddev, r10_bio); 2935 unfreeze_array(conf); 2936 } else 2937 md_error(mddev, rdev); 2938 2939 rdev_dec_pending(rdev, mddev); 2940 allow_barrier(conf); 2941 r10_bio->state = 0; 2942 raid10_read_request(mddev, r10_bio->master_bio, r10_bio); 2943 } 2944 2945 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2946 { 2947 /* Some sort of write request has finished and it 2948 * succeeded in writing where we thought there was a 2949 * bad block. So forget the bad block. 2950 * Or possibly if failed and we need to record 2951 * a bad block. 2952 */ 2953 int m; 2954 struct md_rdev *rdev; 2955 2956 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2957 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2958 for (m = 0; m < conf->copies; m++) { 2959 int dev = r10_bio->devs[m].devnum; 2960 rdev = conf->mirrors[dev].rdev; 2961 if (r10_bio->devs[m].bio == NULL || 2962 r10_bio->devs[m].bio->bi_end_io == NULL) 2963 continue; 2964 if (!r10_bio->devs[m].bio->bi_status) { 2965 rdev_clear_badblocks( 2966 rdev, 2967 r10_bio->devs[m].addr, 2968 r10_bio->sectors, 0); 2969 } else { 2970 if (!rdev_set_badblocks( 2971 rdev, 2972 r10_bio->devs[m].addr, 2973 r10_bio->sectors, 0)) 2974 md_error(conf->mddev, rdev); 2975 } 2976 rdev = conf->mirrors[dev].replacement; 2977 if (r10_bio->devs[m].repl_bio == NULL || 2978 r10_bio->devs[m].repl_bio->bi_end_io == NULL) 2979 continue; 2980 2981 if (!r10_bio->devs[m].repl_bio->bi_status) { 2982 rdev_clear_badblocks( 2983 rdev, 2984 r10_bio->devs[m].addr, 2985 r10_bio->sectors, 0); 2986 } else { 2987 if (!rdev_set_badblocks( 2988 rdev, 2989 r10_bio->devs[m].addr, 2990 r10_bio->sectors, 0)) 2991 md_error(conf->mddev, rdev); 2992 } 2993 } 2994 put_buf(r10_bio); 2995 
} else { 2996 bool fail = false; 2997 for (m = 0; m < conf->copies; m++) { 2998 int dev = r10_bio->devs[m].devnum; 2999 struct bio *bio = r10_bio->devs[m].bio; 3000 rdev = conf->mirrors[dev].rdev; 3001 if (bio == IO_MADE_GOOD) { 3002 rdev_clear_badblocks( 3003 rdev, 3004 r10_bio->devs[m].addr, 3005 r10_bio->sectors, 0); 3006 rdev_dec_pending(rdev, conf->mddev); 3007 } else if (bio != NULL && bio->bi_status) { 3008 fail = true; 3009 if (!narrow_write_error(r10_bio, m)) { 3010 md_error(conf->mddev, rdev); 3011 set_bit(R10BIO_Degraded, 3012 &r10_bio->state); 3013 } 3014 rdev_dec_pending(rdev, conf->mddev); 3015 } 3016 bio = r10_bio->devs[m].repl_bio; 3017 rdev = conf->mirrors[dev].replacement; 3018 if (rdev && bio == IO_MADE_GOOD) { 3019 rdev_clear_badblocks( 3020 rdev, 3021 r10_bio->devs[m].addr, 3022 r10_bio->sectors, 0); 3023 rdev_dec_pending(rdev, conf->mddev); 3024 } 3025 } 3026 if (fail) { 3027 spin_lock_irq(&conf->device_lock); 3028 list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 3029 conf->nr_queued++; 3030 spin_unlock_irq(&conf->device_lock); 3031 /* 3032 * In case freeze_array() is waiting for condition 3033 * nr_pending == nr_queued + extra to be true. 
3034 */ 3035 wake_up(&conf->wait_barrier); 3036 md_wakeup_thread(conf->mddev->thread); 3037 } else { 3038 if (test_bit(R10BIO_WriteError, 3039 &r10_bio->state)) 3040 close_write(r10_bio); 3041 raid_end_bio_io(r10_bio); 3042 } 3043 } 3044 } 3045 3046 static void raid10d(struct md_thread *thread) 3047 { 3048 struct mddev *mddev = thread->mddev; 3049 struct r10bio *r10_bio; 3050 unsigned long flags; 3051 struct r10conf *conf = mddev->private; 3052 struct list_head *head = &conf->retry_list; 3053 struct blk_plug plug; 3054 3055 md_check_recovery(mddev); 3056 3057 if (!list_empty_careful(&conf->bio_end_io_list) && 3058 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3059 LIST_HEAD(tmp); 3060 spin_lock_irqsave(&conf->device_lock, flags); 3061 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3062 while (!list_empty(&conf->bio_end_io_list)) { 3063 list_move(conf->bio_end_io_list.prev, &tmp); 3064 conf->nr_queued--; 3065 } 3066 } 3067 spin_unlock_irqrestore(&conf->device_lock, flags); 3068 while (!list_empty(&tmp)) { 3069 r10_bio = list_first_entry(&tmp, struct r10bio, 3070 retry_list); 3071 list_del(&r10_bio->retry_list); 3072 if (mddev->degraded) 3073 set_bit(R10BIO_Degraded, &r10_bio->state); 3074 3075 if (test_bit(R10BIO_WriteError, 3076 &r10_bio->state)) 3077 close_write(r10_bio); 3078 raid_end_bio_io(r10_bio); 3079 } 3080 } 3081 3082 blk_start_plug(&plug); 3083 for (;;) { 3084 3085 flush_pending_writes(conf); 3086 3087 spin_lock_irqsave(&conf->device_lock, flags); 3088 if (list_empty(head)) { 3089 spin_unlock_irqrestore(&conf->device_lock, flags); 3090 break; 3091 } 3092 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 3093 list_del(head->prev); 3094 conf->nr_queued--; 3095 spin_unlock_irqrestore(&conf->device_lock, flags); 3096 3097 mddev = r10_bio->mddev; 3098 conf = mddev->private; 3099 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 3100 test_bit(R10BIO_WriteError, &r10_bio->state)) 3101 handle_write_completed(conf, r10_bio); 3102 else 
if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 3103 reshape_request_write(mddev, r10_bio); 3104 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 3105 sync_request_write(mddev, r10_bio); 3106 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 3107 recovery_request_write(mddev, r10_bio); 3108 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 3109 handle_read_error(mddev, r10_bio); 3110 else 3111 WARN_ON_ONCE(1); 3112 3113 cond_resched(); 3114 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 3115 md_check_recovery(mddev); 3116 } 3117 blk_finish_plug(&plug); 3118 } 3119 3120 static int init_resync(struct r10conf *conf) 3121 { 3122 int ret, buffs, i; 3123 3124 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 3125 BUG_ON(mempool_initialized(&conf->r10buf_pool)); 3126 conf->have_replacement = 0; 3127 for (i = 0; i < conf->geo.raid_disks; i++) 3128 if (conf->mirrors[i].replacement) 3129 conf->have_replacement = 1; 3130 ret = mempool_init(&conf->r10buf_pool, buffs, 3131 r10buf_pool_alloc, r10buf_pool_free, conf); 3132 if (ret) 3133 return ret; 3134 conf->next_resync = 0; 3135 return 0; 3136 } 3137 3138 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) 3139 { 3140 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); 3141 struct rsync_pages *rp; 3142 struct bio *bio; 3143 int nalloc; 3144 int i; 3145 3146 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 3147 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 3148 nalloc = conf->copies; /* resync */ 3149 else 3150 nalloc = 2; /* recovery */ 3151 3152 for (i = 0; i < nalloc; i++) { 3153 bio = r10bio->devs[i].bio; 3154 rp = bio->bi_private; 3155 bio_reset(bio, NULL, 0); 3156 bio->bi_private = rp; 3157 bio = r10bio->devs[i].repl_bio; 3158 if (bio) { 3159 rp = bio->bi_private; 3160 bio_reset(bio, NULL, 0); 3161 bio->bi_private = rp; 3162 } 3163 } 3164 return r10bio; 3165 } 3166 3167 /* 3168 * Set cluster_sync_high since we need other nodes to add the 3169 * range 
[cluster_sync_low, cluster_sync_high] to suspend list. 3170 */ 3171 static void raid10_set_cluster_sync_high(struct r10conf *conf) 3172 { 3173 sector_t window_size; 3174 int extra_chunk, chunks; 3175 3176 /* 3177 * First, here we define "stripe" as a unit which across 3178 * all member devices one time, so we get chunks by use 3179 * raid_disks / near_copies. Otherwise, if near_copies is 3180 * close to raid_disks, then resync window could increases 3181 * linearly with the increase of raid_disks, which means 3182 * we will suspend a really large IO window while it is not 3183 * necessary. If raid_disks is not divisible by near_copies, 3184 * an extra chunk is needed to ensure the whole "stripe" is 3185 * covered. 3186 */ 3187 3188 chunks = conf->geo.raid_disks / conf->geo.near_copies; 3189 if (conf->geo.raid_disks % conf->geo.near_copies == 0) 3190 extra_chunk = 0; 3191 else 3192 extra_chunk = 1; 3193 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; 3194 3195 /* 3196 * At least use a 32M window to align with raid1's resync window 3197 */ 3198 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? 3199 CLUSTER_RESYNC_WINDOW_SECTORS : window_size; 3200 3201 conf->cluster_sync_high = conf->cluster_sync_low + window_size; 3202 } 3203 3204 /* 3205 * perform a "sync" on one "block" 3206 * 3207 * We need to make sure that no normal I/O request - particularly write 3208 * requests - conflict with active sync requests. 3209 * 3210 * This is achieved by tracking pending requests and a 'barrier' concept 3211 * that can be installed to exclude normal IO requests. 3212 * 3213 * Resync and recovery are handled very differently. 3214 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. 3215 * 3216 * For resync, we iterate over virtual addresses, read all copies, 3217 * and update if there are differences. If only one copy is live, 3218 * skip it. 
3219 * For recovery, we iterate over physical addresses, read a good 3220 * value for each non-in_sync drive, and over-write. 3221 * 3222 * So, for recovery we may have several outstanding complex requests for a 3223 * given address, one for each out-of-sync device. We model this by allocating 3224 * a number of r10_bio structures, one for each out-of-sync device. 3225 * As we setup these structures, we collect all bio's together into a list 3226 * which we then process collectively to add pages, and then process again 3227 * to pass to submit_bio_noacct. 3228 * 3229 * The r10_bio structures are linked using a borrowed master_bio pointer. 3230 * This link is counted in ->remaining. When the r10_bio that points to NULL 3231 * has its remaining count decremented to 0, the whole complex operation 3232 * is complete. 3233 * 3234 */ 3235 3236 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, 3237 int *skipped) 3238 { 3239 struct r10conf *conf = mddev->private; 3240 struct r10bio *r10_bio; 3241 struct bio *biolist = NULL, *bio; 3242 sector_t max_sector, nr_sectors; 3243 int i; 3244 int max_sync; 3245 sector_t sync_blocks; 3246 sector_t sectors_skipped = 0; 3247 int chunks_skipped = 0; 3248 sector_t chunk_mask = conf->geo.chunk_mask; 3249 int page_idx = 0; 3250 3251 if (!mempool_initialized(&conf->r10buf_pool)) 3252 if (init_resync(conf)) 3253 return 0; 3254 3255 /* 3256 * Allow skipping a full rebuild for incremental assembly 3257 * of a clean array, like RAID1 does. 
3258 */ 3259 if (mddev->bitmap == NULL && 3260 mddev->recovery_cp == MaxSector && 3261 mddev->reshape_position == MaxSector && 3262 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 3263 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 3264 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 3265 conf->fullsync == 0) { 3266 *skipped = 1; 3267 return mddev->dev_sectors - sector_nr; 3268 } 3269 3270 skipped: 3271 max_sector = mddev->dev_sectors; 3272 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 3273 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3274 max_sector = mddev->resync_max_sectors; 3275 if (sector_nr >= max_sector) { 3276 conf->cluster_sync_low = 0; 3277 conf->cluster_sync_high = 0; 3278 3279 /* If we aborted, we need to abort the 3280 * sync on the 'current' bitmap chucks (there can 3281 * be several when recovering multiple devices). 3282 * as we may have started syncing it but not finished. 3283 * We can find the current address in 3284 * mddev->curr_resync, but for recovery, 3285 * we need to convert that to several 3286 * virtual addresses. 3287 */ 3288 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3289 end_reshape(conf); 3290 close_sync(conf); 3291 return 0; 3292 } 3293 3294 if (mddev->curr_resync < max_sector) { /* aborted */ 3295 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3296 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3297 &sync_blocks, 1); 3298 else for (i = 0; i < conf->geo.raid_disks; i++) { 3299 sector_t sect = 3300 raid10_find_virt(conf, mddev->curr_resync, i); 3301 md_bitmap_end_sync(mddev->bitmap, sect, 3302 &sync_blocks, 1); 3303 } 3304 } else { 3305 /* completed sync */ 3306 if ((!mddev->bitmap || conf->fullsync) 3307 && conf->have_replacement 3308 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3309 /* Completed a full sync so the replacements 3310 * are now fully recovered. 
3311 */ 3312 rcu_read_lock(); 3313 for (i = 0; i < conf->geo.raid_disks; i++) { 3314 struct md_rdev *rdev = 3315 rcu_dereference(conf->mirrors[i].replacement); 3316 if (rdev) 3317 rdev->recovery_offset = MaxSector; 3318 } 3319 rcu_read_unlock(); 3320 } 3321 conf->fullsync = 0; 3322 } 3323 md_bitmap_close_sync(mddev->bitmap); 3324 close_sync(conf); 3325 *skipped = 1; 3326 return sectors_skipped; 3327 } 3328 3329 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3330 return reshape_request(mddev, sector_nr, skipped); 3331 3332 if (chunks_skipped >= conf->geo.raid_disks) { 3333 /* if there has been nothing to do on any drive, 3334 * then there is nothing to do at all.. 3335 */ 3336 *skipped = 1; 3337 return (max_sector - sector_nr) + sectors_skipped; 3338 } 3339 3340 if (max_sector > mddev->resync_max) 3341 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 3342 3343 /* make sure whole request will fit in a chunk - if chunks 3344 * are meaningful 3345 */ 3346 if (conf->geo.near_copies < conf->geo.raid_disks && 3347 max_sector > (sector_nr | chunk_mask)) 3348 max_sector = (sector_nr | chunk_mask) + 1; 3349 3350 /* 3351 * If there is non-resync activity waiting for a turn, then let it 3352 * though before starting on this new sync request. 3353 */ 3354 if (conf->nr_waiting) 3355 schedule_timeout_uninterruptible(1); 3356 3357 /* Again, very different code for resync and recovery. 3358 * Both must result in an r10bio with a list of bios that 3359 * have bi_end_io, bi_sector, bi_bdev set, 3360 * and bi_private set to the r10bio. 3361 * For recovery, we may actually create several r10bios 3362 * with 2 bios in each, that correspond to the bios in the main one. 3363 * In this case, the subordinate r10bios link back through a 3364 * borrowed master_bio pointer, and the counter in the master 3365 * includes a ref from each subordinate. 
3366 */ 3367 /* First, we decide what to do and set ->bi_end_io 3368 * To end_sync_read if we want to read, and 3369 * end_sync_write if we will want to write. 3370 */ 3371 3372 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 3373 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3374 /* recovery... the complicated one */ 3375 int j; 3376 r10_bio = NULL; 3377 3378 for (i = 0 ; i < conf->geo.raid_disks; i++) { 3379 int still_degraded; 3380 struct r10bio *rb2; 3381 sector_t sect; 3382 int must_sync; 3383 int any_working; 3384 int need_recover = 0; 3385 int need_replace = 0; 3386 struct raid10_info *mirror = &conf->mirrors[i]; 3387 struct md_rdev *mrdev, *mreplace; 3388 3389 rcu_read_lock(); 3390 mrdev = rcu_dereference(mirror->rdev); 3391 mreplace = rcu_dereference(mirror->replacement); 3392 3393 if (mrdev != NULL && 3394 !test_bit(Faulty, &mrdev->flags) && 3395 !test_bit(In_sync, &mrdev->flags)) 3396 need_recover = 1; 3397 if (mreplace != NULL && 3398 !test_bit(Faulty, &mreplace->flags)) 3399 need_replace = 1; 3400 3401 if (!need_recover && !need_replace) { 3402 rcu_read_unlock(); 3403 continue; 3404 } 3405 3406 still_degraded = 0; 3407 /* want to reconstruct this device */ 3408 rb2 = r10_bio; 3409 sect = raid10_find_virt(conf, sector_nr, i); 3410 if (sect >= mddev->resync_max_sectors) { 3411 /* last stripe is not complete - don't 3412 * try to recover this sector. 
3413 */ 3414 rcu_read_unlock(); 3415 continue; 3416 } 3417 if (mreplace && test_bit(Faulty, &mreplace->flags)) 3418 mreplace = NULL; 3419 /* Unless we are doing a full sync, or a replacement 3420 * we only need to recover the block if it is set in 3421 * the bitmap 3422 */ 3423 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3424 &sync_blocks, 1); 3425 if (sync_blocks < max_sync) 3426 max_sync = sync_blocks; 3427 if (!must_sync && 3428 mreplace == NULL && 3429 !conf->fullsync) { 3430 /* yep, skip the sync_blocks here, but don't assume 3431 * that there will never be anything to do here 3432 */ 3433 chunks_skipped = -1; 3434 rcu_read_unlock(); 3435 continue; 3436 } 3437 atomic_inc(&mrdev->nr_pending); 3438 if (mreplace) 3439 atomic_inc(&mreplace->nr_pending); 3440 rcu_read_unlock(); 3441 3442 r10_bio = raid10_alloc_init_r10buf(conf); 3443 r10_bio->state = 0; 3444 raise_barrier(conf, rb2 != NULL); 3445 atomic_set(&r10_bio->remaining, 0); 3446 3447 r10_bio->master_bio = (struct bio*)rb2; 3448 if (rb2) 3449 atomic_inc(&rb2->remaining); 3450 r10_bio->mddev = mddev; 3451 set_bit(R10BIO_IsRecover, &r10_bio->state); 3452 r10_bio->sector = sect; 3453 3454 raid10_find_phys(conf, r10_bio); 3455 3456 /* Need to check if the array will still be 3457 * degraded 3458 */ 3459 rcu_read_lock(); 3460 for (j = 0; j < conf->geo.raid_disks; j++) { 3461 struct md_rdev *rdev = rcu_dereference( 3462 conf->mirrors[j].rdev); 3463 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3464 still_degraded = 1; 3465 break; 3466 } 3467 } 3468 3469 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3470 &sync_blocks, still_degraded); 3471 3472 any_working = 0; 3473 for (j=0; j<conf->copies;j++) { 3474 int k; 3475 int d = r10_bio->devs[j].devnum; 3476 sector_t from_addr, to_addr; 3477 struct md_rdev *rdev = 3478 rcu_dereference(conf->mirrors[d].rdev); 3479 sector_t sector, first_bad; 3480 int bad_sectors; 3481 if (!rdev || 3482 !test_bit(In_sync, &rdev->flags)) 3483 continue; 3484 /* This 
is where we read from */ 3485 any_working = 1; 3486 sector = r10_bio->devs[j].addr; 3487 3488 if (is_badblock(rdev, sector, max_sync, 3489 &first_bad, &bad_sectors)) { 3490 if (first_bad > sector) 3491 max_sync = first_bad - sector; 3492 else { 3493 bad_sectors -= (sector 3494 - first_bad); 3495 if (max_sync > bad_sectors) 3496 max_sync = bad_sectors; 3497 continue; 3498 } 3499 } 3500 bio = r10_bio->devs[0].bio; 3501 bio->bi_next = biolist; 3502 biolist = bio; 3503 bio->bi_end_io = end_sync_read; 3504 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3505 if (test_bit(FailFast, &rdev->flags)) 3506 bio->bi_opf |= MD_FAILFAST; 3507 from_addr = r10_bio->devs[j].addr; 3508 bio->bi_iter.bi_sector = from_addr + 3509 rdev->data_offset; 3510 bio_set_dev(bio, rdev->bdev); 3511 atomic_inc(&rdev->nr_pending); 3512 /* and we write to 'i' (if not in_sync) */ 3513 3514 for (k=0; k<conf->copies; k++) 3515 if (r10_bio->devs[k].devnum == i) 3516 break; 3517 BUG_ON(k == conf->copies); 3518 to_addr = r10_bio->devs[k].addr; 3519 r10_bio->devs[0].devnum = d; 3520 r10_bio->devs[0].addr = from_addr; 3521 r10_bio->devs[1].devnum = i; 3522 r10_bio->devs[1].addr = to_addr; 3523 3524 if (need_recover) { 3525 bio = r10_bio->devs[1].bio; 3526 bio->bi_next = biolist; 3527 biolist = bio; 3528 bio->bi_end_io = end_sync_write; 3529 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3530 bio->bi_iter.bi_sector = to_addr 3531 + mrdev->data_offset; 3532 bio_set_dev(bio, mrdev->bdev); 3533 atomic_inc(&r10_bio->remaining); 3534 } else 3535 r10_bio->devs[1].bio->bi_end_io = NULL; 3536 3537 /* and maybe write to replacement */ 3538 bio = r10_bio->devs[1].repl_bio; 3539 if (bio) 3540 bio->bi_end_io = NULL; 3541 /* Note: if need_replace, then bio 3542 * cannot be NULL as r10buf_pool_alloc will 3543 * have allocated it. 
3544 */ 3545 if (!need_replace) 3546 break; 3547 bio->bi_next = biolist; 3548 biolist = bio; 3549 bio->bi_end_io = end_sync_write; 3550 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3551 bio->bi_iter.bi_sector = to_addr + 3552 mreplace->data_offset; 3553 bio_set_dev(bio, mreplace->bdev); 3554 atomic_inc(&r10_bio->remaining); 3555 break; 3556 } 3557 rcu_read_unlock(); 3558 if (j == conf->copies) { 3559 /* Cannot recover, so abort the recovery or 3560 * record a bad block */ 3561 if (any_working) { 3562 /* problem is that there are bad blocks 3563 * on other device(s) 3564 */ 3565 int k; 3566 for (k = 0; k < conf->copies; k++) 3567 if (r10_bio->devs[k].devnum == i) 3568 break; 3569 if (!test_bit(In_sync, 3570 &mrdev->flags) 3571 && !rdev_set_badblocks( 3572 mrdev, 3573 r10_bio->devs[k].addr, 3574 max_sync, 0)) 3575 any_working = 0; 3576 if (mreplace && 3577 !rdev_set_badblocks( 3578 mreplace, 3579 r10_bio->devs[k].addr, 3580 max_sync, 0)) 3581 any_working = 0; 3582 } 3583 if (!any_working) { 3584 if (!test_and_set_bit(MD_RECOVERY_INTR, 3585 &mddev->recovery)) 3586 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", 3587 mdname(mddev)); 3588 mirror->recovery_disabled 3589 = mddev->recovery_disabled; 3590 } 3591 put_buf(r10_bio); 3592 if (rb2) 3593 atomic_dec(&rb2->remaining); 3594 r10_bio = rb2; 3595 rdev_dec_pending(mrdev, mddev); 3596 if (mreplace) 3597 rdev_dec_pending(mreplace, mddev); 3598 break; 3599 } 3600 rdev_dec_pending(mrdev, mddev); 3601 if (mreplace) 3602 rdev_dec_pending(mreplace, mddev); 3603 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { 3604 /* Only want this if there is elsewhere to 3605 * read from. 'j' is currently the first 3606 * readable copy. 
3607 */ 3608 int targets = 1; 3609 for (; j < conf->copies; j++) { 3610 int d = r10_bio->devs[j].devnum; 3611 if (conf->mirrors[d].rdev && 3612 test_bit(In_sync, 3613 &conf->mirrors[d].rdev->flags)) 3614 targets++; 3615 } 3616 if (targets == 1) 3617 r10_bio->devs[0].bio->bi_opf 3618 &= ~MD_FAILFAST; 3619 } 3620 } 3621 if (biolist == NULL) { 3622 while (r10_bio) { 3623 struct r10bio *rb2 = r10_bio; 3624 r10_bio = (struct r10bio*) rb2->master_bio; 3625 rb2->master_bio = NULL; 3626 put_buf(rb2); 3627 } 3628 goto giveup; 3629 } 3630 } else { 3631 /* resync. Schedule a read for every block at this virt offset */ 3632 int count = 0; 3633 3634 /* 3635 * Since curr_resync_completed could probably not update in 3636 * time, and we will set cluster_sync_low based on it. 3637 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for 3638 * safety reason, which ensures curr_resync_completed is 3639 * updated in bitmap_cond_end_sync. 3640 */ 3641 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 3642 mddev_is_clustered(mddev) && 3643 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 3644 3645 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 3646 &sync_blocks, mddev->degraded) && 3647 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3648 &mddev->recovery)) { 3649 /* We can skip this block */ 3650 *skipped = 1; 3651 return sync_blocks + sectors_skipped; 3652 } 3653 if (sync_blocks < max_sync) 3654 max_sync = sync_blocks; 3655 r10_bio = raid10_alloc_init_r10buf(conf); 3656 r10_bio->state = 0; 3657 3658 r10_bio->mddev = mddev; 3659 atomic_set(&r10_bio->remaining, 0); 3660 raise_barrier(conf, 0); 3661 conf->next_resync = sector_nr; 3662 3663 r10_bio->master_bio = NULL; 3664 r10_bio->sector = sector_nr; 3665 set_bit(R10BIO_IsSync, &r10_bio->state); 3666 raid10_find_phys(conf, r10_bio); 3667 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3668 3669 for (i = 0; i < conf->copies; i++) { 3670 int d = r10_bio->devs[i].devnum; 3671 sector_t first_bad, sector; 
3672 int bad_sectors; 3673 struct md_rdev *rdev; 3674 3675 if (r10_bio->devs[i].repl_bio) 3676 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3677 3678 bio = r10_bio->devs[i].bio; 3679 bio->bi_status = BLK_STS_IOERR; 3680 rcu_read_lock(); 3681 rdev = rcu_dereference(conf->mirrors[d].rdev); 3682 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3683 rcu_read_unlock(); 3684 continue; 3685 } 3686 sector = r10_bio->devs[i].addr; 3687 if (is_badblock(rdev, sector, max_sync, 3688 &first_bad, &bad_sectors)) { 3689 if (first_bad > sector) 3690 max_sync = first_bad - sector; 3691 else { 3692 bad_sectors -= (sector - first_bad); 3693 if (max_sync > bad_sectors) 3694 max_sync = bad_sectors; 3695 rcu_read_unlock(); 3696 continue; 3697 } 3698 } 3699 atomic_inc(&rdev->nr_pending); 3700 atomic_inc(&r10_bio->remaining); 3701 bio->bi_next = biolist; 3702 biolist = bio; 3703 bio->bi_end_io = end_sync_read; 3704 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3705 if (test_bit(FailFast, &rdev->flags)) 3706 bio->bi_opf |= MD_FAILFAST; 3707 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3708 bio_set_dev(bio, rdev->bdev); 3709 count++; 3710 3711 rdev = rcu_dereference(conf->mirrors[d].replacement); 3712 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3713 rcu_read_unlock(); 3714 continue; 3715 } 3716 atomic_inc(&rdev->nr_pending); 3717 3718 /* Need to set up for writing to the replacement */ 3719 bio = r10_bio->devs[i].repl_bio; 3720 bio->bi_status = BLK_STS_IOERR; 3721 3722 sector = r10_bio->devs[i].addr; 3723 bio->bi_next = biolist; 3724 biolist = bio; 3725 bio->bi_end_io = end_sync_write; 3726 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3727 if (test_bit(FailFast, &rdev->flags)) 3728 bio->bi_opf |= MD_FAILFAST; 3729 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3730 bio_set_dev(bio, rdev->bdev); 3731 count++; 3732 rcu_read_unlock(); 3733 } 3734 3735 if (count < 2) { 3736 for (i=0; i<conf->copies; i++) { 3737 int d = r10_bio->devs[i].devnum; 3738 if 
(r10_bio->devs[i].bio->bi_end_io) 3739 rdev_dec_pending(conf->mirrors[d].rdev, 3740 mddev); 3741 if (r10_bio->devs[i].repl_bio && 3742 r10_bio->devs[i].repl_bio->bi_end_io) 3743 rdev_dec_pending( 3744 conf->mirrors[d].replacement, 3745 mddev); 3746 } 3747 put_buf(r10_bio); 3748 biolist = NULL; 3749 goto giveup; 3750 } 3751 } 3752 3753 nr_sectors = 0; 3754 if (sector_nr + max_sync < max_sector) 3755 max_sector = sector_nr + max_sync; 3756 do { 3757 struct page *page; 3758 int len = PAGE_SIZE; 3759 if (sector_nr + (len>>9) > max_sector) 3760 len = (max_sector - sector_nr) << 9; 3761 if (len == 0) 3762 break; 3763 for (bio= biolist ; bio ; bio=bio->bi_next) { 3764 struct resync_pages *rp = get_resync_pages(bio); 3765 page = resync_fetch_page(rp, page_idx); 3766 /* 3767 * won't fail because the vec table is big enough 3768 * to hold all these pages 3769 */ 3770 bio_add_page(bio, page, len, 0); 3771 } 3772 nr_sectors += len>>9; 3773 sector_nr += len>>9; 3774 } while (++page_idx < RESYNC_PAGES); 3775 r10_bio->sectors = nr_sectors; 3776 3777 if (mddev_is_clustered(mddev) && 3778 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3779 /* It is resync not recovery */ 3780 if (conf->cluster_sync_high < sector_nr + nr_sectors) { 3781 conf->cluster_sync_low = mddev->curr_resync_completed; 3782 raid10_set_cluster_sync_high(conf); 3783 /* Send resync message */ 3784 md_cluster_ops->resync_info_update(mddev, 3785 conf->cluster_sync_low, 3786 conf->cluster_sync_high); 3787 } 3788 } else if (mddev_is_clustered(mddev)) { 3789 /* This is recovery not resync */ 3790 sector_t sect_va1, sect_va2; 3791 bool broadcast_msg = false; 3792 3793 for (i = 0; i < conf->geo.raid_disks; i++) { 3794 /* 3795 * sector_nr is a device address for recovery, so we 3796 * need translate it to array address before compare 3797 * with cluster_sync_high. 
3798 */ 3799 sect_va1 = raid10_find_virt(conf, sector_nr, i); 3800 3801 if (conf->cluster_sync_high < sect_va1 + nr_sectors) { 3802 broadcast_msg = true; 3803 /* 3804 * curr_resync_completed is similar as 3805 * sector_nr, so make the translation too. 3806 */ 3807 sect_va2 = raid10_find_virt(conf, 3808 mddev->curr_resync_completed, i); 3809 3810 if (conf->cluster_sync_low == 0 || 3811 conf->cluster_sync_low > sect_va2) 3812 conf->cluster_sync_low = sect_va2; 3813 } 3814 } 3815 if (broadcast_msg) { 3816 raid10_set_cluster_sync_high(conf); 3817 md_cluster_ops->resync_info_update(mddev, 3818 conf->cluster_sync_low, 3819 conf->cluster_sync_high); 3820 } 3821 } 3822 3823 while (biolist) { 3824 bio = biolist; 3825 biolist = biolist->bi_next; 3826 3827 bio->bi_next = NULL; 3828 r10_bio = get_resync_r10bio(bio); 3829 r10_bio->sectors = nr_sectors; 3830 3831 if (bio->bi_end_io == end_sync_read) { 3832 md_sync_acct_bio(bio, nr_sectors); 3833 bio->bi_status = 0; 3834 submit_bio_noacct(bio); 3835 } 3836 } 3837 3838 if (sectors_skipped) 3839 /* pretend they weren't skipped, it makes 3840 * no important difference in this case 3841 */ 3842 md_done_sync(mddev, sectors_skipped, 1); 3843 3844 return sectors_skipped + nr_sectors; 3845 giveup: 3846 /* There is nowhere to write, so all non-sync 3847 * drives must be failed or in resync, all drives 3848 * have a bad block, so try the next chunk... 
3849 */ 3850 if (sector_nr + max_sync < max_sector) 3851 max_sector = sector_nr + max_sync; 3852 3853 sectors_skipped += (max_sector - sector_nr); 3854 chunks_skipped ++; 3855 sector_nr = max_sector; 3856 goto skipped; 3857 } 3858 3859 static sector_t 3860 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 3861 { 3862 sector_t size; 3863 struct r10conf *conf = mddev->private; 3864 3865 if (!raid_disks) 3866 raid_disks = min(conf->geo.raid_disks, 3867 conf->prev.raid_disks); 3868 if (!sectors) 3869 sectors = conf->dev_sectors; 3870 3871 size = sectors >> conf->geo.chunk_shift; 3872 sector_div(size, conf->geo.far_copies); 3873 size = size * raid_disks; 3874 sector_div(size, conf->geo.near_copies); 3875 3876 return size << conf->geo.chunk_shift; 3877 } 3878 3879 static void calc_sectors(struct r10conf *conf, sector_t size) 3880 { 3881 /* Calculate the number of sectors-per-device that will 3882 * actually be used, and set conf->dev_sectors and 3883 * conf->stride 3884 */ 3885 3886 size = size >> conf->geo.chunk_shift; 3887 sector_div(size, conf->geo.far_copies); 3888 size = size * conf->geo.raid_disks; 3889 sector_div(size, conf->geo.near_copies); 3890 /* 'size' is now the number of chunks in the array */ 3891 /* calculate "used chunks per device" */ 3892 size = size * conf->copies; 3893 3894 /* We need to round up when dividing by raid_disks to 3895 * get the stride size. 
3896 */ 3897 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3898 3899 conf->dev_sectors = size << conf->geo.chunk_shift; 3900 3901 if (conf->geo.far_offset) 3902 conf->geo.stride = 1 << conf->geo.chunk_shift; 3903 else { 3904 sector_div(size, conf->geo.far_copies); 3905 conf->geo.stride = size << conf->geo.chunk_shift; 3906 } 3907 } 3908 3909 enum geo_type {geo_new, geo_old, geo_start}; 3910 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3911 { 3912 int nc, fc, fo; 3913 int layout, chunk, disks; 3914 switch (new) { 3915 case geo_old: 3916 layout = mddev->layout; 3917 chunk = mddev->chunk_sectors; 3918 disks = mddev->raid_disks - mddev->delta_disks; 3919 break; 3920 case geo_new: 3921 layout = mddev->new_layout; 3922 chunk = mddev->new_chunk_sectors; 3923 disks = mddev->raid_disks; 3924 break; 3925 default: /* avoid 'may be unused' warnings */ 3926 case geo_start: /* new when starting reshape - raid_disks not 3927 * updated yet. */ 3928 layout = mddev->new_layout; 3929 chunk = mddev->new_chunk_sectors; 3930 disks = mddev->raid_disks + mddev->delta_disks; 3931 break; 3932 } 3933 if (layout >> 19) 3934 return -1; 3935 if (chunk < (PAGE_SIZE >> 9) || 3936 !is_power_of_2(chunk)) 3937 return -2; 3938 nc = layout & 255; 3939 fc = (layout >> 8) & 255; 3940 fo = layout & (1<<16); 3941 geo->raid_disks = disks; 3942 geo->near_copies = nc; 3943 geo->far_copies = fc; 3944 geo->far_offset = fo; 3945 switch (layout >> 17) { 3946 case 0: /* original layout. simple but not always optimal */ 3947 geo->far_set_size = disks; 3948 break; 3949 case 1: /* "improved" layout which was buggy. 
Hopefully no-one is 3950 * actually using this, but leave code here just in case.*/ 3951 geo->far_set_size = disks/fc; 3952 WARN(geo->far_set_size < fc, 3953 "This RAID10 layout does not provide data safety - please backup and create new array\n"); 3954 break; 3955 case 2: /* "improved" layout fixed to match documentation */ 3956 geo->far_set_size = fc * nc; 3957 break; 3958 default: /* Not a valid layout */ 3959 return -1; 3960 } 3961 geo->chunk_mask = chunk - 1; 3962 geo->chunk_shift = ffz(~chunk); 3963 return nc*fc; 3964 } 3965 3966 static struct r10conf *setup_conf(struct mddev *mddev) 3967 { 3968 struct r10conf *conf = NULL; 3969 int err = -EINVAL; 3970 struct geom geo; 3971 int copies; 3972 3973 copies = setup_geo(&geo, mddev, geo_new); 3974 3975 if (copies == -2) { 3976 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", 3977 mdname(mddev), PAGE_SIZE); 3978 goto out; 3979 } 3980 3981 if (copies < 2 || copies > mddev->raid_disks) { 3982 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3983 mdname(mddev), mddev->new_layout); 3984 goto out; 3985 } 3986 3987 err = -ENOMEM; 3988 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 3989 if (!conf) 3990 goto out; 3991 3992 /* FIXME calc properly */ 3993 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), 3994 sizeof(struct raid10_info), 3995 GFP_KERNEL); 3996 if (!conf->mirrors) 3997 goto out; 3998 3999 conf->tmppage = alloc_page(GFP_KERNEL); 4000 if (!conf->tmppage) 4001 goto out; 4002 4003 conf->geo = geo; 4004 conf->copies = copies; 4005 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, 4006 rbio_pool_free, conf); 4007 if (err) 4008 goto out; 4009 4010 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 4011 if (err) 4012 goto out; 4013 4014 calc_sectors(conf, mddev->dev_sectors); 4015 if (mddev->reshape_position == MaxSector) { 4016 conf->prev = conf->geo; 4017 conf->reshape_progress = MaxSector; 4018 } else { 4019 
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 4020 err = -EINVAL; 4021 goto out; 4022 } 4023 conf->reshape_progress = mddev->reshape_position; 4024 if (conf->prev.far_offset) 4025 conf->prev.stride = 1 << conf->prev.chunk_shift; 4026 else 4027 /* far_copies must be 1 */ 4028 conf->prev.stride = conf->dev_sectors; 4029 } 4030 conf->reshape_safe = conf->reshape_progress; 4031 spin_lock_init(&conf->device_lock); 4032 INIT_LIST_HEAD(&conf->retry_list); 4033 INIT_LIST_HEAD(&conf->bio_end_io_list); 4034 4035 spin_lock_init(&conf->resync_lock); 4036 init_waitqueue_head(&conf->wait_barrier); 4037 atomic_set(&conf->nr_pending, 0); 4038 4039 err = -ENOMEM; 4040 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 4041 if (!conf->thread) 4042 goto out; 4043 4044 conf->mddev = mddev; 4045 return conf; 4046 4047 out: 4048 if (conf) { 4049 mempool_exit(&conf->r10bio_pool); 4050 kfree(conf->mirrors); 4051 safe_put_page(conf->tmppage); 4052 bioset_exit(&conf->bio_split); 4053 kfree(conf); 4054 } 4055 return ERR_PTR(err); 4056 } 4057 4058 static void raid10_set_io_opt(struct r10conf *conf) 4059 { 4060 int raid_disks = conf->geo.raid_disks; 4061 4062 if (!(conf->geo.raid_disks % conf->geo.near_copies)) 4063 raid_disks /= conf->geo.near_copies; 4064 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 4065 raid_disks); 4066 } 4067 4068 static int raid10_run(struct mddev *mddev) 4069 { 4070 struct r10conf *conf; 4071 int i, disk_idx; 4072 struct raid10_info *disk; 4073 struct md_rdev *rdev; 4074 sector_t size; 4075 sector_t min_offset_diff = 0; 4076 int first = 1; 4077 4078 if (mddev_init_writes_pending(mddev) < 0) 4079 return -ENOMEM; 4080 4081 if (mddev->private == NULL) { 4082 conf = setup_conf(mddev); 4083 if (IS_ERR(conf)) 4084 return PTR_ERR(conf); 4085 mddev->private = conf; 4086 } 4087 conf = mddev->private; 4088 if (!conf) 4089 goto out; 4090 4091 if (mddev_is_clustered(conf->mddev)) { 4092 int fc, fo; 4093 4094 fc = (mddev->layout 
>> 8) & 255; 4095 fo = mddev->layout & (1<<16); 4096 if (fc > 1 || fo > 0) { 4097 pr_err("only near layout is supported by clustered" 4098 " raid10\n"); 4099 goto out_free_conf; 4100 } 4101 } 4102 4103 mddev->thread = conf->thread; 4104 conf->thread = NULL; 4105 4106 if (mddev->queue) { 4107 blk_queue_max_discard_sectors(mddev->queue, 4108 UINT_MAX); 4109 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 4110 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 4111 raid10_set_io_opt(conf); 4112 } 4113 4114 rdev_for_each(rdev, mddev) { 4115 long long diff; 4116 4117 disk_idx = rdev->raid_disk; 4118 if (disk_idx < 0) 4119 continue; 4120 if (disk_idx >= conf->geo.raid_disks && 4121 disk_idx >= conf->prev.raid_disks) 4122 continue; 4123 disk = conf->mirrors + disk_idx; 4124 4125 if (test_bit(Replacement, &rdev->flags)) { 4126 if (disk->replacement) 4127 goto out_free_conf; 4128 disk->replacement = rdev; 4129 } else { 4130 if (disk->rdev) 4131 goto out_free_conf; 4132 disk->rdev = rdev; 4133 } 4134 diff = (rdev->new_data_offset - rdev->data_offset); 4135 if (!mddev->reshape_backwards) 4136 diff = -diff; 4137 if (diff < 0) 4138 diff = 0; 4139 if (first || diff < min_offset_diff) 4140 min_offset_diff = diff; 4141 4142 if (mddev->gendisk) 4143 disk_stack_limits(mddev->gendisk, rdev->bdev, 4144 rdev->data_offset << 9); 4145 4146 disk->head_position = 0; 4147 first = 0; 4148 } 4149 4150 /* need to check that every block has at least one working mirror */ 4151 if (!enough(conf, -1)) { 4152 pr_err("md/raid10:%s: not enough operational mirrors.\n", 4153 mdname(mddev)); 4154 goto out_free_conf; 4155 } 4156 4157 if (conf->reshape_progress != MaxSector) { 4158 /* must ensure that shape change is supported */ 4159 if (conf->geo.far_copies != 1 && 4160 conf->geo.far_offset == 0) 4161 goto out_free_conf; 4162 if (conf->prev.far_copies != 1 && 4163 conf->prev.far_offset == 0) 4164 goto out_free_conf; 4165 } 4166 4167 mddev->degraded = 0; 4168 for (i = 0; 4169 i < 
conf->geo.raid_disks 4170 || i < conf->prev.raid_disks; 4171 i++) { 4172 4173 disk = conf->mirrors + i; 4174 4175 if (!disk->rdev && disk->replacement) { 4176 /* The replacement is all we have - use it */ 4177 disk->rdev = disk->replacement; 4178 disk->replacement = NULL; 4179 clear_bit(Replacement, &disk->rdev->flags); 4180 } 4181 4182 if (!disk->rdev || 4183 !test_bit(In_sync, &disk->rdev->flags)) { 4184 disk->head_position = 0; 4185 mddev->degraded++; 4186 if (disk->rdev && 4187 disk->rdev->saved_raid_disk < 0) 4188 conf->fullsync = 1; 4189 } 4190 4191 if (disk->replacement && 4192 !test_bit(In_sync, &disk->replacement->flags) && 4193 disk->replacement->saved_raid_disk < 0) { 4194 conf->fullsync = 1; 4195 } 4196 4197 disk->recovery_disabled = mddev->recovery_disabled - 1; 4198 } 4199 4200 if (mddev->recovery_cp != MaxSector) 4201 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", 4202 mdname(mddev)); 4203 pr_info("md/raid10:%s: active with %d out of %d devices\n", 4204 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 4205 conf->geo.raid_disks); 4206 /* 4207 * Ok, everything is just fine now 4208 */ 4209 mddev->dev_sectors = conf->dev_sectors; 4210 size = raid10_size(mddev, 0, 0); 4211 md_set_array_sectors(mddev, size); 4212 mddev->resync_max_sectors = size; 4213 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 4214 4215 if (md_integrity_register(mddev)) 4216 goto out_free_conf; 4217 4218 if (conf->reshape_progress != MaxSector) { 4219 unsigned long before_length, after_length; 4220 4221 before_length = ((1 << conf->prev.chunk_shift) * 4222 conf->prev.far_copies); 4223 after_length = ((1 << conf->geo.chunk_shift) * 4224 conf->geo.far_copies); 4225 4226 if (max(before_length, after_length) > min_offset_diff) { 4227 /* This cannot work */ 4228 pr_warn("md/raid10: offset difference not enough to continue reshape\n"); 4229 goto out_free_conf; 4230 } 4231 conf->offset_diff = min_offset_diff; 4232 4233 clear_bit(MD_RECOVERY_SYNC, 
&mddev->recovery); 4234 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4235 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4236 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4237 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4238 "reshape"); 4239 if (!mddev->sync_thread) 4240 goto out_free_conf; 4241 } 4242 4243 return 0; 4244 4245 out_free_conf: 4246 md_unregister_thread(&mddev->thread); 4247 mempool_exit(&conf->r10bio_pool); 4248 safe_put_page(conf->tmppage); 4249 kfree(conf->mirrors); 4250 kfree(conf); 4251 mddev->private = NULL; 4252 out: 4253 return -EIO; 4254 } 4255 4256 static void raid10_free(struct mddev *mddev, void *priv) 4257 { 4258 struct r10conf *conf = priv; 4259 4260 mempool_exit(&conf->r10bio_pool); 4261 safe_put_page(conf->tmppage); 4262 kfree(conf->mirrors); 4263 kfree(conf->mirrors_old); 4264 kfree(conf->mirrors_new); 4265 bioset_exit(&conf->bio_split); 4266 kfree(conf); 4267 } 4268 4269 static void raid10_quiesce(struct mddev *mddev, int quiesce) 4270 { 4271 struct r10conf *conf = mddev->private; 4272 4273 if (quiesce) 4274 raise_barrier(conf, 0); 4275 else 4276 lower_barrier(conf); 4277 } 4278 4279 static int raid10_resize(struct mddev *mddev, sector_t sectors) 4280 { 4281 /* Resize of 'far' arrays is not supported. 4282 * For 'near' and 'offset' arrays we can set the 4283 * number of sectors used to be an appropriate multiple 4284 * of the chunk size. 4285 * For 'offset', this is far_copies*chunksize. 4286 * For 'near' the multiplier is the LCM of 4287 * near_copies and raid_disks. 4288 * So if far_copies > 1 && !far_offset, fail. 4289 * Else find LCM(raid_disks, near_copy)*far_copies and 4290 * multiply by chunk_size. Then round to this number. 
4291 * This is mostly done by raid10_size() 4292 */ 4293 struct r10conf *conf = mddev->private; 4294 sector_t oldsize, size; 4295 4296 if (mddev->reshape_position != MaxSector) 4297 return -EBUSY; 4298 4299 if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 4300 return -EINVAL; 4301 4302 oldsize = raid10_size(mddev, 0, 0); 4303 size = raid10_size(mddev, sectors, 0); 4304 if (mddev->external_size && 4305 mddev->array_sectors > size) 4306 return -EINVAL; 4307 if (mddev->bitmap) { 4308 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); 4309 if (ret) 4310 return ret; 4311 } 4312 md_set_array_sectors(mddev, size); 4313 if (sectors > mddev->dev_sectors && 4314 mddev->recovery_cp > oldsize) { 4315 mddev->recovery_cp = oldsize; 4316 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4317 } 4318 calc_sectors(conf, sectors); 4319 mddev->dev_sectors = conf->dev_sectors; 4320 mddev->resync_max_sectors = size; 4321 return 0; 4322 } 4323 4324 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) 4325 { 4326 struct md_rdev *rdev; 4327 struct r10conf *conf; 4328 4329 if (mddev->degraded > 0) { 4330 pr_warn("md/raid10:%s: Error: degraded raid0!\n", 4331 mdname(mddev)); 4332 return ERR_PTR(-EINVAL); 4333 } 4334 sector_div(size, devs); 4335 4336 /* Set new parameters */ 4337 mddev->new_level = 10; 4338 /* new layout: far_copies = 1, near_copies = 2 */ 4339 mddev->new_layout = (1<<8) + 2; 4340 mddev->new_chunk_sectors = mddev->chunk_sectors; 4341 mddev->delta_disks = mddev->raid_disks; 4342 mddev->raid_disks *= 2; 4343 /* make sure it will be not marked as dirty */ 4344 mddev->recovery_cp = MaxSector; 4345 mddev->dev_sectors = size; 4346 4347 conf = setup_conf(mddev); 4348 if (!IS_ERR(conf)) { 4349 rdev_for_each(rdev, mddev) 4350 if (rdev->raid_disk >= 0) { 4351 rdev->new_raid_disk = rdev->raid_disk * 2; 4352 rdev->sectors = size; 4353 } 4354 conf->barrier = 1; 4355 } 4356 4357 return conf; 4358 } 4359 4360 static void *raid10_takeover(struct mddev 
*mddev) 4361 { 4362 struct r0conf *raid0_conf; 4363 4364 /* raid10 can take over: 4365 * raid0 - providing it has only two drives 4366 */ 4367 if (mddev->level == 0) { 4368 /* for raid0 takeover only one zone is supported */ 4369 raid0_conf = mddev->private; 4370 if (raid0_conf->nr_strip_zones > 1) { 4371 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", 4372 mdname(mddev)); 4373 return ERR_PTR(-EINVAL); 4374 } 4375 return raid10_takeover_raid0(mddev, 4376 raid0_conf->strip_zone->zone_end, 4377 raid0_conf->strip_zone->nb_dev); 4378 } 4379 return ERR_PTR(-EINVAL); 4380 } 4381 4382 static int raid10_check_reshape(struct mddev *mddev) 4383 { 4384 /* Called when there is a request to change 4385 * - layout (to ->new_layout) 4386 * - chunk size (to ->new_chunk_sectors) 4387 * - raid_disks (by delta_disks) 4388 * or when trying to restart a reshape that was ongoing. 4389 * 4390 * We need to validate the request and possibly allocate 4391 * space if that might be an issue later. 4392 * 4393 * Currently we reject any reshape of a 'far' mode array, 4394 * allow chunk size to change if new is generally acceptable, 4395 * allow raid_disks to increase, and allow 4396 * a switch between 'near' mode and 'offset' mode. 
4397 */ 4398 struct r10conf *conf = mddev->private; 4399 struct geom geo; 4400 4401 if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 4402 return -EINVAL; 4403 4404 if (setup_geo(&geo, mddev, geo_start) != conf->copies) 4405 /* mustn't change number of copies */ 4406 return -EINVAL; 4407 if (geo.far_copies > 1 && !geo.far_offset) 4408 /* Cannot switch to 'far' mode */ 4409 return -EINVAL; 4410 4411 if (mddev->array_sectors & geo.chunk_mask) 4412 /* not factor of array size */ 4413 return -EINVAL; 4414 4415 if (!enough(conf, -1)) 4416 return -EINVAL; 4417 4418 kfree(conf->mirrors_new); 4419 conf->mirrors_new = NULL; 4420 if (mddev->delta_disks > 0) { 4421 /* allocate new 'mirrors' list */ 4422 conf->mirrors_new = 4423 kcalloc(mddev->raid_disks + mddev->delta_disks, 4424 sizeof(struct raid10_info), 4425 GFP_KERNEL); 4426 if (!conf->mirrors_new) 4427 return -ENOMEM; 4428 } 4429 return 0; 4430 } 4431 4432 /* 4433 * Need to check if array has failed when deciding whether to: 4434 * - start an array 4435 * - remove non-faulty devices 4436 * - add a spare 4437 * - allow a reshape 4438 * This determination is simple when no reshape is happening. 4439 * However if there is a reshape, we need to carefully check 4440 * both the before and after sections. 4441 * This is because some failed devices may only affect one 4442 * of the two sections, and some non-in_sync devices may 4443 * be insync in the section most affected by failed devices. 4444 */ 4445 static int calc_degraded(struct r10conf *conf) 4446 { 4447 int degraded, degraded2; 4448 int i; 4449 4450 rcu_read_lock(); 4451 degraded = 0; 4452 /* 'prev' section first */ 4453 for (i = 0; i < conf->prev.raid_disks; i++) { 4454 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4455 if (!rdev || test_bit(Faulty, &rdev->flags)) 4456 degraded++; 4457 else if (!test_bit(In_sync, &rdev->flags)) 4458 /* When we can reduce the number of devices in 4459 * an array, this might not contribute to 4460 * 'degraded'. 
It does now. 4461 */ 4462 degraded++; 4463 } 4464 rcu_read_unlock(); 4465 if (conf->geo.raid_disks == conf->prev.raid_disks) 4466 return degraded; 4467 rcu_read_lock(); 4468 degraded2 = 0; 4469 for (i = 0; i < conf->geo.raid_disks; i++) { 4470 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4471 if (!rdev || test_bit(Faulty, &rdev->flags)) 4472 degraded2++; 4473 else if (!test_bit(In_sync, &rdev->flags)) { 4474 /* If reshape is increasing the number of devices, 4475 * this section has already been recovered, so 4476 * it doesn't contribute to degraded. 4477 * else it does. 4478 */ 4479 if (conf->geo.raid_disks <= conf->prev.raid_disks) 4480 degraded2++; 4481 } 4482 } 4483 rcu_read_unlock(); 4484 if (degraded2 > degraded) 4485 return degraded2; 4486 return degraded; 4487 } 4488 4489 static int raid10_start_reshape(struct mddev *mddev) 4490 { 4491 /* A 'reshape' has been requested. This commits 4492 * the various 'new' fields and sets MD_RECOVER_RESHAPE 4493 * This also checks if there are enough spares and adds them 4494 * to the array. 4495 * We currently require enough spares to make the final 4496 * array non-degraded. We also require that the difference 4497 * between old and new data_offset - on each device - is 4498 * enough that we never risk over-writing. 
4499 */ 4500 4501 unsigned long before_length, after_length; 4502 sector_t min_offset_diff = 0; 4503 int first = 1; 4504 struct geom new; 4505 struct r10conf *conf = mddev->private; 4506 struct md_rdev *rdev; 4507 int spares = 0; 4508 int ret; 4509 4510 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4511 return -EBUSY; 4512 4513 if (setup_geo(&new, mddev, geo_start) != conf->copies) 4514 return -EINVAL; 4515 4516 before_length = ((1 << conf->prev.chunk_shift) * 4517 conf->prev.far_copies); 4518 after_length = ((1 << conf->geo.chunk_shift) * 4519 conf->geo.far_copies); 4520 4521 rdev_for_each(rdev, mddev) { 4522 if (!test_bit(In_sync, &rdev->flags) 4523 && !test_bit(Faulty, &rdev->flags)) 4524 spares++; 4525 if (rdev->raid_disk >= 0) { 4526 long long diff = (rdev->new_data_offset 4527 - rdev->data_offset); 4528 if (!mddev->reshape_backwards) 4529 diff = -diff; 4530 if (diff < 0) 4531 diff = 0; 4532 if (first || diff < min_offset_diff) 4533 min_offset_diff = diff; 4534 first = 0; 4535 } 4536 } 4537 4538 if (max(before_length, after_length) > min_offset_diff) 4539 return -EINVAL; 4540 4541 if (spares < mddev->delta_disks) 4542 return -EINVAL; 4543 4544 conf->offset_diff = min_offset_diff; 4545 spin_lock_irq(&conf->device_lock); 4546 if (conf->mirrors_new) { 4547 memcpy(conf->mirrors_new, conf->mirrors, 4548 sizeof(struct raid10_info)*conf->prev.raid_disks); 4549 smp_mb(); 4550 kfree(conf->mirrors_old); 4551 conf->mirrors_old = conf->mirrors; 4552 conf->mirrors = conf->mirrors_new; 4553 conf->mirrors_new = NULL; 4554 } 4555 setup_geo(&conf->geo, mddev, geo_start); 4556 smp_mb(); 4557 if (mddev->reshape_backwards) { 4558 sector_t size = raid10_size(mddev, 0, 0); 4559 if (size < mddev->array_sectors) { 4560 spin_unlock_irq(&conf->device_lock); 4561 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", 4562 mdname(mddev)); 4563 return -EINVAL; 4564 } 4565 mddev->resync_max_sectors = size; 4566 conf->reshape_progress = size; 4567 } else 4568 
conf->reshape_progress = 0; 4569 conf->reshape_safe = conf->reshape_progress; 4570 spin_unlock_irq(&conf->device_lock); 4571 4572 if (mddev->delta_disks && mddev->bitmap) { 4573 struct mdp_superblock_1 *sb = NULL; 4574 sector_t oldsize, newsize; 4575 4576 oldsize = raid10_size(mddev, 0, 0); 4577 newsize = raid10_size(mddev, 0, conf->geo.raid_disks); 4578 4579 if (!mddev_is_clustered(mddev)) { 4580 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4581 if (ret) 4582 goto abort; 4583 else 4584 goto out; 4585 } 4586 4587 rdev_for_each(rdev, mddev) { 4588 if (rdev->raid_disk > -1 && 4589 !test_bit(Faulty, &rdev->flags)) 4590 sb = page_address(rdev->sb_page); 4591 } 4592 4593 /* 4594 * some node is already performing reshape, and no need to 4595 * call md_bitmap_resize again since it should be called when 4596 * receiving BITMAP_RESIZE msg 4597 */ 4598 if ((sb && (le32_to_cpu(sb->feature_map) & 4599 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) 4600 goto out; 4601 4602 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4603 if (ret) 4604 goto abort; 4605 4606 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); 4607 if (ret) { 4608 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); 4609 goto abort; 4610 } 4611 } 4612 out: 4613 if (mddev->delta_disks > 0) { 4614 rdev_for_each(rdev, mddev) 4615 if (rdev->raid_disk < 0 && 4616 !test_bit(Faulty, &rdev->flags)) { 4617 if (raid10_add_disk(mddev, rdev) == 0) { 4618 if (rdev->raid_disk >= 4619 conf->prev.raid_disks) 4620 set_bit(In_sync, &rdev->flags); 4621 else 4622 rdev->recovery_offset = 0; 4623 4624 /* Failure here is OK */ 4625 sysfs_link_rdev(mddev, rdev); 4626 } 4627 } else if (rdev->raid_disk >= conf->prev.raid_disks 4628 && !test_bit(Faulty, &rdev->flags)) { 4629 /* This is a spare that was manually added */ 4630 set_bit(In_sync, &rdev->flags); 4631 } 4632 } 4633 /* When a reshape changes the number of devices, 4634 * ->degraded is measured against the larger of the 4635 * pre and post numbers. 
4636 */ 4637 spin_lock_irq(&conf->device_lock); 4638 mddev->degraded = calc_degraded(conf); 4639 spin_unlock_irq(&conf->device_lock); 4640 mddev->raid_disks = conf->geo.raid_disks; 4641 mddev->reshape_position = conf->reshape_progress; 4642 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4643 4644 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4645 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4646 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4647 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4648 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4649 4650 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4651 "reshape"); 4652 if (!mddev->sync_thread) { 4653 ret = -EAGAIN; 4654 goto abort; 4655 } 4656 conf->reshape_checkpoint = jiffies; 4657 md_wakeup_thread(mddev->sync_thread); 4658 md_new_event(); 4659 return 0; 4660 4661 abort: 4662 mddev->recovery = 0; 4663 spin_lock_irq(&conf->device_lock); 4664 conf->geo = conf->prev; 4665 mddev->raid_disks = conf->geo.raid_disks; 4666 rdev_for_each(rdev, mddev) 4667 rdev->new_data_offset = rdev->data_offset; 4668 smp_wmb(); 4669 conf->reshape_progress = MaxSector; 4670 conf->reshape_safe = MaxSector; 4671 mddev->reshape_position = MaxSector; 4672 spin_unlock_irq(&conf->device_lock); 4673 return ret; 4674 } 4675 4676 /* Calculate the last device-address that could contain 4677 * any block from the chunk that includes the array-address 's' 4678 * and report the next address. 4679 * i.e. the address returned will be chunk-aligned and after 4680 * any data that is in the chunk containing 's'. 
4681 */ 4682 static sector_t last_dev_address(sector_t s, struct geom *geo) 4683 { 4684 s = (s | geo->chunk_mask) + 1; 4685 s >>= geo->chunk_shift; 4686 s *= geo->near_copies; 4687 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4688 s *= geo->far_copies; 4689 s <<= geo->chunk_shift; 4690 return s; 4691 } 4692 4693 /* Calculate the first device-address that could contain 4694 * any block from the chunk that includes the array-address 's'. 4695 * This too will be the start of a chunk 4696 */ 4697 static sector_t first_dev_address(sector_t s, struct geom *geo) 4698 { 4699 s >>= geo->chunk_shift; 4700 s *= geo->near_copies; 4701 sector_div(s, geo->raid_disks); 4702 s *= geo->far_copies; 4703 s <<= geo->chunk_shift; 4704 return s; 4705 } 4706 4707 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4708 int *skipped) 4709 { 4710 /* We simply copy at most one chunk (smallest of old and new) 4711 * at a time, possibly less if that exceeds RESYNC_PAGES, 4712 * or we hit a bad block or something. 4713 * This might mean we pause for normal IO in the middle of 4714 * a chunk, but that is not a problem as mddev->reshape_position 4715 * can record any location. 4716 * 4717 * If we will want to write to a location that isn't 4718 * yet recorded as 'safe' (i.e. in metadata on disk) then 4719 * we need to flush all reshape requests and update the metadata. 4720 * 4721 * When reshaping forwards (e.g. to more devices), we interpret 4722 * 'safe' as the earliest block which might not have been copied 4723 * down yet. We divide this by previous stripe size and multiply 4724 * by previous stripe length to get lowest device offset that we 4725 * cannot write to yet. 4726 * We interpret 'sector_nr' as an address that we want to write to. 4727 * From this we use last_device_address() to find where we might 4728 * write to, and first_device_address on the 'safe' position. 
4729 * If this 'next' write position is after the 'safe' position, 4730 * we must update the metadata to increase the 'safe' position. 4731 * 4732 * When reshaping backwards, we round in the opposite direction 4733 * and perform the reverse test: next write position must not be 4734 * less than current safe position. 4735 * 4736 * In all this the minimum difference in data offsets 4737 * (conf->offset_diff - always positive) allows a bit of slack, 4738 * so next can be after 'safe', but not by more than offset_diff 4739 * 4740 * We need to prepare all the bios here before we start any IO 4741 * to ensure the size we choose is acceptable to all devices. 4742 * The means one for each copy for write-out and an extra one for 4743 * read-in. 4744 * We store the read-in bio in ->master_bio and the others in 4745 * ->devs[x].bio and ->devs[x].repl_bio. 4746 */ 4747 struct r10conf *conf = mddev->private; 4748 struct r10bio *r10_bio; 4749 sector_t next, safe, last; 4750 int max_sectors; 4751 int nr_sectors; 4752 int s; 4753 struct md_rdev *rdev; 4754 int need_flush = 0; 4755 struct bio *blist; 4756 struct bio *bio, *read_bio; 4757 int sectors_done = 0; 4758 struct page **pages; 4759 4760 if (sector_nr == 0) { 4761 /* If restarting in the middle, skip the initial sectors */ 4762 if (mddev->reshape_backwards && 4763 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4764 sector_nr = (raid10_size(mddev, 0, 0) 4765 - conf->reshape_progress); 4766 } else if (!mddev->reshape_backwards && 4767 conf->reshape_progress > 0) 4768 sector_nr = conf->reshape_progress; 4769 if (sector_nr) { 4770 mddev->curr_resync_completed = sector_nr; 4771 sysfs_notify_dirent_safe(mddev->sysfs_completed); 4772 *skipped = 1; 4773 return sector_nr; 4774 } 4775 } 4776 4777 /* We don't use sector_nr to track where we are up to 4778 * as that doesn't work well for ->reshape_backwards. 4779 * So just use ->reshape_progress. 
4780 */ 4781 if (mddev->reshape_backwards) { 4782 /* 'next' is the earliest device address that we might 4783 * write to for this chunk in the new layout 4784 */ 4785 next = first_dev_address(conf->reshape_progress - 1, 4786 &conf->geo); 4787 4788 /* 'safe' is the last device address that we might read from 4789 * in the old layout after a restart 4790 */ 4791 safe = last_dev_address(conf->reshape_safe - 1, 4792 &conf->prev); 4793 4794 if (next + conf->offset_diff < safe) 4795 need_flush = 1; 4796 4797 last = conf->reshape_progress - 1; 4798 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4799 & conf->prev.chunk_mask); 4800 if (sector_nr + RESYNC_SECTORS < last) 4801 sector_nr = last + 1 - RESYNC_SECTORS; 4802 } else { 4803 /* 'next' is after the last device address that we 4804 * might write to for this chunk in the new layout 4805 */ 4806 next = last_dev_address(conf->reshape_progress, &conf->geo); 4807 4808 /* 'safe' is the earliest device address that we might 4809 * read from in the old layout after a restart 4810 */ 4811 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4812 4813 /* Need to update metadata if 'next' might be beyond 'safe' 4814 * as that would possibly corrupt data 4815 */ 4816 if (next > safe + conf->offset_diff) 4817 need_flush = 1; 4818 4819 sector_nr = conf->reshape_progress; 4820 last = sector_nr | (conf->geo.chunk_mask 4821 & conf->prev.chunk_mask); 4822 4823 if (sector_nr + RESYNC_SECTORS <= last) 4824 last = sector_nr + RESYNC_SECTORS - 1; 4825 } 4826 4827 if (need_flush || 4828 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4829 /* Need to update reshape_position in metadata */ 4830 wait_barrier(conf, false); 4831 mddev->reshape_position = conf->reshape_progress; 4832 if (mddev->reshape_backwards) 4833 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4834 - conf->reshape_progress; 4835 else 4836 mddev->curr_resync_completed = conf->reshape_progress; 4837 conf->reshape_checkpoint = jiffies; 4838 
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4839 md_wakeup_thread(mddev->thread); 4840 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 4841 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4842 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4843 allow_barrier(conf); 4844 return sectors_done; 4845 } 4846 conf->reshape_safe = mddev->reshape_position; 4847 allow_barrier(conf); 4848 } 4849 4850 raise_barrier(conf, 0); 4851 read_more: 4852 /* Now schedule reads for blocks from sector_nr to last */ 4853 r10_bio = raid10_alloc_init_r10buf(conf); 4854 r10_bio->state = 0; 4855 raise_barrier(conf, 1); 4856 atomic_set(&r10_bio->remaining, 0); 4857 r10_bio->mddev = mddev; 4858 r10_bio->sector = sector_nr; 4859 set_bit(R10BIO_IsReshape, &r10_bio->state); 4860 r10_bio->sectors = last - sector_nr + 1; 4861 rdev = read_balance(conf, r10_bio, &max_sectors); 4862 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4863 4864 if (!rdev) { 4865 /* Cannot read from here, so need to record bad blocks 4866 * on all the target devices. 4867 */ 4868 // FIXME 4869 mempool_free(r10_bio, &conf->r10buf_pool); 4870 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4871 return sectors_done; 4872 } 4873 4874 read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, 4875 GFP_KERNEL, &mddev->bio_set); 4876 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4877 + rdev->data_offset); 4878 read_bio->bi_private = r10_bio; 4879 read_bio->bi_end_io = end_reshape_read; 4880 r10_bio->master_bio = read_bio; 4881 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4882 4883 /* 4884 * Broadcast RESYNC message to other nodes, so all nodes would not 4885 * write to the region to avoid conflict. 
4886 */ 4887 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { 4888 struct mdp_superblock_1 *sb = NULL; 4889 int sb_reshape_pos = 0; 4890 4891 conf->cluster_sync_low = sector_nr; 4892 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; 4893 sb = page_address(rdev->sb_page); 4894 if (sb) { 4895 sb_reshape_pos = le64_to_cpu(sb->reshape_position); 4896 /* 4897 * Set cluster_sync_low again if next address for array 4898 * reshape is less than cluster_sync_low. Since we can't 4899 * update cluster_sync_low until it has finished reshape. 4900 */ 4901 if (sb_reshape_pos < conf->cluster_sync_low) 4902 conf->cluster_sync_low = sb_reshape_pos; 4903 } 4904 4905 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, 4906 conf->cluster_sync_high); 4907 } 4908 4909 /* Now find the locations in the new layout */ 4910 __raid10_find_phys(&conf->geo, r10_bio); 4911 4912 blist = read_bio; 4913 read_bio->bi_next = NULL; 4914 4915 rcu_read_lock(); 4916 for (s = 0; s < conf->copies*2; s++) { 4917 struct bio *b; 4918 int d = r10_bio->devs[s/2].devnum; 4919 struct md_rdev *rdev2; 4920 if (s&1) { 4921 rdev2 = rcu_dereference(conf->mirrors[d].replacement); 4922 b = r10_bio->devs[s/2].repl_bio; 4923 } else { 4924 rdev2 = rcu_dereference(conf->mirrors[d].rdev); 4925 b = r10_bio->devs[s/2].bio; 4926 } 4927 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4928 continue; 4929 4930 bio_set_dev(b, rdev2->bdev); 4931 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + 4932 rdev2->new_data_offset; 4933 b->bi_end_io = end_reshape_write; 4934 bio_set_op_attrs(b, REQ_OP_WRITE, 0); 4935 b->bi_next = blist; 4936 blist = b; 4937 } 4938 4939 /* Now add as many pages as possible to all of these bios. 
*/ 4940 4941 nr_sectors = 0; 4942 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 4943 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4944 struct page *page = pages[s / (PAGE_SIZE >> 9)]; 4945 int len = (max_sectors - s) << 9; 4946 if (len > PAGE_SIZE) 4947 len = PAGE_SIZE; 4948 for (bio = blist; bio ; bio = bio->bi_next) { 4949 /* 4950 * won't fail because the vec table is big enough 4951 * to hold all these pages 4952 */ 4953 bio_add_page(bio, page, len, 0); 4954 } 4955 sector_nr += len >> 9; 4956 nr_sectors += len >> 9; 4957 } 4958 rcu_read_unlock(); 4959 r10_bio->sectors = nr_sectors; 4960 4961 /* Now submit the read */ 4962 md_sync_acct_bio(read_bio, r10_bio->sectors); 4963 atomic_inc(&r10_bio->remaining); 4964 read_bio->bi_next = NULL; 4965 submit_bio_noacct(read_bio); 4966 sectors_done += nr_sectors; 4967 if (sector_nr <= last) 4968 goto read_more; 4969 4970 lower_barrier(conf); 4971 4972 /* Now that we have done the whole section we can 4973 * update reshape_progress 4974 */ 4975 if (mddev->reshape_backwards) 4976 conf->reshape_progress -= sectors_done; 4977 else 4978 conf->reshape_progress += sectors_done; 4979 4980 return sectors_done; 4981 } 4982 4983 static void end_reshape_request(struct r10bio *r10_bio); 4984 static int handle_reshape_read_error(struct mddev *mddev, 4985 struct r10bio *r10_bio); 4986 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4987 { 4988 /* Reshape read completed. Hopefully we have a block 4989 * to write out. 4990 * If we got a read error then we do sync 1-page reads from 4991 * elsewhere until we find the data - or give up. 4992 */ 4993 struct r10conf *conf = mddev->private; 4994 int s; 4995 4996 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4997 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4998 /* Reshape has been aborted */ 4999 md_done_sync(mddev, r10_bio->sectors, 0); 5000 return; 5001 } 5002 5003 /* We definitely have the data in the pages, schedule the 5004 * writes. 
5005 */ 5006 atomic_set(&r10_bio->remaining, 1); 5007 for (s = 0; s < conf->copies*2; s++) { 5008 struct bio *b; 5009 int d = r10_bio->devs[s/2].devnum; 5010 struct md_rdev *rdev; 5011 rcu_read_lock(); 5012 if (s&1) { 5013 rdev = rcu_dereference(conf->mirrors[d].replacement); 5014 b = r10_bio->devs[s/2].repl_bio; 5015 } else { 5016 rdev = rcu_dereference(conf->mirrors[d].rdev); 5017 b = r10_bio->devs[s/2].bio; 5018 } 5019 if (!rdev || test_bit(Faulty, &rdev->flags)) { 5020 rcu_read_unlock(); 5021 continue; 5022 } 5023 atomic_inc(&rdev->nr_pending); 5024 rcu_read_unlock(); 5025 md_sync_acct_bio(b, r10_bio->sectors); 5026 atomic_inc(&r10_bio->remaining); 5027 b->bi_next = NULL; 5028 submit_bio_noacct(b); 5029 } 5030 end_reshape_request(r10_bio); 5031 } 5032 5033 static void end_reshape(struct r10conf *conf) 5034 { 5035 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 5036 return; 5037 5038 spin_lock_irq(&conf->device_lock); 5039 conf->prev = conf->geo; 5040 md_finish_reshape(conf->mddev); 5041 smp_wmb(); 5042 conf->reshape_progress = MaxSector; 5043 conf->reshape_safe = MaxSector; 5044 spin_unlock_irq(&conf->device_lock); 5045 5046 if (conf->mddev->queue) 5047 raid10_set_io_opt(conf); 5048 conf->fullsync = 0; 5049 } 5050 5051 static void raid10_update_reshape_pos(struct mddev *mddev) 5052 { 5053 struct r10conf *conf = mddev->private; 5054 sector_t lo, hi; 5055 5056 md_cluster_ops->resync_info_get(mddev, &lo, &hi); 5057 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) 5058 || mddev->reshape_position == MaxSector) 5059 conf->reshape_progress = mddev->reshape_position; 5060 else 5061 WARN_ON_ONCE(1); 5062 } 5063 5064 static int handle_reshape_read_error(struct mddev *mddev, 5065 struct r10bio *r10_bio) 5066 { 5067 /* Use sync reads to get the blocks from somewhere else */ 5068 int sectors = r10_bio->sectors; 5069 struct r10conf *conf = mddev->private; 5070 struct r10bio *r10b; 5071 int slot = 0; 5072 int idx = 0; 5073 struct page **pages; 
5074 5075 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); 5076 if (!r10b) { 5077 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5078 return -ENOMEM; 5079 } 5080 5081 /* reshape IOs share pages from .devs[0].bio */ 5082 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 5083 5084 r10b->sector = r10_bio->sector; 5085 __raid10_find_phys(&conf->prev, r10b); 5086 5087 while (sectors) { 5088 int s = sectors; 5089 int success = 0; 5090 int first_slot = slot; 5091 5092 if (s > (PAGE_SIZE >> 9)) 5093 s = PAGE_SIZE >> 9; 5094 5095 rcu_read_lock(); 5096 while (!success) { 5097 int d = r10b->devs[slot].devnum; 5098 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5099 sector_t addr; 5100 if (rdev == NULL || 5101 test_bit(Faulty, &rdev->flags) || 5102 !test_bit(In_sync, &rdev->flags)) 5103 goto failed; 5104 5105 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 5106 atomic_inc(&rdev->nr_pending); 5107 rcu_read_unlock(); 5108 success = sync_page_io(rdev, 5109 addr, 5110 s << 9, 5111 pages[idx], 5112 REQ_OP_READ, false); 5113 rdev_dec_pending(rdev, mddev); 5114 rcu_read_lock(); 5115 if (success) 5116 break; 5117 failed: 5118 slot++; 5119 if (slot >= conf->copies) 5120 slot = 0; 5121 if (slot == first_slot) 5122 break; 5123 } 5124 rcu_read_unlock(); 5125 if (!success) { 5126 /* couldn't read this block, must give up */ 5127 set_bit(MD_RECOVERY_INTR, 5128 &mddev->recovery); 5129 kfree(r10b); 5130 return -EIO; 5131 } 5132 sectors -= s; 5133 idx++; 5134 } 5135 kfree(r10b); 5136 return 0; 5137 } 5138 5139 static void end_reshape_write(struct bio *bio) 5140 { 5141 struct r10bio *r10_bio = get_resync_r10bio(bio); 5142 struct mddev *mddev = r10_bio->mddev; 5143 struct r10conf *conf = mddev->private; 5144 int d; 5145 int slot; 5146 int repl; 5147 struct md_rdev *rdev = NULL; 5148 5149 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 5150 if (repl) 5151 rdev = conf->mirrors[d].replacement; 5152 if (!rdev) { 5153 smp_mb(); 5154 rdev = 
conf->mirrors[d].rdev; 5155 } 5156 5157 if (bio->bi_status) { 5158 /* FIXME should record badblock */ 5159 md_error(mddev, rdev); 5160 } 5161 5162 rdev_dec_pending(rdev, mddev); 5163 end_reshape_request(r10_bio); 5164 } 5165 5166 static void end_reshape_request(struct r10bio *r10_bio) 5167 { 5168 if (!atomic_dec_and_test(&r10_bio->remaining)) 5169 return; 5170 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 5171 bio_put(r10_bio->master_bio); 5172 put_buf(r10_bio); 5173 } 5174 5175 static void raid10_finish_reshape(struct mddev *mddev) 5176 { 5177 struct r10conf *conf = mddev->private; 5178 5179 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5180 return; 5181 5182 if (mddev->delta_disks > 0) { 5183 if (mddev->recovery_cp > mddev->resync_max_sectors) { 5184 mddev->recovery_cp = mddev->resync_max_sectors; 5185 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5186 } 5187 mddev->resync_max_sectors = mddev->array_sectors; 5188 } else { 5189 int d; 5190 rcu_read_lock(); 5191 for (d = conf->geo.raid_disks ; 5192 d < conf->geo.raid_disks - mddev->delta_disks; 5193 d++) { 5194 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5195 if (rdev) 5196 clear_bit(In_sync, &rdev->flags); 5197 rdev = rcu_dereference(conf->mirrors[d].replacement); 5198 if (rdev) 5199 clear_bit(In_sync, &rdev->flags); 5200 } 5201 rcu_read_unlock(); 5202 } 5203 mddev->layout = mddev->new_layout; 5204 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 5205 mddev->reshape_position = MaxSector; 5206 mddev->delta_disks = 0; 5207 mddev->reshape_backwards = 0; 5208 } 5209 5210 static struct md_personality raid10_personality = 5211 { 5212 .name = "raid10", 5213 .level = 10, 5214 .owner = THIS_MODULE, 5215 .make_request = raid10_make_request, 5216 .run = raid10_run, 5217 .free = raid10_free, 5218 .status = raid10_status, 5219 .error_handler = raid10_error, 5220 .hot_add_disk = raid10_add_disk, 5221 .hot_remove_disk= raid10_remove_disk, 5222 .spare_active = raid10_spare_active, 5223 .sync_request 
= raid10_sync_request, 5224 .quiesce = raid10_quiesce, 5225 .size = raid10_size, 5226 .resize = raid10_resize, 5227 .takeover = raid10_takeover, 5228 .check_reshape = raid10_check_reshape, 5229 .start_reshape = raid10_start_reshape, 5230 .finish_reshape = raid10_finish_reshape, 5231 .update_reshape_pos = raid10_update_reshape_pos, 5232 }; 5233 5234 static int __init raid_init(void) 5235 { 5236 return register_md_personality(&raid10_personality); 5237 } 5238 5239 static void raid_exit(void) 5240 { 5241 unregister_md_personality(&raid10_personality); 5242 } 5243 5244 module_init(raid_init); 5245 module_exit(raid_exit); 5246 MODULE_LICENSE("GPL"); 5247 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); 5248 MODULE_ALIAS("md-personality-9"); /* RAID10 */ 5249 MODULE_ALIAS("md-raid10"); 5250 MODULE_ALIAS("md-level-10"); 5251