// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout )
 *    use_far_sets (stored in bit 17 of layout )
 *    use_far_sets_bugfixed (stored in bit 18 of layout )
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.   In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and their
 * product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */

/* Forward declarations for functions defined later in this file. */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

/*
 * Emit a block-trace message prefixed with "raid10 " on md's queue.
 * Nothing is emitted when (md)->queue is NULL.
 */
#define raid10_log(md, fmt, args...)				\
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"

/*
 * Helpers for waiting on conf->wait_barrier.
 *
 * cmd_before() releases the resync_lock write seqlock and then runs 'cmd';
 * cmd_after() takes the lock again.  They are handed to wait_event_cmd() as
 * the commands to run around each sleep, so the wait_event_barrier*()
 * macros are meant to be used with resync_lock write-held.
 * NULL_CMD is the empty command used when nothing extra has to run.
 */
#define NULL_CMD
#define cmd_before(conf, cmd) \
	do { \
		write_sequnlock_irq(&(conf)->resync_lock); \
		cmd; \
	} while (0)
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)

#define wait_event_barrier_cmd(conf, cond, cmd) \
	wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \
		       cmd_after(conf))

#define wait_event_barrier(conf, cond) \
	wait_event_barrier_cmd(conf, cond, NULL_CMD)

/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
	return get_resync_pages(bio)->raid_bio;
}

/*
 * Allocate a zeroed r10bio.  @data is the r10conf; the object is sized so
 * that its trailing devs[] array has conf->geo.raid_disks entries.
 * Returns NULL if kzalloc() fails.
 * NOTE(review): the (gfp_t, void *) signature suggests this is the alloc
 * callback of conf->r10bio_pool; the registration is not visible here.
 */
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);

	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
	return kzalloc(size, gfp_flags);
}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 * @data is the r10conf.  Returns an r10bio whose first 'nalloc' devs[]
 * slots carry an initialised bio (and a repl_bio when
 * conf->have_replacement is set), each with bi_private pointing at its
 * entry in one shared resync_pages array.  Returns NULL on any allocation
 * failure, after releasing whatever had been allocated so far.
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10_bio;
	struct bio *bio;
	int j;
	int nalloc, nalloc_rp;
	struct resync_pages *rps;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/* allocate once for all bios */
	if (!conf->have_replacement)
		nalloc_rp = nalloc;
	else
		nalloc_rp = nalloc * 2;
	rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
	if (!rps)
		goto out_free_r10bio;

	/*
	 * Allocate bios.
	 *
	 * j counts down, so when an allocation fails at index j only the
	 * slots j..nalloc-1 can hold bios; out_free_bio walks upwards from
	 * j and relies on the untouched slots still being NULL from kzalloc.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		struct resync_pages *rp, *rp_repl;

		rp = &rps[j];
		/* replacement entries live in the second half of rps[] */
		if (rbio)
			rp_repl = &rps[nalloc + j];

		bio = r10_bio->devs[j].bio;

		/*
		 * Slot 0 always gets its own pages, and so does every slot
		 * when MD_RECOVERY_SYNC is set.  Otherwise the slot copies
		 * slot 0's page set (resync_get_all_pages() presumably takes
		 * an extra reference on each page; helper not visible here).
		 */
		if (!j || test_bit(MD_RECOVERY_SYNC,
				   &conf->mddev->recovery)) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r10_bio;
		bio->bi_private = rp;
		if (rbio) {
			/* the replacement bio uses a copy of the same entry */
			memcpy(rp_repl, rp, sizeof(*rp));
			rbio->bi_private = rp_repl;
		}
	}

	return r10_bio;

out_free_pages:
	/* release the page sets of the slots completed before the failure */
	while (--j >= 0)
		resync_free_pages(&rps[j]);

	/* every bio was allocated by now, so free all of them */
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_uninit(r10_bio->devs[j].bio);
		kfree(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_uninit(r10_bio->devs[j].repl_bio);
		kfree(r10_bio->devs[j].repl_bio);
	}
	kfree(rps);
out_free_r10bio:
	rbio_pool_free(r10_bio, conf);
	return NULL;
}

/*
 * Release an r10bio built by r10buf_pool_alloc().  @data is the r10conf.
 * For each of the conf->copies slots: free the page set attached to the
 * primary bio, then uninit and free the primary and replacement bios.
 * Finally free the shared resync_pages array and the r10bio itself.
 */
static void r10buf_pool_free(void *__r10_bio, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;
	struct resync_pages *rp = NULL;

	/* counts down, so the last 'rp' seen belongs to the lowest slot */
	for (j = conf->copies; j--; ) {
		struct bio *bio = r10bio->devs[j].bio;

		if (bio) {
			rp = get_resync_pages(bio);
			resync_free_pages(rp);
			bio_uninit(bio);
			kfree(bio);
		}

		bio = r10bio->devs[j].repl_bio;
		if (bio) {
			bio_uninit(bio);
			kfree(bio);
		}
	}

	/* resync pages array stored in the 1st bio's .bi_private */
	kfree(rp);

	rbio_pool_free(r10bio, conf);
}

/*
 * Drop the per-device bios of @r10_bio and clear the slots.
 * Primary bios are put unless BIO_SPECIAL() flags the pointer as a marker
 * value.  Replacement bios are additionally only put when read_slot < 0.
 * Both pointers of every slot end up NULL.
 */
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->geo.raid_disks; i++) {
		struct bio **bio = & r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

/* Put all bios of @r10_bio and return it to conf->r10bio_pool. */
static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, &conf->r10bio_pool);
}

/* Return @r10_bio to conf->r10buf_pool, then lower the barrier once. */
static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, &conf->r10buf_pool);

	lower_barrier(conf);
}

/* Wake conf->wait_barrier, but only if wq_has_sleeper() reports a waiter. */
static void wake_up_barrier(struct r10conf *conf)
{
	if (wq_has_sleeper(&conf->wait_barrier))
		wake_up(&conf->wait_barrier);
}

/*
 * Queue @r10_bio on conf->retry_list (counted in conf->nr_queued, both
 * under device_lock), then wake barrier waiters and the md thread.
 */
static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued ++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up frozen array... */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	struct r10conf *conf = r10_bio->mddev->private;

	/* no copy reported success: fail the master bio */
	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		bio->bi_status = BLK_STS_IOERR;

	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
		bio_end_io_acct(bio, r10_bio->start_time);
	bio_endio(bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 * The position recorded for the slot's device is the slot's start address
 * plus the request length.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 *
 * Scans the first conf->geo.raid_disks slots of @r10_bio for one whose bio
 * or repl_bio is @bio, updates that device's head position and returns its
 * devnum.  If non-NULL, *slotp receives the slot index and *replp is 1
 * when the match was the replacement bio, 0 otherwise.
 * NOTE(review): when nothing matches, 'slot' leaves the loop equal to
 * raid_disks and is still used as an index; callers presumably guarantee
 * that @bio belongs to @r10_bio -- confirm.
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}

/*
 * Completion handler for a read.  bio->bi_private holds the r10bio; the
 * device that was read is the one recorded in r10_bio->read_slot.
 * On success, or when _enough() says the device must not be failed, the
 * master bio is completed and the rdev reference dropped.  Otherwise the
 * r10bio is flagged R10BIO_ReadError and queued for retry, keeping the
 * rdev reference.
 */
static void raid10_end_read_request(struct bio *bio)
{
	int uptodate = !bio->bi_status;
	struct r10bio *r10_bio = bio->bi_private;
	int slot;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry"
		 */
		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
			     rdev->raid_disk))
			uptodate = 1;
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   rdev->bdev,
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

/*
 * Finish the bookkeeping of a write: end the bitmap write for the
 * request's range and call md_write_end().
 */
static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			   r10_bio->sectors,
			   !test_bit(R10BIO_Degraded, &r10_bio->state),
			   0);
	md_write_end(r10_bio->mddev);
}

/*
 * Account for one finished per-device write.  When r10_bio->remaining
 * reaches zero the request is either queued for retry (R10BIO_WriteError,
 * or R10BIO_MadeGood after close_write()) or completed towards the caller.
 */
static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

/*
 * Completion handler for one per-device write (or discard).
 * bio->bi_private holds the r10bio.  Records the outcome in the r10bio
 * state bits and rdev flags, then calls one_write_done().  The rdev
 * reference is dropped here unless 'dec_rdev' was cleared because the bio
 * is left for later handling (write error on a non-Faulty device, or a
 * write that covered known bad blocks).
 */
static void raid10_end_write_request(struct bio *bio)
{
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	struct bio *to_put = NULL;
	bool discard_error;

	/* a failed discard is not treated as a write error below */
	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		/* no replacement present: account against the primary rdev */
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !discard_error) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen,	&rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_opf & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
			}

			/*
			 * When the device is faulty, it is not necessary to
			 * handle write error.
			 */
			if (!test_bit(Faulty, &rdev->flags))
				set_bit(R10BIO_WriteError, &r10_bio->state);
			else {
				/* Fail the request */
				set_bit(R10BIO_Degraded, &r10_bio->state);
				r10_bio->devs[slot].bio = NULL;
				/* put only after one_write_done() has run */
				to_put = bio;
				dec_rdev = 1;
			}
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have fallen
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here).
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors) && !discard_error) {
			bio_put(bio);
			/* leave a marker in the slot in place of the bio */
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
584 * 585 * raid10_find_virt does the reverse mapping, from a device and a 586 * sector offset to a virtual address 587 */ 588 589 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 590 { 591 int n,f; 592 sector_t sector; 593 sector_t chunk; 594 sector_t stripe; 595 int dev; 596 int slot = 0; 597 int last_far_set_start, last_far_set_size; 598 599 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 600 last_far_set_start *= geo->far_set_size; 601 602 last_far_set_size = geo->far_set_size; 603 last_far_set_size += (geo->raid_disks % geo->far_set_size); 604 605 /* now calculate first sector/dev */ 606 chunk = r10bio->sector >> geo->chunk_shift; 607 sector = r10bio->sector & geo->chunk_mask; 608 609 chunk *= geo->near_copies; 610 stripe = chunk; 611 dev = sector_div(stripe, geo->raid_disks); 612 if (geo->far_offset) 613 stripe *= geo->far_copies; 614 615 sector += stripe << geo->chunk_shift; 616 617 /* and calculate all the others */ 618 for (n = 0; n < geo->near_copies; n++) { 619 int d = dev; 620 int set; 621 sector_t s = sector; 622 r10bio->devs[slot].devnum = d; 623 r10bio->devs[slot].addr = s; 624 slot++; 625 626 for (f = 1; f < geo->far_copies; f++) { 627 set = d / geo->far_set_size; 628 d += geo->near_copies; 629 630 if ((geo->raid_disks % geo->far_set_size) && 631 (d > last_far_set_start)) { 632 d -= last_far_set_start; 633 d %= last_far_set_size; 634 d += last_far_set_start; 635 } else { 636 d %= geo->far_set_size; 637 d += geo->far_set_size * set; 638 } 639 s += geo->stride; 640 r10bio->devs[slot].devnum = d; 641 r10bio->devs[slot].addr = s; 642 slot++; 643 } 644 dev++; 645 if (dev >= geo->raid_disks) { 646 dev = 0; 647 sector += (geo->chunk_mask + 1); 648 } 649 } 650 } 651 652 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 653 { 654 struct geom *geo = &conf->geo; 655 656 if (conf->reshape_progress != MaxSector && 657 ((r10bio->sector >= conf->reshape_progress) != 658 conf->mddev->reshape_backwards)) 
{ 659 set_bit(R10BIO_Previous, &r10bio->state); 660 geo = &conf->prev; 661 } else 662 clear_bit(R10BIO_Previous, &r10bio->state); 663 664 __raid10_find_phys(geo, r10bio); 665 } 666 667 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 668 { 669 sector_t offset, chunk, vchunk; 670 /* Never use conf->prev as this is only called during resync 671 * or recovery, so reshape isn't happening 672 */ 673 struct geom *geo = &conf->geo; 674 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; 675 int far_set_size = geo->far_set_size; 676 int last_far_set_start; 677 678 if (geo->raid_disks % geo->far_set_size) { 679 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 680 last_far_set_start *= geo->far_set_size; 681 682 if (dev >= last_far_set_start) { 683 far_set_size = geo->far_set_size; 684 far_set_size += (geo->raid_disks % geo->far_set_size); 685 far_set_start = last_far_set_start; 686 } 687 } 688 689 offset = sector & geo->chunk_mask; 690 if (geo->far_offset) { 691 int fc; 692 chunk = sector >> geo->chunk_shift; 693 fc = sector_div(chunk, geo->far_copies); 694 dev -= fc * geo->near_copies; 695 if (dev < far_set_start) 696 dev += far_set_size; 697 } else { 698 while (sector >= geo->stride) { 699 sector -= geo->stride; 700 if (dev < (geo->near_copies + far_set_start)) 701 dev += far_set_size - geo->near_copies; 702 else 703 dev -= geo->near_copies; 704 } 705 chunk = sector >> geo->chunk_shift; 706 } 707 vchunk = chunk * geo->raid_disks + dev; 708 sector_div(vchunk, geo->near_copies); 709 return (vchunk << geo->chunk_shift) + offset; 710 } 711 712 /* 713 * This routine returns the disk from which the requested read should 714 * be done. There is a per-array 'next expected sequential IO' sector 715 * number - if this matches on the next IO then we use the last disk. 
716 * There is also a per-disk 'last know head position' sector that is 717 * maintained from IRQ contexts, both the normal and the resync IO 718 * completion handlers update this position correctly. If there is no 719 * perfect sequential match then we pick the disk whose head is closest. 720 * 721 * If there are 2 mirrors in the same 2 devices, performance degrades 722 * because position is mirror, not device based. 723 * 724 * The rdev for the device selected will have nr_pending incremented. 725 */ 726 727 /* 728 * FIXME: possibly should rethink readbalancing and do it differently 729 * depending on near_copies / far_copies geometry. 730 */ 731 static struct md_rdev *read_balance(struct r10conf *conf, 732 struct r10bio *r10_bio, 733 int *max_sectors) 734 { 735 const sector_t this_sector = r10_bio->sector; 736 int disk, slot; 737 int sectors = r10_bio->sectors; 738 int best_good_sectors; 739 sector_t new_distance, best_dist; 740 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; 741 int do_balance; 742 int best_dist_slot, best_pending_slot; 743 bool has_nonrot_disk = false; 744 unsigned int min_pending; 745 struct geom *geo = &conf->geo; 746 747 raid10_find_phys(conf, r10_bio); 748 rcu_read_lock(); 749 best_dist_slot = -1; 750 min_pending = UINT_MAX; 751 best_dist_rdev = NULL; 752 best_pending_rdev = NULL; 753 best_dist = MaxSector; 754 best_good_sectors = 0; 755 do_balance = 1; 756 clear_bit(R10BIO_FailFast, &r10_bio->state); 757 /* 758 * Check if we can balance. We can balance on the whole 759 * device if no resync is going on (recovery is ok), or below 760 * the resync window. We take the first readable disk when 761 * above the resync window. 
762 */ 763 if ((conf->mddev->recovery_cp < MaxSector 764 && (this_sector + sectors >= conf->next_resync)) || 765 (mddev_is_clustered(conf->mddev) && 766 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 767 this_sector + sectors))) 768 do_balance = 0; 769 770 for (slot = 0; slot < conf->copies ; slot++) { 771 sector_t first_bad; 772 int bad_sectors; 773 sector_t dev_sector; 774 unsigned int pending; 775 bool nonrot; 776 777 if (r10_bio->devs[slot].bio == IO_BLOCKED) 778 continue; 779 disk = r10_bio->devs[slot].devnum; 780 rdev = rcu_dereference(conf->mirrors[disk].replacement); 781 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 782 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 783 rdev = rcu_dereference(conf->mirrors[disk].rdev); 784 if (rdev == NULL || 785 test_bit(Faulty, &rdev->flags)) 786 continue; 787 if (!test_bit(In_sync, &rdev->flags) && 788 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 789 continue; 790 791 dev_sector = r10_bio->devs[slot].addr; 792 if (is_badblock(rdev, dev_sector, sectors, 793 &first_bad, &bad_sectors)) { 794 if (best_dist < MaxSector) 795 /* Already have a better slot */ 796 continue; 797 if (first_bad <= dev_sector) { 798 /* Cannot read here. If this is the 799 * 'primary' device, then we must not read 800 * beyond 'bad_sectors' from another device. 
801 */ 802 bad_sectors -= (dev_sector - first_bad); 803 if (!do_balance && sectors > bad_sectors) 804 sectors = bad_sectors; 805 if (best_good_sectors > sectors) 806 best_good_sectors = sectors; 807 } else { 808 sector_t good_sectors = 809 first_bad - dev_sector; 810 if (good_sectors > best_good_sectors) { 811 best_good_sectors = good_sectors; 812 best_dist_slot = slot; 813 best_dist_rdev = rdev; 814 } 815 if (!do_balance) 816 /* Must read from here */ 817 break; 818 } 819 continue; 820 } else 821 best_good_sectors = sectors; 822 823 if (!do_balance) 824 break; 825 826 nonrot = bdev_nonrot(rdev->bdev); 827 has_nonrot_disk |= nonrot; 828 pending = atomic_read(&rdev->nr_pending); 829 if (min_pending > pending && nonrot) { 830 min_pending = pending; 831 best_pending_slot = slot; 832 best_pending_rdev = rdev; 833 } 834 835 if (best_dist_slot >= 0) 836 /* At least 2 disks to choose from so failfast is OK */ 837 set_bit(R10BIO_FailFast, &r10_bio->state); 838 /* This optimisation is debatable, and completely destroys 839 * sequential read speed for 'far copies' arrays. So only 840 * keep it for 'near' arrays, and review those later. 
841 */ 842 if (geo->near_copies > 1 && !pending) 843 new_distance = 0; 844 845 /* for far > 1 always use the lowest address */ 846 else if (geo->far_copies > 1) 847 new_distance = r10_bio->devs[slot].addr; 848 else 849 new_distance = abs(r10_bio->devs[slot].addr - 850 conf->mirrors[disk].head_position); 851 852 if (new_distance < best_dist) { 853 best_dist = new_distance; 854 best_dist_slot = slot; 855 best_dist_rdev = rdev; 856 } 857 } 858 if (slot >= conf->copies) { 859 if (has_nonrot_disk) { 860 slot = best_pending_slot; 861 rdev = best_pending_rdev; 862 } else { 863 slot = best_dist_slot; 864 rdev = best_dist_rdev; 865 } 866 } 867 868 if (slot >= 0) { 869 atomic_inc(&rdev->nr_pending); 870 r10_bio->read_slot = slot; 871 } else 872 rdev = NULL; 873 rcu_read_unlock(); 874 *max_sectors = best_good_sectors; 875 876 return rdev; 877 } 878 879 static void flush_pending_writes(struct r10conf *conf) 880 { 881 /* Any writes that have been queued but are awaiting 882 * bitmap updates get flushed here. 883 */ 884 spin_lock_irq(&conf->device_lock); 885 886 if (conf->pending_bio_list.head) { 887 struct blk_plug plug; 888 struct bio *bio; 889 890 bio = bio_list_get(&conf->pending_bio_list); 891 spin_unlock_irq(&conf->device_lock); 892 893 /* 894 * As this is called in a wait_event() loop (see freeze_array), 895 * current->state might be TASK_UNINTERRUPTIBLE which will 896 * cause a warning when we prepare to wait again. As it is 897 * rare that this path is taken, it is perfectly safe to force 898 * us to go around the wait_event() loop again, so the warning 899 * is a false-positive. 
Silence the warning by resetting 900 * thread state 901 */ 902 __set_current_state(TASK_RUNNING); 903 904 blk_start_plug(&plug); 905 /* flush any pending bitmap writes to disk 906 * before proceeding w/ I/O */ 907 md_bitmap_unplug(conf->mddev->bitmap); 908 wake_up(&conf->wait_barrier); 909 910 while (bio) { /* submit pending writes */ 911 struct bio *next = bio->bi_next; 912 struct md_rdev *rdev = (void*)bio->bi_bdev; 913 bio->bi_next = NULL; 914 bio_set_dev(bio, rdev->bdev); 915 if (test_bit(Faulty, &rdev->flags)) { 916 bio_io_error(bio); 917 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 918 !bdev_max_discard_sectors(bio->bi_bdev))) 919 /* Just ignore it */ 920 bio_endio(bio); 921 else 922 submit_bio_noacct(bio); 923 bio = next; 924 } 925 blk_finish_plug(&plug); 926 } else 927 spin_unlock_irq(&conf->device_lock); 928 } 929 930 /* Barriers.... 931 * Sometimes we need to suspend IO while we do something else, 932 * either some resync/recovery, or reconfigure the array. 933 * To do this we raise a 'barrier'. 934 * The 'barrier' is a counter that can be raised multiple times 935 * to count how many activities are happening which preclude 936 * normal IO. 937 * We can only raise the barrier if there is no pending IO. 938 * i.e. if nr_pending == 0. 939 * We choose only to raise the barrier if no-one is waiting for the 940 * barrier to go down. This means that as soon as an IO request 941 * is ready, no other operations which require a barrier will start 942 * until the IO request has had a chance. 943 * 944 * So: regular IO calls 'wait_barrier'. When that returns there 945 * is no backgroup IO happening, It must arrange to call 946 * allow_barrier when it has finished its IO. 947 * backgroup IO calls must call raise_barrier. Once that returns 948 * there is no normal IO happeing. It must arrange to call 949 * lower_barrier when the particular background IO completes. 
 */

/*
 * Raise the barrier by one.  Unless @force is set, first waits until no
 * regular IO is waiting (nr_waiting == 0).  After incrementing the barrier
 * it waits until nr_pending is zero and the barrier count is below
 * RESYNC_DEPTH.  @force requires the barrier to be raised already
 * (BUG_ON otherwise).
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	write_seqlock_irq(&conf->resync_lock);
	BUG_ON(force && !conf->barrier);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_barrier(conf, force || !conf->nr_waiting);

	/* block any new IO from starting */
	WRITE_ONCE(conf->barrier, conf->barrier + 1);

	/* Now wait for all pending IO to complete */
	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
				 conf->barrier < RESYNC_DEPTH);

	write_sequnlock_irq(&conf->resync_lock);
}

/* Lower the barrier by one under resync_lock and wake all barrier waiters. */
static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;

	write_seqlock_irqsave(&conf->resync_lock, flags);
	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	write_sequnlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

/*
 * Wait condition used by wait_barrier(): returns true when the caller may
 * stop waiting for the barrier.
 */
static bool stop_waiting_barrier(struct r10conf *conf)
{
	struct bio_list *bio_list = current->bio_list;

	/* barrier is dropped */
	if (!conf->barrier)
		return true;

	/*
	 * If there are already pending requests (preventing the barrier from
	 * rising completely), and the pre-process bio queue isn't empty, then
	 * don't wait, as we need to empty that queue to get the nr_pending
	 * count down.
	 */
	if (atomic_read(&conf->nr_pending) && bio_list &&
	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
		return true;

	/* move on if recovery thread is blocked by us */
	if (conf->mddev->thread->tsk == current &&
		test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
		conf->nr_queued > 0)
		return true;

	return false;
}

/*
 * Fast path of wait_barrier() that does not take resync_lock for writing.
 * Returns true with nr_pending incremented when no barrier was raised and
 * the seqlock read section saw no concurrent writer.  Otherwise returns
 * false with nr_pending unchanged on balance.
 */
static bool wait_barrier_nolock(struct r10conf *conf)
{
	unsigned int seq = read_seqbegin(&conf->resync_lock);

	if (READ_ONCE(conf->barrier))
		return false;

	atomic_inc(&conf->nr_pending);
	if (!read_seqretry(&conf->resync_lock, seq))
		return true;

	/* a writer raced with us: undo, waking waiters if we hit zero */
	if (atomic_dec_and_test(&conf->nr_pending))
		wake_up_barrier(conf);

	return false;
}

/*
 * Register a regular IO request.  Returns true with nr_pending incremented
 * once the request may proceed.  When a barrier is raised and @nowait is
 * set, returns false without waiting and without touching nr_pending.
 */
static bool wait_barrier(struct r10conf *conf, bool nowait)
{
	bool ret = true;

	if (wait_barrier_nolock(conf))
		return true;

	write_seqlock_irq(&conf->resync_lock);
	if (conf->barrier) {
		/* Return false when nowait flag is set */
		if (nowait) {
			ret = false;
		} else {
			conf->nr_waiting++;
			raid10_log(conf->mddev, "wait barrier");
			wait_event_barrier(conf, stop_waiting_barrier(conf));
			conf->nr_waiting--;
		}
		/* raise_barrier() sleeps until nr_waiting drops to zero */
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	/* Only increment nr_pending when we wait */
	if (ret)
		atomic_inc(&conf->nr_pending);
	write_sequnlock_irq(&conf->resync_lock);
	return ret;
}

/*
 * Counterpart of wait_barrier(): drop one nr_pending reference.  Barrier
 * waiters are woken when the count reaches zero, or on every call while an
 * array freeze is pending.
 */
static void allow_barrier(struct r10conf *conf)
{
	if ((atomic_dec_and_test(&conf->nr_pending)) ||
			(conf->array_freeze_pending))
		wake_up_barrier(conf);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+extra
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	write_seqlock_irq(&conf->resync_lock);
	conf->array_freeze_pending++;
	WRITE_ONCE(conf->barrier, conf->barrier + 1);
	conf->nr_waiting++;
	/* queued writes are flushed, with the lock dropped, before each sleep */
	wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) ==
			conf->nr_queued + extra, flush_pending_writes(conf));
	conf->array_freeze_pending--;
	write_sequnlock_irq(&conf->resync_lock);
}

/*
 * Undo freeze_array(): decrement barrier and nr_waiting and wake barrier
 * waiters.
 */
static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	write_seqlock_irq(&conf->resync_lock);
	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	write_sequnlock_irq(&conf->resync_lock);
}

/*
 * Pick the data offset to use on @rdev for @r10_bio: new_data_offset only
 * while MD_RECOVERY_RESHAPE is set and the request is not mapped with the
 * previous geometry (R10BIO_Previous clear); data_offset otherwise.
 */
static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

/*
 * Plug callback.  @cb is embedded in a struct raid1_plug_cb whose
 * 'pending' list holds the queued writes.  When called from schedule or
 * while current->bio_list is set, the writes are moved to
 * conf->pending_bio_list and the md thread is woken to submit them.
 * Otherwise they are submitted here.  The plug structure is freed in both
 * cases.
 */
static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule || current->bio_list) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly. */
	bio = bio_list_get(&plug->pending);
	md_bitmap_unplug(mddev->bitmap);
	wake_up(&conf->wait_barrier);

	while (bio) { /* submit pending writes */
		struct bio *next = bio->bi_next;
		/*
		 * bi_bdev carries a struct md_rdev pointer at this point
		 * (hence the cast); it is replaced by the real block device
		 * just below.  Presumably stashed there by the code that
		 * queued the bio -- not visible here.
		 */
		struct md_rdev *rdev = (void*)bio->bi_bdev;
		bio->bi_next = NULL;
		bio_set_dev(bio, rdev->bdev);
		if (test_bit(Faulty, &rdev->flags)) {
			bio_io_error(bio);
		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
				    !bdev_max_discard_sectors(bio->bi_bdev)))
			/* Just ignore it */
			bio_endio(bio);
		else
			submit_bio_noacct(bio);
		bio = next;
	}
	kfree(plug);
}

/*
 * 1. Register the new request and wait if the reconstruction thread has put
 * up a bar for new requests. Continue immediately if no resync is active
 * currently.
 * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
1148 */ 1149 static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, 1150 struct bio *bio, sector_t sectors) 1151 { 1152 /* Bail out if REQ_NOWAIT is set for the bio */ 1153 if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { 1154 bio_wouldblock_error(bio); 1155 return false; 1156 } 1157 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1158 bio->bi_iter.bi_sector < conf->reshape_progress && 1159 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { 1160 allow_barrier(conf); 1161 if (bio->bi_opf & REQ_NOWAIT) { 1162 bio_wouldblock_error(bio); 1163 return false; 1164 } 1165 raid10_log(conf->mddev, "wait reshape"); 1166 wait_event(conf->wait_barrier, 1167 conf->reshape_progress <= bio->bi_iter.bi_sector || 1168 conf->reshape_progress >= bio->bi_iter.bi_sector + 1169 sectors); 1170 wait_barrier(conf, false); 1171 } 1172 return true; 1173 } 1174 1175 static void raid10_read_request(struct mddev *mddev, struct bio *bio, 1176 struct r10bio *r10_bio) 1177 { 1178 struct r10conf *conf = mddev->private; 1179 struct bio *read_bio; 1180 const enum req_op op = bio_op(bio); 1181 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1182 int max_sectors; 1183 struct md_rdev *rdev; 1184 char b[BDEVNAME_SIZE]; 1185 int slot = r10_bio->read_slot; 1186 struct md_rdev *err_rdev = NULL; 1187 gfp_t gfp = GFP_NOIO; 1188 1189 if (slot >= 0 && r10_bio->devs[slot].rdev) { 1190 /* 1191 * This is an error retry, but we cannot 1192 * safely dereference the rdev in the r10_bio, 1193 * we must use the one in conf. 1194 * If it has already been disconnected (unlikely) 1195 * we lose the device name in error messages. 1196 */ 1197 int disk; 1198 /* 1199 * As we are blocking raid10, it is a little safer to 1200 * use __GFP_HIGH. 
1201 */ 1202 gfp = GFP_NOIO | __GFP_HIGH; 1203 1204 rcu_read_lock(); 1205 disk = r10_bio->devs[slot].devnum; 1206 err_rdev = rcu_dereference(conf->mirrors[disk].rdev); 1207 if (err_rdev) 1208 snprintf(b, sizeof(b), "%pg", err_rdev->bdev); 1209 else { 1210 strcpy(b, "???"); 1211 /* This never gets dereferenced */ 1212 err_rdev = r10_bio->devs[slot].rdev; 1213 } 1214 rcu_read_unlock(); 1215 } 1216 1217 if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) 1218 return; 1219 rdev = read_balance(conf, r10_bio, &max_sectors); 1220 if (!rdev) { 1221 if (err_rdev) { 1222 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", 1223 mdname(mddev), b, 1224 (unsigned long long)r10_bio->sector); 1225 } 1226 raid_end_bio_io(r10_bio); 1227 return; 1228 } 1229 if (err_rdev) 1230 pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", 1231 mdname(mddev), 1232 rdev->bdev, 1233 (unsigned long long)r10_bio->sector); 1234 if (max_sectors < bio_sectors(bio)) { 1235 struct bio *split = bio_split(bio, max_sectors, 1236 gfp, &conf->bio_split); 1237 bio_chain(split, bio); 1238 allow_barrier(conf); 1239 submit_bio_noacct(bio); 1240 wait_barrier(conf, false); 1241 bio = split; 1242 r10_bio->master_bio = bio; 1243 r10_bio->sectors = max_sectors; 1244 } 1245 slot = r10_bio->read_slot; 1246 1247 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1248 r10_bio->start_time = bio_start_io_acct(bio); 1249 read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 1250 1251 r10_bio->devs[slot].bio = read_bio; 1252 r10_bio->devs[slot].rdev = rdev; 1253 1254 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + 1255 choose_data_offset(r10_bio, rdev); 1256 read_bio->bi_end_io = raid10_end_read_request; 1257 bio_set_op_attrs(read_bio, op, do_sync); 1258 if (test_bit(FailFast, &rdev->flags) && 1259 test_bit(R10BIO_FailFast, &r10_bio->state)) 1260 read_bio->bi_opf |= MD_FAILFAST; 1261 read_bio->bi_private = r10_bio; 1262 1263 
if (mddev->gendisk) 1264 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1265 r10_bio->sector); 1266 submit_bio_noacct(read_bio); 1267 return; 1268 } 1269 1270 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, 1271 struct bio *bio, bool replacement, 1272 int n_copy) 1273 { 1274 const enum req_op op = bio_op(bio); 1275 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1276 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1277 unsigned long flags; 1278 struct blk_plug_cb *cb; 1279 struct raid1_plug_cb *plug = NULL; 1280 struct r10conf *conf = mddev->private; 1281 struct md_rdev *rdev; 1282 int devnum = r10_bio->devs[n_copy].devnum; 1283 struct bio *mbio; 1284 1285 if (replacement) { 1286 rdev = conf->mirrors[devnum].replacement; 1287 if (rdev == NULL) { 1288 /* Replacement just got moved to main 'rdev' */ 1289 smp_mb(); 1290 rdev = conf->mirrors[devnum].rdev; 1291 } 1292 } else 1293 rdev = conf->mirrors[devnum].rdev; 1294 1295 mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); 1296 if (replacement) 1297 r10_bio->devs[n_copy].repl_bio = mbio; 1298 else 1299 r10_bio->devs[n_copy].bio = mbio; 1300 1301 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1302 choose_data_offset(r10_bio, rdev)); 1303 mbio->bi_end_io = raid10_end_write_request; 1304 bio_set_op_attrs(mbio, op, do_sync | do_fua); 1305 if (!replacement && test_bit(FailFast, 1306 &conf->mirrors[devnum].rdev->flags) 1307 && enough(conf, devnum)) 1308 mbio->bi_opf |= MD_FAILFAST; 1309 mbio->bi_private = r10_bio; 1310 1311 if (conf->mddev->gendisk) 1312 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 1313 r10_bio->sector); 1314 /* flush_pending_writes() needs access to the rdev so...*/ 1315 mbio->bi_bdev = (void *)rdev; 1316 1317 atomic_inc(&r10_bio->remaining); 1318 1319 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1320 if (cb) 1321 plug = container_of(cb, struct raid1_plug_cb, cb); 1322 else 1323 plug = NULL; 1324 if (plug) 
{ 1325 bio_list_add(&plug->pending, mbio); 1326 } else { 1327 spin_lock_irqsave(&conf->device_lock, flags); 1328 bio_list_add(&conf->pending_bio_list, mbio); 1329 spin_unlock_irqrestore(&conf->device_lock, flags); 1330 md_wakeup_thread(mddev->thread); 1331 } 1332 } 1333 1334 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1335 { 1336 int i; 1337 struct r10conf *conf = mddev->private; 1338 struct md_rdev *blocked_rdev; 1339 1340 retry_wait: 1341 blocked_rdev = NULL; 1342 rcu_read_lock(); 1343 for (i = 0; i < conf->copies; i++) { 1344 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1345 struct md_rdev *rrdev = rcu_dereference( 1346 conf->mirrors[i].replacement); 1347 if (rdev == rrdev) 1348 rrdev = NULL; 1349 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1350 atomic_inc(&rdev->nr_pending); 1351 blocked_rdev = rdev; 1352 break; 1353 } 1354 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1355 atomic_inc(&rrdev->nr_pending); 1356 blocked_rdev = rrdev; 1357 break; 1358 } 1359 1360 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1361 sector_t first_bad; 1362 sector_t dev_sector = r10_bio->devs[i].addr; 1363 int bad_sectors; 1364 int is_bad; 1365 1366 /* 1367 * Discard request doesn't care the write result 1368 * so it doesn't need to wait blocked disk here. 
1369 */ 1370 if (!r10_bio->sectors) 1371 continue; 1372 1373 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 1374 &first_bad, &bad_sectors); 1375 if (is_bad < 0) { 1376 /* 1377 * Mustn't write here until the bad block 1378 * is acknowledged 1379 */ 1380 atomic_inc(&rdev->nr_pending); 1381 set_bit(BlockedBadBlocks, &rdev->flags); 1382 blocked_rdev = rdev; 1383 break; 1384 } 1385 } 1386 } 1387 rcu_read_unlock(); 1388 1389 if (unlikely(blocked_rdev)) { 1390 /* Have to wait for this device to get unblocked, then retry */ 1391 allow_barrier(conf); 1392 raid10_log(conf->mddev, "%s wait rdev %d blocked", 1393 __func__, blocked_rdev->raid_disk); 1394 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1395 wait_barrier(conf, false); 1396 goto retry_wait; 1397 } 1398 } 1399 1400 static void raid10_write_request(struct mddev *mddev, struct bio *bio, 1401 struct r10bio *r10_bio) 1402 { 1403 struct r10conf *conf = mddev->private; 1404 int i; 1405 sector_t sectors; 1406 int max_sectors; 1407 1408 if ((mddev_is_clustered(mddev) && 1409 md_cluster_ops->area_resyncing(mddev, WRITE, 1410 bio->bi_iter.bi_sector, 1411 bio_end_sector(bio)))) { 1412 DEFINE_WAIT(w); 1413 /* Bail out if REQ_NOWAIT is set for the bio */ 1414 if (bio->bi_opf & REQ_NOWAIT) { 1415 bio_wouldblock_error(bio); 1416 return; 1417 } 1418 for (;;) { 1419 prepare_to_wait(&conf->wait_barrier, 1420 &w, TASK_IDLE); 1421 if (!md_cluster_ops->area_resyncing(mddev, WRITE, 1422 bio->bi_iter.bi_sector, bio_end_sector(bio))) 1423 break; 1424 schedule(); 1425 } 1426 finish_wait(&conf->wait_barrier, &w); 1427 } 1428 1429 sectors = r10_bio->sectors; 1430 if (!regular_request_wait(mddev, conf, bio, sectors)) 1431 return; 1432 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1433 (mddev->reshape_backwards 1434 ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && 1435 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) 1436 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && 1437 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1438 /* Need to update reshape_position in metadata */ 1439 mddev->reshape_position = conf->reshape_progress; 1440 set_mask_bits(&mddev->sb_flags, 0, 1441 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 1442 md_wakeup_thread(mddev->thread); 1443 if (bio->bi_opf & REQ_NOWAIT) { 1444 allow_barrier(conf); 1445 bio_wouldblock_error(bio); 1446 return; 1447 } 1448 raid10_log(conf->mddev, "wait reshape metadata"); 1449 wait_event(mddev->sb_wait, 1450 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 1451 1452 conf->reshape_safe = mddev->reshape_position; 1453 } 1454 1455 /* first select target devices under rcu_lock and 1456 * inc refcount on their rdev. Record them by setting 1457 * bios[x] to bio 1458 * If there are known/acknowledged bad blocks on any device 1459 * on which we have seen a write error, we want to avoid 1460 * writing to those blocks. This potentially requires several 1461 * writes to write around the bad blocks. Each set of writes 1462 * gets its own r10_bio with a set of bios attached. 
1463 */ 1464 1465 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1466 raid10_find_phys(conf, r10_bio); 1467 1468 wait_blocked_dev(mddev, r10_bio); 1469 1470 rcu_read_lock(); 1471 max_sectors = r10_bio->sectors; 1472 1473 for (i = 0; i < conf->copies; i++) { 1474 int d = r10_bio->devs[i].devnum; 1475 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1476 struct md_rdev *rrdev = rcu_dereference( 1477 conf->mirrors[d].replacement); 1478 if (rdev == rrdev) 1479 rrdev = NULL; 1480 if (rdev && (test_bit(Faulty, &rdev->flags))) 1481 rdev = NULL; 1482 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1483 rrdev = NULL; 1484 1485 r10_bio->devs[i].bio = NULL; 1486 r10_bio->devs[i].repl_bio = NULL; 1487 1488 if (!rdev && !rrdev) { 1489 set_bit(R10BIO_Degraded, &r10_bio->state); 1490 continue; 1491 } 1492 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1493 sector_t first_bad; 1494 sector_t dev_sector = r10_bio->devs[i].addr; 1495 int bad_sectors; 1496 int is_bad; 1497 1498 is_bad = is_badblock(rdev, dev_sector, max_sectors, 1499 &first_bad, &bad_sectors); 1500 if (is_bad && first_bad <= dev_sector) { 1501 /* Cannot write here at all */ 1502 bad_sectors -= (dev_sector - first_bad); 1503 if (bad_sectors < max_sectors) 1504 /* Mustn't write more than bad_sectors 1505 * to other devices yet 1506 */ 1507 max_sectors = bad_sectors; 1508 /* We don't set R10BIO_Degraded as that 1509 * only applies if the disk is missing, 1510 * so it might be re-added, and we want to 1511 * know to recover this chunk. 1512 * In this case the device is here, and the 1513 * fact that this chunk is not in-sync is 1514 * recorded in the bad block log. 
1515 */ 1516 continue; 1517 } 1518 if (is_bad) { 1519 int good_sectors = first_bad - dev_sector; 1520 if (good_sectors < max_sectors) 1521 max_sectors = good_sectors; 1522 } 1523 } 1524 if (rdev) { 1525 r10_bio->devs[i].bio = bio; 1526 atomic_inc(&rdev->nr_pending); 1527 } 1528 if (rrdev) { 1529 r10_bio->devs[i].repl_bio = bio; 1530 atomic_inc(&rrdev->nr_pending); 1531 } 1532 } 1533 rcu_read_unlock(); 1534 1535 if (max_sectors < r10_bio->sectors) 1536 r10_bio->sectors = max_sectors; 1537 1538 if (r10_bio->sectors < bio_sectors(bio)) { 1539 struct bio *split = bio_split(bio, r10_bio->sectors, 1540 GFP_NOIO, &conf->bio_split); 1541 bio_chain(split, bio); 1542 allow_barrier(conf); 1543 submit_bio_noacct(bio); 1544 wait_barrier(conf, false); 1545 bio = split; 1546 r10_bio->master_bio = bio; 1547 } 1548 1549 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1550 r10_bio->start_time = bio_start_io_acct(bio); 1551 atomic_set(&r10_bio->remaining, 1); 1552 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1553 1554 for (i = 0; i < conf->copies; i++) { 1555 if (r10_bio->devs[i].bio) 1556 raid10_write_one_disk(mddev, r10_bio, bio, false, i); 1557 if (r10_bio->devs[i].repl_bio) 1558 raid10_write_one_disk(mddev, r10_bio, bio, true, i); 1559 } 1560 one_write_done(r10_bio); 1561 } 1562 1563 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) 1564 { 1565 struct r10conf *conf = mddev->private; 1566 struct r10bio *r10_bio; 1567 1568 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1569 1570 r10_bio->master_bio = bio; 1571 r10_bio->sectors = sectors; 1572 1573 r10_bio->mddev = mddev; 1574 r10_bio->sector = bio->bi_iter.bi_sector; 1575 r10_bio->state = 0; 1576 r10_bio->read_slot = -1; 1577 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 1578 conf->geo.raid_disks); 1579 1580 if (bio_data_dir(bio) == READ) 1581 raid10_read_request(mddev, bio, r10_bio); 1582 else 1583 raid10_write_request(mddev, bio, r10_bio); 1584 } 1585 
1586 static void raid_end_discard_bio(struct r10bio *r10bio) 1587 { 1588 struct r10conf *conf = r10bio->mddev->private; 1589 struct r10bio *first_r10bio; 1590 1591 while (atomic_dec_and_test(&r10bio->remaining)) { 1592 1593 allow_barrier(conf); 1594 1595 if (!test_bit(R10BIO_Discard, &r10bio->state)) { 1596 first_r10bio = (struct r10bio *)r10bio->master_bio; 1597 free_r10bio(r10bio); 1598 r10bio = first_r10bio; 1599 } else { 1600 md_write_end(r10bio->mddev); 1601 bio_endio(r10bio->master_bio); 1602 free_r10bio(r10bio); 1603 break; 1604 } 1605 } 1606 } 1607 1608 static void raid10_end_discard_request(struct bio *bio) 1609 { 1610 struct r10bio *r10_bio = bio->bi_private; 1611 struct r10conf *conf = r10_bio->mddev->private; 1612 struct md_rdev *rdev = NULL; 1613 int dev; 1614 int slot, repl; 1615 1616 /* 1617 * We don't care the return value of discard bio 1618 */ 1619 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 1620 set_bit(R10BIO_Uptodate, &r10_bio->state); 1621 1622 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1623 if (repl) 1624 rdev = conf->mirrors[dev].replacement; 1625 if (!rdev) { 1626 /* 1627 * raid10_remove_disk uses smp_mb to make sure rdev is set to 1628 * replacement before setting replacement to NULL. It can read 1629 * rdev first without barrier protect even replacment is NULL 1630 */ 1631 smp_rmb(); 1632 rdev = conf->mirrors[dev].rdev; 1633 } 1634 1635 raid_end_discard_bio(r10_bio); 1636 rdev_dec_pending(rdev, conf->mddev); 1637 } 1638 1639 /* 1640 * There are some limitations to handle discard bio 1641 * 1st, the discard size is bigger than stripe_size*2. 
1642 * 2st, if the discard bio spans reshape progress, we use the old way to 1643 * handle discard bio 1644 */ 1645 static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) 1646 { 1647 struct r10conf *conf = mddev->private; 1648 struct geom *geo = &conf->geo; 1649 int far_copies = geo->far_copies; 1650 bool first_copy = true; 1651 struct r10bio *r10_bio, *first_r10bio; 1652 struct bio *split; 1653 int disk; 1654 sector_t chunk; 1655 unsigned int stripe_size; 1656 unsigned int stripe_data_disks; 1657 sector_t split_size; 1658 sector_t bio_start, bio_end; 1659 sector_t first_stripe_index, last_stripe_index; 1660 sector_t start_disk_offset; 1661 unsigned int start_disk_index; 1662 sector_t end_disk_offset; 1663 unsigned int end_disk_index; 1664 unsigned int remainder; 1665 1666 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1667 return -EAGAIN; 1668 1669 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { 1670 bio_wouldblock_error(bio); 1671 return 0; 1672 } 1673 wait_barrier(conf, false); 1674 1675 /* 1676 * Check reshape again to avoid reshape happens after checking 1677 * MD_RECOVERY_RESHAPE and before wait_barrier 1678 */ 1679 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1680 goto out; 1681 1682 if (geo->near_copies) 1683 stripe_data_disks = geo->raid_disks / geo->near_copies + 1684 geo->raid_disks % geo->near_copies; 1685 else 1686 stripe_data_disks = geo->raid_disks; 1687 1688 stripe_size = stripe_data_disks << geo->chunk_shift; 1689 1690 bio_start = bio->bi_iter.bi_sector; 1691 bio_end = bio_end_sector(bio); 1692 1693 /* 1694 * Maybe one discard bio is smaller than strip size or across one 1695 * stripe and discard region is larger than one stripe size. For far 1696 * offset layout, if the discard region is not aligned with stripe 1697 * size, there is hole when we submit discard bio to member disk. 
1698 * For simplicity, we only handle discard bio which discard region 1699 * is bigger than stripe_size * 2 1700 */ 1701 if (bio_sectors(bio) < stripe_size*2) 1702 goto out; 1703 1704 /* 1705 * Keep bio aligned with strip size. 1706 */ 1707 div_u64_rem(bio_start, stripe_size, &remainder); 1708 if (remainder) { 1709 split_size = stripe_size - remainder; 1710 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1711 bio_chain(split, bio); 1712 allow_barrier(conf); 1713 /* Resend the fist split part */ 1714 submit_bio_noacct(split); 1715 wait_barrier(conf, false); 1716 } 1717 div_u64_rem(bio_end, stripe_size, &remainder); 1718 if (remainder) { 1719 split_size = bio_sectors(bio) - remainder; 1720 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1721 bio_chain(split, bio); 1722 allow_barrier(conf); 1723 /* Resend the second split part */ 1724 submit_bio_noacct(bio); 1725 bio = split; 1726 wait_barrier(conf, false); 1727 } 1728 1729 bio_start = bio->bi_iter.bi_sector; 1730 bio_end = bio_end_sector(bio); 1731 1732 /* 1733 * Raid10 uses chunk as the unit to store data. It's similar like raid0. 1734 * One stripe contains the chunks from all member disk (one chunk from 1735 * one disk at the same HBA address). 
For layout detail, see 'man md 4' 1736 */ 1737 chunk = bio_start >> geo->chunk_shift; 1738 chunk *= geo->near_copies; 1739 first_stripe_index = chunk; 1740 start_disk_index = sector_div(first_stripe_index, geo->raid_disks); 1741 if (geo->far_offset) 1742 first_stripe_index *= geo->far_copies; 1743 start_disk_offset = (bio_start & geo->chunk_mask) + 1744 (first_stripe_index << geo->chunk_shift); 1745 1746 chunk = bio_end >> geo->chunk_shift; 1747 chunk *= geo->near_copies; 1748 last_stripe_index = chunk; 1749 end_disk_index = sector_div(last_stripe_index, geo->raid_disks); 1750 if (geo->far_offset) 1751 last_stripe_index *= geo->far_copies; 1752 end_disk_offset = (bio_end & geo->chunk_mask) + 1753 (last_stripe_index << geo->chunk_shift); 1754 1755 retry_discard: 1756 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1757 r10_bio->mddev = mddev; 1758 r10_bio->state = 0; 1759 r10_bio->sectors = 0; 1760 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); 1761 wait_blocked_dev(mddev, r10_bio); 1762 1763 /* 1764 * For far layout it needs more than one r10bio to cover all regions. 1765 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio 1766 * to record the discard bio. Other r10bio->master_bio record the first 1767 * r10bio. The first r10bio only release after all other r10bios finish. 1768 * The discard bio returns only first r10bio finishes 1769 */ 1770 if (first_copy) { 1771 r10_bio->master_bio = bio; 1772 set_bit(R10BIO_Discard, &r10_bio->state); 1773 first_copy = false; 1774 first_r10bio = r10_bio; 1775 } else 1776 r10_bio->master_bio = (struct bio *)first_r10bio; 1777 1778 /* 1779 * first select target devices under rcu_lock and 1780 * inc refcount on their rdev. 
Record them by setting 1781 * bios[x] to bio 1782 */ 1783 rcu_read_lock(); 1784 for (disk = 0; disk < geo->raid_disks; disk++) { 1785 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1786 struct md_rdev *rrdev = rcu_dereference( 1787 conf->mirrors[disk].replacement); 1788 1789 r10_bio->devs[disk].bio = NULL; 1790 r10_bio->devs[disk].repl_bio = NULL; 1791 1792 if (rdev && (test_bit(Faulty, &rdev->flags))) 1793 rdev = NULL; 1794 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1795 rrdev = NULL; 1796 if (!rdev && !rrdev) 1797 continue; 1798 1799 if (rdev) { 1800 r10_bio->devs[disk].bio = bio; 1801 atomic_inc(&rdev->nr_pending); 1802 } 1803 if (rrdev) { 1804 r10_bio->devs[disk].repl_bio = bio; 1805 atomic_inc(&rrdev->nr_pending); 1806 } 1807 } 1808 rcu_read_unlock(); 1809 1810 atomic_set(&r10_bio->remaining, 1); 1811 for (disk = 0; disk < geo->raid_disks; disk++) { 1812 sector_t dev_start, dev_end; 1813 struct bio *mbio, *rbio = NULL; 1814 1815 /* 1816 * Now start to calculate the start and end address for each disk. 1817 * The space between dev_start and dev_end is the discard region. 1818 * 1819 * For dev_start, it needs to consider three conditions: 1820 * 1st, the disk is before start_disk, you can imagine the disk in 1821 * the next stripe. So the dev_start is the start address of next 1822 * stripe. 
1823 * 2st, the disk is after start_disk, it means the disk is at the 1824 * same stripe of first disk 1825 * 3st, the first disk itself, we can use start_disk_offset directly 1826 */ 1827 if (disk < start_disk_index) 1828 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; 1829 else if (disk > start_disk_index) 1830 dev_start = first_stripe_index * mddev->chunk_sectors; 1831 else 1832 dev_start = start_disk_offset; 1833 1834 if (disk < end_disk_index) 1835 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 1836 else if (disk > end_disk_index) 1837 dev_end = last_stripe_index * mddev->chunk_sectors; 1838 else 1839 dev_end = end_disk_offset; 1840 1841 /* 1842 * It only handles discard bio which size is >= stripe size, so 1843 * dev_end > dev_start all the time. 1844 * It doesn't need to use rcu lock to get rdev here. We already 1845 * add rdev->nr_pending in the first loop. 1846 */ 1847 if (r10_bio->devs[disk].bio) { 1848 struct md_rdev *rdev = conf->mirrors[disk].rdev; 1849 mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1850 &mddev->bio_set); 1851 mbio->bi_end_io = raid10_end_discard_request; 1852 mbio->bi_private = r10_bio; 1853 r10_bio->devs[disk].bio = mbio; 1854 r10_bio->devs[disk].devnum = disk; 1855 atomic_inc(&r10_bio->remaining); 1856 md_submit_discard_bio(mddev, rdev, mbio, 1857 dev_start + choose_data_offset(r10_bio, rdev), 1858 dev_end - dev_start); 1859 bio_endio(mbio); 1860 } 1861 if (r10_bio->devs[disk].repl_bio) { 1862 struct md_rdev *rrdev = conf->mirrors[disk].replacement; 1863 rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1864 &mddev->bio_set); 1865 rbio->bi_end_io = raid10_end_discard_request; 1866 rbio->bi_private = r10_bio; 1867 r10_bio->devs[disk].repl_bio = rbio; 1868 r10_bio->devs[disk].devnum = disk; 1869 atomic_inc(&r10_bio->remaining); 1870 md_submit_discard_bio(mddev, rrdev, rbio, 1871 dev_start + choose_data_offset(r10_bio, rrdev), 1872 dev_end - dev_start); 1873 bio_endio(rbio); 1874 } 1875 } 1876 1877 if 
(!geo->far_offset && --far_copies) { 1878 first_stripe_index += geo->stride >> geo->chunk_shift; 1879 start_disk_offset += geo->stride; 1880 last_stripe_index += geo->stride >> geo->chunk_shift; 1881 end_disk_offset += geo->stride; 1882 atomic_inc(&first_r10bio->remaining); 1883 raid_end_discard_bio(r10_bio); 1884 wait_barrier(conf, false); 1885 goto retry_discard; 1886 } 1887 1888 raid_end_discard_bio(r10_bio); 1889 1890 return 0; 1891 out: 1892 allow_barrier(conf); 1893 return -EAGAIN; 1894 } 1895 1896 static bool raid10_make_request(struct mddev *mddev, struct bio *bio) 1897 { 1898 struct r10conf *conf = mddev->private; 1899 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1900 int chunk_sects = chunk_mask + 1; 1901 int sectors = bio_sectors(bio); 1902 1903 if (unlikely(bio->bi_opf & REQ_PREFLUSH) 1904 && md_flush_request(mddev, bio)) 1905 return true; 1906 1907 if (!md_write_start(mddev, bio)) 1908 return false; 1909 1910 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1911 if (!raid10_handle_discard(mddev, bio)) 1912 return true; 1913 1914 /* 1915 * If this request crosses a chunk boundary, we need to split 1916 * it. 
1917 */ 1918 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + 1919 sectors > chunk_sects 1920 && (conf->geo.near_copies < conf->geo.raid_disks 1921 || conf->prev.near_copies < 1922 conf->prev.raid_disks))) 1923 sectors = chunk_sects - 1924 (bio->bi_iter.bi_sector & 1925 (chunk_sects - 1)); 1926 __make_request(mddev, bio, sectors); 1927 1928 /* In case raid10d snuck in to freeze_array */ 1929 wake_up_barrier(conf); 1930 return true; 1931 } 1932 1933 static void raid10_status(struct seq_file *seq, struct mddev *mddev) 1934 { 1935 struct r10conf *conf = mddev->private; 1936 int i; 1937 1938 if (conf->geo.near_copies < conf->geo.raid_disks) 1939 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1940 if (conf->geo.near_copies > 1) 1941 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1942 if (conf->geo.far_copies > 1) { 1943 if (conf->geo.far_offset) 1944 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1945 else 1946 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1947 if (conf->geo.far_set_size != conf->geo.raid_disks) 1948 seq_printf(seq, " %d devices per set", conf->geo.far_set_size); 1949 } 1950 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1951 conf->geo.raid_disks - mddev->degraded); 1952 rcu_read_lock(); 1953 for (i = 0; i < conf->geo.raid_disks; i++) { 1954 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1955 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1956 } 1957 rcu_read_unlock(); 1958 seq_printf(seq, "]"); 1959 } 1960 1961 /* check if there are enough drives for 1962 * every block to appear on atleast one. 1963 * Don't consider the device numbered 'ignore' 1964 * as we might be about to remove it. 
1965 */ 1966 static int _enough(struct r10conf *conf, int previous, int ignore) 1967 { 1968 int first = 0; 1969 int has_enough = 0; 1970 int disks, ncopies; 1971 if (previous) { 1972 disks = conf->prev.raid_disks; 1973 ncopies = conf->prev.near_copies; 1974 } else { 1975 disks = conf->geo.raid_disks; 1976 ncopies = conf->geo.near_copies; 1977 } 1978 1979 rcu_read_lock(); 1980 do { 1981 int n = conf->copies; 1982 int cnt = 0; 1983 int this = first; 1984 while (n--) { 1985 struct md_rdev *rdev; 1986 if (this != ignore && 1987 (rdev = rcu_dereference(conf->mirrors[this].rdev)) && 1988 test_bit(In_sync, &rdev->flags)) 1989 cnt++; 1990 this = (this+1) % disks; 1991 } 1992 if (cnt == 0) 1993 goto out; 1994 first = (first + ncopies) % disks; 1995 } while (first != 0); 1996 has_enough = 1; 1997 out: 1998 rcu_read_unlock(); 1999 return has_enough; 2000 } 2001 2002 static int enough(struct r10conf *conf, int ignore) 2003 { 2004 /* when calling 'enough', both 'prev' and 'geo' must 2005 * be stable. 2006 * This is ensured if ->reconfig_mutex or ->device_lock 2007 * is held. 2008 */ 2009 return _enough(conf, 0, ignore) && 2010 _enough(conf, 1, ignore); 2011 } 2012 2013 /** 2014 * raid10_error() - RAID10 error handler. 2015 * @mddev: affected md device. 2016 * @rdev: member device to fail. 2017 * 2018 * The routine acknowledges &rdev failure and determines new @mddev state. 2019 * If it failed, then: 2020 * - &MD_BROKEN flag is set in &mddev->flags. 2021 * Otherwise, it must be degraded: 2022 * - recovery is interrupted. 2023 * - &mddev->degraded is bumped. 2024 * 2025 * @rdev is marked as &Faulty excluding case when array is failed and 2026 * &mddev->fail_last_dev is off. 
2027 */ 2028 static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) 2029 { 2030 struct r10conf *conf = mddev->private; 2031 unsigned long flags; 2032 2033 spin_lock_irqsave(&conf->device_lock, flags); 2034 2035 if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { 2036 set_bit(MD_BROKEN, &mddev->flags); 2037 2038 if (!mddev->fail_last_dev) { 2039 spin_unlock_irqrestore(&conf->device_lock, flags); 2040 return; 2041 } 2042 } 2043 if (test_and_clear_bit(In_sync, &rdev->flags)) 2044 mddev->degraded++; 2045 2046 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2047 set_bit(Blocked, &rdev->flags); 2048 set_bit(Faulty, &rdev->flags); 2049 set_mask_bits(&mddev->sb_flags, 0, 2050 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2051 spin_unlock_irqrestore(&conf->device_lock, flags); 2052 pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n" 2053 "md/raid10:%s: Operation continuing on %d devices.\n", 2054 mdname(mddev), rdev->bdev, 2055 mdname(mddev), conf->geo.raid_disks - mddev->degraded); 2056 } 2057 2058 static void print_conf(struct r10conf *conf) 2059 { 2060 int i; 2061 struct md_rdev *rdev; 2062 2063 pr_debug("RAID10 conf printout:\n"); 2064 if (!conf) { 2065 pr_debug("(!conf)\n"); 2066 return; 2067 } 2068 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 2069 conf->geo.raid_disks); 2070 2071 /* This is only called with ->reconfix_mutex held, so 2072 * rcu protection of rdev is not needed */ 2073 for (i = 0; i < conf->geo.raid_disks; i++) { 2074 rdev = conf->mirrors[i].rdev; 2075 if (rdev) 2076 pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", 2077 i, !test_bit(In_sync, &rdev->flags), 2078 !test_bit(Faulty, &rdev->flags), 2079 rdev->bdev); 2080 } 2081 } 2082 2083 static void close_sync(struct r10conf *conf) 2084 { 2085 wait_barrier(conf, false); 2086 allow_barrier(conf); 2087 2088 mempool_exit(&conf->r10buf_pool); 2089 } 2090 2091 static int raid10_spare_active(struct mddev *mddev) 2092 { 2093 int i; 
2094 struct r10conf *conf = mddev->private; 2095 struct raid10_info *tmp; 2096 int count = 0; 2097 unsigned long flags; 2098 2099 /* 2100 * Find all non-in_sync disks within the RAID10 configuration 2101 * and mark them in_sync 2102 */ 2103 for (i = 0; i < conf->geo.raid_disks; i++) { 2104 tmp = conf->mirrors + i; 2105 if (tmp->replacement 2106 && tmp->replacement->recovery_offset == MaxSector 2107 && !test_bit(Faulty, &tmp->replacement->flags) 2108 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 2109 /* Replacement has just become active */ 2110 if (!tmp->rdev 2111 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 2112 count++; 2113 if (tmp->rdev) { 2114 /* Replaced device not technically faulty, 2115 * but we need to be sure it gets removed 2116 * and never re-added. 2117 */ 2118 set_bit(Faulty, &tmp->rdev->flags); 2119 sysfs_notify_dirent_safe( 2120 tmp->rdev->sysfs_state); 2121 } 2122 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 2123 } else if (tmp->rdev 2124 && tmp->rdev->recovery_offset == MaxSector 2125 && !test_bit(Faulty, &tmp->rdev->flags) 2126 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 2127 count++; 2128 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 2129 } 2130 } 2131 spin_lock_irqsave(&conf->device_lock, flags); 2132 mddev->degraded -= count; 2133 spin_unlock_irqrestore(&conf->device_lock, flags); 2134 2135 print_conf(conf); 2136 return count; 2137 } 2138 2139 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 2140 { 2141 struct r10conf *conf = mddev->private; 2142 int err = -EEXIST; 2143 int mirror; 2144 int first = 0; 2145 int last = conf->geo.raid_disks - 1; 2146 2147 if (mddev->recovery_cp < MaxSector) 2148 /* only hot-add to in-sync arrays, as recovery is 2149 * very different from resync 2150 */ 2151 return -EBUSY; 2152 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) 2153 return -EINVAL; 2154 2155 if (md_integrity_add_rdev(rdev, mddev)) 2156 return -ENXIO; 2157 2158 if 
(rdev->raid_disk >= 0) 2159 first = last = rdev->raid_disk; 2160 2161 if (rdev->saved_raid_disk >= first && 2162 rdev->saved_raid_disk < conf->geo.raid_disks && 2163 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 2164 mirror = rdev->saved_raid_disk; 2165 else 2166 mirror = first; 2167 for ( ; mirror <= last ; mirror++) { 2168 struct raid10_info *p = &conf->mirrors[mirror]; 2169 if (p->recovery_disabled == mddev->recovery_disabled) 2170 continue; 2171 if (p->rdev) { 2172 if (!test_bit(WantReplacement, &p->rdev->flags) || 2173 p->replacement != NULL) 2174 continue; 2175 clear_bit(In_sync, &rdev->flags); 2176 set_bit(Replacement, &rdev->flags); 2177 rdev->raid_disk = mirror; 2178 err = 0; 2179 if (mddev->gendisk) 2180 disk_stack_limits(mddev->gendisk, rdev->bdev, 2181 rdev->data_offset << 9); 2182 conf->fullsync = 1; 2183 rcu_assign_pointer(p->replacement, rdev); 2184 break; 2185 } 2186 2187 if (mddev->gendisk) 2188 disk_stack_limits(mddev->gendisk, rdev->bdev, 2189 rdev->data_offset << 9); 2190 2191 p->head_position = 0; 2192 p->recovery_disabled = mddev->recovery_disabled - 1; 2193 rdev->raid_disk = mirror; 2194 err = 0; 2195 if (rdev->saved_raid_disk != mirror) 2196 conf->fullsync = 1; 2197 rcu_assign_pointer(p->rdev, rdev); 2198 break; 2199 } 2200 2201 print_conf(conf); 2202 return err; 2203 } 2204 2205 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 2206 { 2207 struct r10conf *conf = mddev->private; 2208 int err = 0; 2209 int number = rdev->raid_disk; 2210 struct md_rdev **rdevp; 2211 struct raid10_info *p; 2212 2213 print_conf(conf); 2214 if (unlikely(number >= mddev->raid_disks)) 2215 return 0; 2216 p = conf->mirrors + number; 2217 if (rdev == p->rdev) 2218 rdevp = &p->rdev; 2219 else if (rdev == p->replacement) 2220 rdevp = &p->replacement; 2221 else 2222 return 0; 2223 2224 if (test_bit(In_sync, &rdev->flags) || 2225 atomic_read(&rdev->nr_pending)) { 2226 err = -EBUSY; 2227 goto abort; 2228 } 2229 /* Only remove non-faulty devices 
if recovery 2230 * is not possible. 2231 */ 2232 if (!test_bit(Faulty, &rdev->flags) && 2233 mddev->recovery_disabled != p->recovery_disabled && 2234 (!p->replacement || p->replacement == rdev) && 2235 number < conf->geo.raid_disks && 2236 enough(conf, -1)) { 2237 err = -EBUSY; 2238 goto abort; 2239 } 2240 *rdevp = NULL; 2241 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 2242 synchronize_rcu(); 2243 if (atomic_read(&rdev->nr_pending)) { 2244 /* lost the race, try later */ 2245 err = -EBUSY; 2246 *rdevp = rdev; 2247 goto abort; 2248 } 2249 } 2250 if (p->replacement) { 2251 /* We must have just cleared 'rdev' */ 2252 p->rdev = p->replacement; 2253 clear_bit(Replacement, &p->replacement->flags); 2254 smp_mb(); /* Make sure other CPUs may see both as identical 2255 * but will never see neither -- if they are careful. 2256 */ 2257 p->replacement = NULL; 2258 } 2259 2260 clear_bit(WantReplacement, &rdev->flags); 2261 err = md_integrity_register(mddev); 2262 2263 abort: 2264 2265 print_conf(conf); 2266 return err; 2267 } 2268 2269 static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) 2270 { 2271 struct r10conf *conf = r10_bio->mddev->private; 2272 2273 if (!bio->bi_status) 2274 set_bit(R10BIO_Uptodate, &r10_bio->state); 2275 else 2276 /* The write handler will notice the lack of 2277 * R10BIO_Uptodate and record any errors etc 2278 */ 2279 atomic_add(r10_bio->sectors, 2280 &conf->mirrors[d].rdev->corrected_errors); 2281 2282 /* for reconstruct, we always reschedule after a read. 
2283 * for resync, only after all reads 2284 */ 2285 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); 2286 if (test_bit(R10BIO_IsRecover, &r10_bio->state) || 2287 atomic_dec_and_test(&r10_bio->remaining)) { 2288 /* we have read all the blocks, 2289 * do the comparison in process context in raid10d 2290 */ 2291 reschedule_retry(r10_bio); 2292 } 2293 } 2294 2295 static void end_sync_read(struct bio *bio) 2296 { 2297 struct r10bio *r10_bio = get_resync_r10bio(bio); 2298 struct r10conf *conf = r10_bio->mddev->private; 2299 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 2300 2301 __end_sync_read(r10_bio, bio, d); 2302 } 2303 2304 static void end_reshape_read(struct bio *bio) 2305 { 2306 /* reshape read bio isn't allocated from r10buf_pool */ 2307 struct r10bio *r10_bio = bio->bi_private; 2308 2309 __end_sync_read(r10_bio, bio, r10_bio->read_slot); 2310 } 2311 2312 static void end_sync_request(struct r10bio *r10_bio) 2313 { 2314 struct mddev *mddev = r10_bio->mddev; 2315 2316 while (atomic_dec_and_test(&r10_bio->remaining)) { 2317 if (r10_bio->master_bio == NULL) { 2318 /* the primary of several recovery bios */ 2319 sector_t s = r10_bio->sectors; 2320 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2321 test_bit(R10BIO_WriteError, &r10_bio->state)) 2322 reschedule_retry(r10_bio); 2323 else 2324 put_buf(r10_bio); 2325 md_done_sync(mddev, s, 1); 2326 break; 2327 } else { 2328 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 2329 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2330 test_bit(R10BIO_WriteError, &r10_bio->state)) 2331 reschedule_retry(r10_bio); 2332 else 2333 put_buf(r10_bio); 2334 r10_bio = r10_bio2; 2335 } 2336 } 2337 } 2338 2339 static void end_sync_write(struct bio *bio) 2340 { 2341 struct r10bio *r10_bio = get_resync_r10bio(bio); 2342 struct mddev *mddev = r10_bio->mddev; 2343 struct r10conf *conf = mddev->private; 2344 int d; 2345 sector_t first_bad; 2346 int bad_sectors; 2347 int slot; 2348 int repl; 2349 struct md_rdev 
*rdev = NULL; 2350 2351 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 2352 if (repl) 2353 rdev = conf->mirrors[d].replacement; 2354 else 2355 rdev = conf->mirrors[d].rdev; 2356 2357 if (bio->bi_status) { 2358 if (repl) 2359 md_error(mddev, rdev); 2360 else { 2361 set_bit(WriteErrorSeen, &rdev->flags); 2362 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2363 set_bit(MD_RECOVERY_NEEDED, 2364 &rdev->mddev->recovery); 2365 set_bit(R10BIO_WriteError, &r10_bio->state); 2366 } 2367 } else if (is_badblock(rdev, 2368 r10_bio->devs[slot].addr, 2369 r10_bio->sectors, 2370 &first_bad, &bad_sectors)) 2371 set_bit(R10BIO_MadeGood, &r10_bio->state); 2372 2373 rdev_dec_pending(rdev, mddev); 2374 2375 end_sync_request(r10_bio); 2376 } 2377 2378 /* 2379 * Note: sync and recover and handled very differently for raid10 2380 * This code is for resync. 2381 * For resync, we read through virtual addresses and read all blocks. 2382 * If there is any error, we schedule a write. The lowest numbered 2383 * drive is authoritative. 2384 * However requests come for physical address, so we need to map. 2385 * For every physical address there are raid_disks/copies virtual addresses, 2386 * which is always are least one, but is not necessarly an integer. 2387 * This means that a physical address can span multiple chunks, so we may 2388 * have to submit multiple io requests for a single sync request. 
2389 */ 2390 /* 2391 * We check if all blocks are in-sync and only write to blocks that 2392 * aren't in sync 2393 */ 2394 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2395 { 2396 struct r10conf *conf = mddev->private; 2397 int i, first; 2398 struct bio *tbio, *fbio; 2399 int vcnt; 2400 struct page **tpages, **fpages; 2401 2402 atomic_set(&r10_bio->remaining, 1); 2403 2404 /* find the first device with a block */ 2405 for (i=0; i<conf->copies; i++) 2406 if (!r10_bio->devs[i].bio->bi_status) 2407 break; 2408 2409 if (i == conf->copies) 2410 goto done; 2411 2412 first = i; 2413 fbio = r10_bio->devs[i].bio; 2414 fbio->bi_iter.bi_size = r10_bio->sectors << 9; 2415 fbio->bi_iter.bi_idx = 0; 2416 fpages = get_resync_pages(fbio)->pages; 2417 2418 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 2419 /* now find blocks with errors */ 2420 for (i=0 ; i < conf->copies ; i++) { 2421 int j, d; 2422 struct md_rdev *rdev; 2423 struct resync_pages *rp; 2424 2425 tbio = r10_bio->devs[i].bio; 2426 2427 if (tbio->bi_end_io != end_sync_read) 2428 continue; 2429 if (i == first) 2430 continue; 2431 2432 tpages = get_resync_pages(tbio)->pages; 2433 d = r10_bio->devs[i].devnum; 2434 rdev = conf->mirrors[d].rdev; 2435 if (!r10_bio->devs[i].bio->bi_status) { 2436 /* We know that the bi_io_vec layout is the same for 2437 * both 'first' and 'i', so we just compare them. 2438 * All vec entries are PAGE_SIZE; 2439 */ 2440 int sectors = r10_bio->sectors; 2441 for (j = 0; j < vcnt; j++) { 2442 int len = PAGE_SIZE; 2443 if (sectors < (len / 512)) 2444 len = sectors * 512; 2445 if (memcmp(page_address(fpages[j]), 2446 page_address(tpages[j]), 2447 len)) 2448 break; 2449 sectors -= len/512; 2450 } 2451 if (j == vcnt) 2452 continue; 2453 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2454 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2455 /* Don't fix anything. 
*/ 2456 continue; 2457 } else if (test_bit(FailFast, &rdev->flags)) { 2458 /* Just give up on this device */ 2459 md_error(rdev->mddev, rdev); 2460 continue; 2461 } 2462 /* Ok, we need to write this bio, either to correct an 2463 * inconsistency or to correct an unreadable block. 2464 * First we need to fixup bv_offset, bv_len and 2465 * bi_vecs, as the read request might have corrupted these 2466 */ 2467 rp = get_resync_pages(tbio); 2468 bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); 2469 2470 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); 2471 2472 rp->raid_bio = r10_bio; 2473 tbio->bi_private = rp; 2474 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; 2475 tbio->bi_end_io = end_sync_write; 2476 2477 bio_copy_data(tbio, fbio); 2478 2479 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2480 atomic_inc(&r10_bio->remaining); 2481 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2482 2483 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) 2484 tbio->bi_opf |= MD_FAILFAST; 2485 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; 2486 submit_bio_noacct(tbio); 2487 } 2488 2489 /* Now write out to any replacement devices 2490 * that are active 2491 */ 2492 for (i = 0; i < conf->copies; i++) { 2493 int d; 2494 2495 tbio = r10_bio->devs[i].repl_bio; 2496 if (!tbio || !tbio->bi_end_io) 2497 continue; 2498 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 2499 && r10_bio->devs[i].bio != fbio) 2500 bio_copy_data(tbio, fbio); 2501 d = r10_bio->devs[i].devnum; 2502 atomic_inc(&r10_bio->remaining); 2503 md_sync_acct(conf->mirrors[d].replacement->bdev, 2504 bio_sectors(tbio)); 2505 submit_bio_noacct(tbio); 2506 } 2507 2508 done: 2509 if (atomic_dec_and_test(&r10_bio->remaining)) { 2510 md_done_sync(mddev, r10_bio->sectors, 1); 2511 put_buf(r10_bio); 2512 } 2513 } 2514 2515 /* 2516 * Now for the recovery code. 2517 * Recovery happens across physical sectors. 
2518 * We recover all non-is_sync drives by finding the virtual address of 2519 * each, and then choose a working drive that also has that virt address. 2520 * There is a separate r10_bio for each non-in_sync drive. 2521 * Only the first two slots are in use. The first for reading, 2522 * The second for writing. 2523 * 2524 */ 2525 static void fix_recovery_read_error(struct r10bio *r10_bio) 2526 { 2527 /* We got a read error during recovery. 2528 * We repeat the read in smaller page-sized sections. 2529 * If a read succeeds, write it to the new device or record 2530 * a bad block if we cannot. 2531 * If a read fails, record a bad block on both old and 2532 * new devices. 2533 */ 2534 struct mddev *mddev = r10_bio->mddev; 2535 struct r10conf *conf = mddev->private; 2536 struct bio *bio = r10_bio->devs[0].bio; 2537 sector_t sect = 0; 2538 int sectors = r10_bio->sectors; 2539 int idx = 0; 2540 int dr = r10_bio->devs[0].devnum; 2541 int dw = r10_bio->devs[1].devnum; 2542 struct page **pages = get_resync_pages(bio)->pages; 2543 2544 while (sectors) { 2545 int s = sectors; 2546 struct md_rdev *rdev; 2547 sector_t addr; 2548 int ok; 2549 2550 if (s > (PAGE_SIZE>>9)) 2551 s = PAGE_SIZE >> 9; 2552 2553 rdev = conf->mirrors[dr].rdev; 2554 addr = r10_bio->devs[0].addr + sect, 2555 ok = sync_page_io(rdev, 2556 addr, 2557 s << 9, 2558 pages[idx], 2559 REQ_OP_READ, false); 2560 if (ok) { 2561 rdev = conf->mirrors[dw].rdev; 2562 addr = r10_bio->devs[1].addr + sect; 2563 ok = sync_page_io(rdev, 2564 addr, 2565 s << 9, 2566 pages[idx], 2567 REQ_OP_WRITE, false); 2568 if (!ok) { 2569 set_bit(WriteErrorSeen, &rdev->flags); 2570 if (!test_and_set_bit(WantReplacement, 2571 &rdev->flags)) 2572 set_bit(MD_RECOVERY_NEEDED, 2573 &rdev->mddev->recovery); 2574 } 2575 } 2576 if (!ok) { 2577 /* We don't worry if we cannot set a bad block - 2578 * it really is bad so there is no loss in not 2579 * recording it yet 2580 */ 2581 rdev_set_badblocks(rdev, addr, s, 0); 2582 2583 if (rdev != 
conf->mirrors[dw].rdev) { 2584 /* need bad block on destination too */ 2585 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2586 addr = r10_bio->devs[1].addr + sect; 2587 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2588 if (!ok) { 2589 /* just abort the recovery */ 2590 pr_notice("md/raid10:%s: recovery aborted due to read error\n", 2591 mdname(mddev)); 2592 2593 conf->mirrors[dw].recovery_disabled 2594 = mddev->recovery_disabled; 2595 set_bit(MD_RECOVERY_INTR, 2596 &mddev->recovery); 2597 break; 2598 } 2599 } 2600 } 2601 2602 sectors -= s; 2603 sect += s; 2604 idx++; 2605 } 2606 } 2607 2608 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2609 { 2610 struct r10conf *conf = mddev->private; 2611 int d; 2612 struct bio *wbio, *wbio2; 2613 2614 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2615 fix_recovery_read_error(r10_bio); 2616 end_sync_request(r10_bio); 2617 return; 2618 } 2619 2620 /* 2621 * share the pages with the first bio 2622 * and submit the write request 2623 */ 2624 d = r10_bio->devs[1].devnum; 2625 wbio = r10_bio->devs[1].bio; 2626 wbio2 = r10_bio->devs[1].repl_bio; 2627 /* Need to test wbio2->bi_end_io before we call 2628 * submit_bio_noacct as if the former is NULL, 2629 * the latter is free to free wbio2. 2630 */ 2631 if (wbio2 && !wbio2->bi_end_io) 2632 wbio2 = NULL; 2633 if (wbio->bi_end_io) { 2634 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2635 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2636 submit_bio_noacct(wbio); 2637 } 2638 if (wbio2) { 2639 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2640 md_sync_acct(conf->mirrors[d].replacement->bdev, 2641 bio_sectors(wbio2)); 2642 submit_bio_noacct(wbio2); 2643 } 2644 } 2645 2646 /* 2647 * Used by fix_read_error() to decay the per rdev read_errors. 2648 * We halve the read error count for every hour that has elapsed 2649 * since the last recorded read error. 
2650 * 2651 */ 2652 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2653 { 2654 long cur_time_mon; 2655 unsigned long hours_since_last; 2656 unsigned int read_errors = atomic_read(&rdev->read_errors); 2657 2658 cur_time_mon = ktime_get_seconds(); 2659 2660 if (rdev->last_read_error == 0) { 2661 /* first time we've seen a read error */ 2662 rdev->last_read_error = cur_time_mon; 2663 return; 2664 } 2665 2666 hours_since_last = (long)(cur_time_mon - 2667 rdev->last_read_error) / 3600; 2668 2669 rdev->last_read_error = cur_time_mon; 2670 2671 /* 2672 * if hours_since_last is > the number of bits in read_errors 2673 * just set read errors to 0. We do this to avoid 2674 * overflowing the shift of read_errors by hours_since_last. 2675 */ 2676 if (hours_since_last >= 8 * sizeof(read_errors)) 2677 atomic_set(&rdev->read_errors, 0); 2678 else 2679 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2680 } 2681 2682 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2683 int sectors, struct page *page, enum req_op op) 2684 { 2685 sector_t first_bad; 2686 int bad_sectors; 2687 2688 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2689 && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) 2690 return -1; 2691 if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) 2692 /* success */ 2693 return 1; 2694 if (op == REQ_OP_WRITE) { 2695 set_bit(WriteErrorSeen, &rdev->flags); 2696 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2697 set_bit(MD_RECOVERY_NEEDED, 2698 &rdev->mddev->recovery); 2699 } 2700 /* need to record an error - either for the block or the device */ 2701 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2702 md_error(rdev->mddev, rdev); 2703 return 0; 2704 } 2705 2706 /* 2707 * This is a kernel thread which: 2708 * 2709 * 1. Retries failed read operations on working mirrors. 2710 * 2. Updates the raid superblock when problems encounter. 2711 * 3. 
Performs writes following reads for array synchronising. 2712 */ 2713 2714 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2715 { 2716 int sect = 0; /* Offset from r10_bio->sector */ 2717 int sectors = r10_bio->sectors; 2718 struct md_rdev *rdev; 2719 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2720 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2721 2722 /* still own a reference to this rdev, so it cannot 2723 * have been cleared recently. 2724 */ 2725 rdev = conf->mirrors[d].rdev; 2726 2727 if (test_bit(Faulty, &rdev->flags)) 2728 /* drive has already been failed, just ignore any 2729 more fix_read_error() attempts */ 2730 return; 2731 2732 check_decay_read_errors(mddev, rdev); 2733 atomic_inc(&rdev->read_errors); 2734 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2735 pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", 2736 mdname(mddev), rdev->bdev, 2737 atomic_read(&rdev->read_errors), max_read_errors); 2738 pr_notice("md/raid10:%s: %pg: Failing raid device\n", 2739 mdname(mddev), rdev->bdev); 2740 md_error(mddev, rdev); 2741 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2742 return; 2743 } 2744 2745 while(sectors) { 2746 int s = sectors; 2747 int sl = r10_bio->read_slot; 2748 int success = 0; 2749 int start; 2750 2751 if (s > (PAGE_SIZE>>9)) 2752 s = PAGE_SIZE >> 9; 2753 2754 rcu_read_lock(); 2755 do { 2756 sector_t first_bad; 2757 int bad_sectors; 2758 2759 d = r10_bio->devs[sl].devnum; 2760 rdev = rcu_dereference(conf->mirrors[d].rdev); 2761 if (rdev && 2762 test_bit(In_sync, &rdev->flags) && 2763 !test_bit(Faulty, &rdev->flags) && 2764 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2765 &first_bad, &bad_sectors) == 0) { 2766 atomic_inc(&rdev->nr_pending); 2767 rcu_read_unlock(); 2768 success = sync_page_io(rdev, 2769 r10_bio->devs[sl].addr + 2770 sect, 2771 s<<9, 2772 conf->tmppage, 2773 REQ_OP_READ, false); 2774 
rdev_dec_pending(rdev, mddev); 2775 rcu_read_lock(); 2776 if (success) 2777 break; 2778 } 2779 sl++; 2780 if (sl == conf->copies) 2781 sl = 0; 2782 } while (!success && sl != r10_bio->read_slot); 2783 rcu_read_unlock(); 2784 2785 if (!success) { 2786 /* Cannot read from anywhere, just mark the block 2787 * as bad on the first device to discourage future 2788 * reads. 2789 */ 2790 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2791 rdev = conf->mirrors[dn].rdev; 2792 2793 if (!rdev_set_badblocks( 2794 rdev, 2795 r10_bio->devs[r10_bio->read_slot].addr 2796 + sect, 2797 s, 0)) { 2798 md_error(mddev, rdev); 2799 r10_bio->devs[r10_bio->read_slot].bio 2800 = IO_BLOCKED; 2801 } 2802 break; 2803 } 2804 2805 start = sl; 2806 /* write it back and re-read */ 2807 rcu_read_lock(); 2808 while (sl != r10_bio->read_slot) { 2809 if (sl==0) 2810 sl = conf->copies; 2811 sl--; 2812 d = r10_bio->devs[sl].devnum; 2813 rdev = rcu_dereference(conf->mirrors[d].rdev); 2814 if (!rdev || 2815 test_bit(Faulty, &rdev->flags) || 2816 !test_bit(In_sync, &rdev->flags)) 2817 continue; 2818 2819 atomic_inc(&rdev->nr_pending); 2820 rcu_read_unlock(); 2821 if (r10_sync_page_io(rdev, 2822 r10_bio->devs[sl].addr + 2823 sect, 2824 s, conf->tmppage, REQ_OP_WRITE) 2825 == 0) { 2826 /* Well, this device is dead */ 2827 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", 2828 mdname(mddev), s, 2829 (unsigned long long)( 2830 sect + 2831 choose_data_offset(r10_bio, 2832 rdev)), 2833 rdev->bdev); 2834 pr_notice("md/raid10:%s: %pg: failing drive\n", 2835 mdname(mddev), 2836 rdev->bdev); 2837 } 2838 rdev_dec_pending(rdev, mddev); 2839 rcu_read_lock(); 2840 } 2841 sl = start; 2842 while (sl != r10_bio->read_slot) { 2843 if (sl==0) 2844 sl = conf->copies; 2845 sl--; 2846 d = r10_bio->devs[sl].devnum; 2847 rdev = rcu_dereference(conf->mirrors[d].rdev); 2848 if (!rdev || 2849 test_bit(Faulty, &rdev->flags) || 2850 !test_bit(In_sync, &rdev->flags)) 2851 continue; 2852 2853 
atomic_inc(&rdev->nr_pending); 2854 rcu_read_unlock(); 2855 switch (r10_sync_page_io(rdev, 2856 r10_bio->devs[sl].addr + 2857 sect, 2858 s, conf->tmppage, REQ_OP_READ)) { 2859 case 0: 2860 /* Well, this device is dead */ 2861 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", 2862 mdname(mddev), s, 2863 (unsigned long long)( 2864 sect + 2865 choose_data_offset(r10_bio, rdev)), 2866 rdev->bdev); 2867 pr_notice("md/raid10:%s: %pg: failing drive\n", 2868 mdname(mddev), 2869 rdev->bdev); 2870 break; 2871 case 1: 2872 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", 2873 mdname(mddev), s, 2874 (unsigned long long)( 2875 sect + 2876 choose_data_offset(r10_bio, rdev)), 2877 rdev->bdev); 2878 atomic_add(s, &rdev->corrected_errors); 2879 } 2880 2881 rdev_dec_pending(rdev, mddev); 2882 rcu_read_lock(); 2883 } 2884 rcu_read_unlock(); 2885 2886 sectors -= s; 2887 sect += s; 2888 } 2889 } 2890 2891 static int narrow_write_error(struct r10bio *r10_bio, int i) 2892 { 2893 struct bio *bio = r10_bio->master_bio; 2894 struct mddev *mddev = r10_bio->mddev; 2895 struct r10conf *conf = mddev->private; 2896 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2897 /* bio has the data to be written to slot 'i' where 2898 * we just recently had a write error. 2899 * We repeatedly clone the bio and trim down to one block, 2900 * then try the write. Where the write fails we record 2901 * a bad block. 2902 * It is conceivable that the bio doesn't exactly align with 2903 * blocks. We must handle this. 2904 * 2905 * We currently own a reference to the rdev. 
2906 */ 2907 2908 int block_sectors; 2909 sector_t sector; 2910 int sectors; 2911 int sect_to_write = r10_bio->sectors; 2912 int ok = 1; 2913 2914 if (rdev->badblocks.shift < 0) 2915 return 0; 2916 2917 block_sectors = roundup(1 << rdev->badblocks.shift, 2918 bdev_logical_block_size(rdev->bdev) >> 9); 2919 sector = r10_bio->sector; 2920 sectors = ((r10_bio->sector + block_sectors) 2921 & ~(sector_t)(block_sectors - 1)) 2922 - sector; 2923 2924 while (sect_to_write) { 2925 struct bio *wbio; 2926 sector_t wsector; 2927 if (sectors > sect_to_write) 2928 sectors = sect_to_write; 2929 /* Write at 'sector' for 'sectors' */ 2930 wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 2931 &mddev->bio_set); 2932 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); 2933 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); 2934 wbio->bi_iter.bi_sector = wsector + 2935 choose_data_offset(r10_bio, rdev); 2936 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2937 2938 if (submit_bio_wait(wbio) < 0) 2939 /* Failure! */ 2940 ok = rdev_set_badblocks(rdev, wsector, 2941 sectors, 0) 2942 && ok; 2943 2944 bio_put(wbio); 2945 sect_to_write -= sectors; 2946 sector += sectors; 2947 sectors = block_sectors; 2948 } 2949 return ok; 2950 } 2951 2952 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2953 { 2954 int slot = r10_bio->read_slot; 2955 struct bio *bio; 2956 struct r10conf *conf = mddev->private; 2957 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2958 2959 /* we got a read error. Maybe the drive is bad. Maybe just 2960 * the block and we can fix it. 2961 * We freeze all other IO, and try reading the block from 2962 * other devices. When we find one, we re-write 2963 * and check it that fixes the read error. 2964 * This is all done synchronously while the array is 2965 * frozen. 
2966 */ 2967 bio = r10_bio->devs[slot].bio; 2968 bio_put(bio); 2969 r10_bio->devs[slot].bio = NULL; 2970 2971 if (mddev->ro) 2972 r10_bio->devs[slot].bio = IO_BLOCKED; 2973 else if (!test_bit(FailFast, &rdev->flags)) { 2974 freeze_array(conf, 1); 2975 fix_read_error(conf, mddev, r10_bio); 2976 unfreeze_array(conf); 2977 } else 2978 md_error(mddev, rdev); 2979 2980 rdev_dec_pending(rdev, mddev); 2981 allow_barrier(conf); 2982 r10_bio->state = 0; 2983 raid10_read_request(mddev, r10_bio->master_bio, r10_bio); 2984 } 2985 2986 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2987 { 2988 /* Some sort of write request has finished and it 2989 * succeeded in writing where we thought there was a 2990 * bad block. So forget the bad block. 2991 * Or possibly if failed and we need to record 2992 * a bad block. 2993 */ 2994 int m; 2995 struct md_rdev *rdev; 2996 2997 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2998 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2999 for (m = 0; m < conf->copies; m++) { 3000 int dev = r10_bio->devs[m].devnum; 3001 rdev = conf->mirrors[dev].rdev; 3002 if (r10_bio->devs[m].bio == NULL || 3003 r10_bio->devs[m].bio->bi_end_io == NULL) 3004 continue; 3005 if (!r10_bio->devs[m].bio->bi_status) { 3006 rdev_clear_badblocks( 3007 rdev, 3008 r10_bio->devs[m].addr, 3009 r10_bio->sectors, 0); 3010 } else { 3011 if (!rdev_set_badblocks( 3012 rdev, 3013 r10_bio->devs[m].addr, 3014 r10_bio->sectors, 0)) 3015 md_error(conf->mddev, rdev); 3016 } 3017 rdev = conf->mirrors[dev].replacement; 3018 if (r10_bio->devs[m].repl_bio == NULL || 3019 r10_bio->devs[m].repl_bio->bi_end_io == NULL) 3020 continue; 3021 3022 if (!r10_bio->devs[m].repl_bio->bi_status) { 3023 rdev_clear_badblocks( 3024 rdev, 3025 r10_bio->devs[m].addr, 3026 r10_bio->sectors, 0); 3027 } else { 3028 if (!rdev_set_badblocks( 3029 rdev, 3030 r10_bio->devs[m].addr, 3031 r10_bio->sectors, 0)) 3032 md_error(conf->mddev, rdev); 3033 } 3034 } 3035 put_buf(r10_bio); 3036 
} else { 3037 bool fail = false; 3038 for (m = 0; m < conf->copies; m++) { 3039 int dev = r10_bio->devs[m].devnum; 3040 struct bio *bio = r10_bio->devs[m].bio; 3041 rdev = conf->mirrors[dev].rdev; 3042 if (bio == IO_MADE_GOOD) { 3043 rdev_clear_badblocks( 3044 rdev, 3045 r10_bio->devs[m].addr, 3046 r10_bio->sectors, 0); 3047 rdev_dec_pending(rdev, conf->mddev); 3048 } else if (bio != NULL && bio->bi_status) { 3049 fail = true; 3050 if (!narrow_write_error(r10_bio, m)) { 3051 md_error(conf->mddev, rdev); 3052 set_bit(R10BIO_Degraded, 3053 &r10_bio->state); 3054 } 3055 rdev_dec_pending(rdev, conf->mddev); 3056 } 3057 bio = r10_bio->devs[m].repl_bio; 3058 rdev = conf->mirrors[dev].replacement; 3059 if (rdev && bio == IO_MADE_GOOD) { 3060 rdev_clear_badblocks( 3061 rdev, 3062 r10_bio->devs[m].addr, 3063 r10_bio->sectors, 0); 3064 rdev_dec_pending(rdev, conf->mddev); 3065 } 3066 } 3067 if (fail) { 3068 spin_lock_irq(&conf->device_lock); 3069 list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 3070 conf->nr_queued++; 3071 spin_unlock_irq(&conf->device_lock); 3072 /* 3073 * In case freeze_array() is waiting for condition 3074 * nr_pending == nr_queued + extra to be true. 
3075 */ 3076 wake_up(&conf->wait_barrier); 3077 md_wakeup_thread(conf->mddev->thread); 3078 } else { 3079 if (test_bit(R10BIO_WriteError, 3080 &r10_bio->state)) 3081 close_write(r10_bio); 3082 raid_end_bio_io(r10_bio); 3083 } 3084 } 3085 } 3086 3087 static void raid10d(struct md_thread *thread) 3088 { 3089 struct mddev *mddev = thread->mddev; 3090 struct r10bio *r10_bio; 3091 unsigned long flags; 3092 struct r10conf *conf = mddev->private; 3093 struct list_head *head = &conf->retry_list; 3094 struct blk_plug plug; 3095 3096 md_check_recovery(mddev); 3097 3098 if (!list_empty_careful(&conf->bio_end_io_list) && 3099 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3100 LIST_HEAD(tmp); 3101 spin_lock_irqsave(&conf->device_lock, flags); 3102 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3103 while (!list_empty(&conf->bio_end_io_list)) { 3104 list_move(conf->bio_end_io_list.prev, &tmp); 3105 conf->nr_queued--; 3106 } 3107 } 3108 spin_unlock_irqrestore(&conf->device_lock, flags); 3109 while (!list_empty(&tmp)) { 3110 r10_bio = list_first_entry(&tmp, struct r10bio, 3111 retry_list); 3112 list_del(&r10_bio->retry_list); 3113 if (mddev->degraded) 3114 set_bit(R10BIO_Degraded, &r10_bio->state); 3115 3116 if (test_bit(R10BIO_WriteError, 3117 &r10_bio->state)) 3118 close_write(r10_bio); 3119 raid_end_bio_io(r10_bio); 3120 } 3121 } 3122 3123 blk_start_plug(&plug); 3124 for (;;) { 3125 3126 flush_pending_writes(conf); 3127 3128 spin_lock_irqsave(&conf->device_lock, flags); 3129 if (list_empty(head)) { 3130 spin_unlock_irqrestore(&conf->device_lock, flags); 3131 break; 3132 } 3133 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 3134 list_del(head->prev); 3135 conf->nr_queued--; 3136 spin_unlock_irqrestore(&conf->device_lock, flags); 3137 3138 mddev = r10_bio->mddev; 3139 conf = mddev->private; 3140 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 3141 test_bit(R10BIO_WriteError, &r10_bio->state)) 3142 handle_write_completed(conf, r10_bio); 3143 else 
if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 3144 reshape_request_write(mddev, r10_bio); 3145 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 3146 sync_request_write(mddev, r10_bio); 3147 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 3148 recovery_request_write(mddev, r10_bio); 3149 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 3150 handle_read_error(mddev, r10_bio); 3151 else 3152 WARN_ON_ONCE(1); 3153 3154 cond_resched(); 3155 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 3156 md_check_recovery(mddev); 3157 } 3158 blk_finish_plug(&plug); 3159 } 3160 3161 static int init_resync(struct r10conf *conf) 3162 { 3163 int ret, buffs, i; 3164 3165 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 3166 BUG_ON(mempool_initialized(&conf->r10buf_pool)); 3167 conf->have_replacement = 0; 3168 for (i = 0; i < conf->geo.raid_disks; i++) 3169 if (conf->mirrors[i].replacement) 3170 conf->have_replacement = 1; 3171 ret = mempool_init(&conf->r10buf_pool, buffs, 3172 r10buf_pool_alloc, r10buf_pool_free, conf); 3173 if (ret) 3174 return ret; 3175 conf->next_resync = 0; 3176 return 0; 3177 } 3178 3179 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) 3180 { 3181 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); 3182 struct rsync_pages *rp; 3183 struct bio *bio; 3184 int nalloc; 3185 int i; 3186 3187 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 3188 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 3189 nalloc = conf->copies; /* resync */ 3190 else 3191 nalloc = 2; /* recovery */ 3192 3193 for (i = 0; i < nalloc; i++) { 3194 bio = r10bio->devs[i].bio; 3195 rp = bio->bi_private; 3196 bio_reset(bio, NULL, 0); 3197 bio->bi_private = rp; 3198 bio = r10bio->devs[i].repl_bio; 3199 if (bio) { 3200 rp = bio->bi_private; 3201 bio_reset(bio, NULL, 0); 3202 bio->bi_private = rp; 3203 } 3204 } 3205 return r10bio; 3206 } 3207 3208 /* 3209 * Set cluster_sync_high since we need other nodes to add the 3210 * range 
[cluster_sync_low, cluster_sync_high] to suspend list.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
	int nr_chunks;
	sector_t span;

	/*
	 * A "stripe" here is one pass across all member devices, i.e.
	 * raid_disks / near_copies chunks.  Sizing the window by stripe
	 * keeps it from growing linearly with raid_disks when near_copies
	 * is close to raid_disks, which would suspend far more IO than
	 * necessary.  When the division leaves a remainder, one more chunk
	 * is added so that the whole stripe is covered.
	 */
	nr_chunks = conf->geo.raid_disks / conf->geo.near_copies;
	if (conf->geo.raid_disks % conf->geo.near_copies != 0)
		nr_chunks += 1;
	span = nr_chunks * conf->mddev->chunk_sectors;

	/* Never go below CLUSTER_RESYNC_WINDOW_SECTORS (raid1's window) */
	if (span < CLUSTER_RESYNC_WINDOW_SECTORS)
		span = CLUSTER_RESYNC_WINDOW_SECTORS;

	conf->cluster_sync_high = conf->cluster_sync_low + span;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
* For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we setup these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 *
 */

static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
			     int *skipped)
{
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	struct bio *biolist = NULL, *bio;
	sector_t max_sector, nr_sectors;
	int i;
	int max_sync;
	sector_t sync_blocks;
	sector_t sectors_skipped = 0;
	int chunks_skipped = 0;
	sector_t chunk_mask = conf->geo.chunk_mask;
	int page_idx = 0;

	/* The resync buffer pool is created lazily on the first request */
	if (!mempool_initialized(&conf->r10buf_pool))
		if (init_resync(conf))
			return 0;

	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
	if (mddev->bitmap == NULL &&
	    mddev->recovery_cp == MaxSector &&
	    mddev->reshape_position == MaxSector &&
	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    conf->fullsync == 0) {
		*skipped = 1;
		return mddev->dev_sectors - sector_nr;
	}

	/* Re-entered from 'giveup' below with sector_nr advanced */
 skipped:
	max_sector = mddev->dev_sectors;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sector = mddev->resync_max_sectors;
	if (sector_nr >= max_sector) {
		conf->cluster_sync_low = 0;
		conf->cluster_sync_high = 0;

		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices).
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
			end_reshape(conf);
			close_sync(conf);
			return 0;
		}

		if (mddev->curr_resync < max_sector) { /* aborted */
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
				md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
						   &sync_blocks, 1);
			else for (i = 0; i < conf->geo.raid_disks; i++) {
				sector_t sect =
					raid10_find_virt(conf, mddev->curr_resync, i);
				md_bitmap_end_sync(mddev->bitmap, sect,
						   &sync_blocks, 1);
			}
		} else {
			/* completed sync */
			if ((!mddev->bitmap || conf->fullsync)
			    && conf->have_replacement
			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
				rcu_read_lock();
				for (i = 0; i < conf->geo.raid_disks; i++) {
					struct md_rdev *rdev =
						rcu_dereference(conf->mirrors[i].replacement);
					if (rdev)
						rdev->recovery_offset = MaxSector;
				}
				rcu_read_unlock();
			}
			conf->fullsync = 0;
		}
		md_bitmap_close_sync(mddev->bitmap);
		close_sync(conf);
		*skipped = 1;
		return sectors_skipped;
	}

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return reshape_request(mddev, sector_nr, skipped);

	if (chunks_skipped >= conf->geo.raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all..
		 */
		*skipped = 1;
		return (max_sector - sector_nr) + sectors_skipped;
	}

	if (max_sector > mddev->resync_max)
		max_sector = mddev->resync_max; /* Don't do IO beyond here */

	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
	if (conf->geo.near_copies < conf->geo.raid_disks &&
	    max_sector > (sector_nr | chunk_mask))
		max_sector = (sector_nr | chunk_mask) + 1;

	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
	if (conf->nr_waiting)
		schedule_timeout_uninterruptible(1);

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * To end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

	/* Upper bound for one request: RESYNC_PAGES pages, in sectors */
	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
		int j;
		r10_bio = NULL;

		for (i = 0 ; i < conf->geo.raid_disks; i++) {
			int still_degraded;
			struct r10bio *rb2;
			sector_t sect;
			int must_sync;
			int any_working;
			int need_recover = 0;
			int need_replace = 0;
			struct raid10_info *mirror = &conf->mirrors[i];
			struct md_rdev *mrdev, *mreplace;

			rcu_read_lock();
			mrdev = rcu_dereference(mirror->rdev);
			mreplace = rcu_dereference(mirror->replacement);

			if (mrdev != NULL &&
			    !test_bit(Faulty, &mrdev->flags) &&
			    !test_bit(In_sync, &mrdev->flags))
				need_recover = 1;
			if (mreplace != NULL &&
			    !test_bit(Faulty, &mreplace->flags))
				need_replace = 1;

			if (!need_recover && !need_replace) {
				rcu_read_unlock();
				continue;
			}

			still_degraded = 0;
			/* want to reconstruct this device */
			rb2 = r10_bio;
			sect = raid10_find_virt(conf, sector_nr, i);
			if (sect >= mddev->resync_max_sectors) {
				/* last stripe is not complete - don't
				 * try to recover this sector.
				 */
				rcu_read_unlock();
				continue;
			}
			/*
			 * NOTE(review): need_replace was computed from an
			 * earlier Faulty test; if the flag changes before this
			 * re-test, mreplace becomes NULL while need_replace
			 * stays 1 and mreplace is dereferenced further down.
			 * Also mrdev is dereferenced below without a NULL
			 * check - confirm both cannot happen.
			 */
			if (mreplace && test_bit(Faulty, &mreplace->flags))
				mreplace = NULL;
			/* Unless we are doing a full sync, or a replacement
			 * we only need to recover the block if it is set in
			 * the bitmap
			 */
			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
							 &sync_blocks, 1);
			if (sync_blocks < max_sync)
				max_sync = sync_blocks;
			if (!must_sync &&
			    mreplace == NULL &&
			    !conf->fullsync) {
				/* yep, skip the sync_blocks here, but don't assume
				 * that there will never be anything to do here
				 */
				chunks_skipped = -1;
				rcu_read_unlock();
				continue;
			}
			/* Pin the target device(s) before leaving the RCU section */
			atomic_inc(&mrdev->nr_pending);
			if (mreplace)
				atomic_inc(&mreplace->nr_pending);
			rcu_read_unlock();

			r10_bio = raid10_alloc_init_r10buf(conf);
			r10_bio->state = 0;
			raise_barrier(conf, rb2 != NULL);
			atomic_set(&r10_bio->remaining, 0);

			/* Chain to the previously built r10bio (see header comment) */
			r10_bio->master_bio = (struct bio*)rb2;
			if (rb2)
				atomic_inc(&rb2->remaining);
			r10_bio->mddev = mddev;
			set_bit(R10BIO_IsRecover, &r10_bio->state);
			r10_bio->sector = sect;

			raid10_find_phys(conf, r10_bio);

			/* Need to check if the array will still be
			 * degraded
			 */
			rcu_read_lock();
			for (j = 0; j < conf->geo.raid_disks; j++) {
				struct md_rdev *rdev = rcu_dereference(
					conf->mirrors[j].rdev);
				if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
					still_degraded = 1;
					break;
				}
			}

			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
							 &sync_blocks, still_degraded);

			any_working = 0;
			/* Look for the first copy we can read this range from */
			for (j=0; j<conf->copies;j++) {
				int k;
				int d = r10_bio->devs[j].devnum;
				sector_t from_addr, to_addr;
				struct md_rdev *rdev =
					rcu_dereference(conf->mirrors[d].rdev);
				sector_t sector, first_bad;
				int bad_sectors;
				if (!rdev ||
				    !test_bit(In_sync, &rdev->flags))
					continue;
				/* This is where we read from */
				any_working = 1;
				sector = r10_bio->devs[j].addr;

				if (is_badblock(rdev, sector, max_sync,
						&first_bad, &bad_sectors)) {
					if (first_bad > sector)
						max_sync = first_bad - sector;
					else {
						bad_sectors -= (sector
								- first_bad);
						if (max_sync > bad_sectors)
							max_sync = bad_sectors;
						continue;
					}
				}
				bio = r10_bio->devs[0].bio;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_read;
				bio_set_op_attrs(bio, REQ_OP_READ, 0);
				if (test_bit(FailFast, &rdev->flags))
					bio->bi_opf |= MD_FAILFAST;
				from_addr = r10_bio->devs[j].addr;
				bio->bi_iter.bi_sector = from_addr +
					rdev->data_offset;
				bio_set_dev(bio, rdev->bdev);
				atomic_inc(&rdev->nr_pending);
				/* and we write to 'i' (if not in_sync) */

				for (k=0; k<conf->copies; k++)
					if (r10_bio->devs[k].devnum == i)
						break;
				BUG_ON(k == conf->copies);
				to_addr = r10_bio->devs[k].addr;
				/* Slot 0 becomes the read source, slot 1 the write target */
				r10_bio->devs[0].devnum = d;
				r10_bio->devs[0].addr = from_addr;
				r10_bio->devs[1].devnum = i;
				r10_bio->devs[1].addr = to_addr;

				if (need_recover) {
					bio = r10_bio->devs[1].bio;
					bio->bi_next = biolist;
					biolist = bio;
					bio->bi_end_io = end_sync_write;
					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
					bio->bi_iter.bi_sector = to_addr
						+ mrdev->data_offset;
					bio_set_dev(bio, mrdev->bdev);
					atomic_inc(&r10_bio->remaining);
				} else
					r10_bio->devs[1].bio->bi_end_io = NULL;

				/* and maybe write to replacement */
				bio = r10_bio->devs[1].repl_bio;
				if (bio)
					bio->bi_end_io = NULL;
				/* Note: if need_replace, then bio
				 * cannot be NULL as r10buf_pool_alloc will
				 * have allocated it.
				 */
				if (!need_replace)
					break;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_write;
				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
				bio->bi_iter.bi_sector = to_addr +
					mreplace->data_offset;
				bio_set_dev(bio, mreplace->bdev);
				atomic_inc(&r10_bio->remaining);
				break;
			}
			rcu_read_unlock();
			/* j == copies means the loop found no usable read source */
			if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
				if (any_working) {
					/* problem is that there are bad blocks
					 * on other device(s)
					 */
					int k;
					for (k = 0; k < conf->copies; k++)
						if (r10_bio->devs[k].devnum == i)
							break;
					if (!test_bit(In_sync,
						      &mrdev->flags)
					    && !rdev_set_badblocks(
						    mrdev,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
					if (mreplace &&
					    !rdev_set_badblocks(
						    mreplace,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
				}
				if (!any_working)  {
					if (!test_and_set_bit(MD_RECOVERY_INTR,
							      &mddev->recovery))
						pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
						       mdname(mddev));
					mirror->recovery_disabled
						= mddev->recovery_disabled;
				}
				/* Undo the allocation and chaining done above */
				put_buf(r10_bio);
				if (rb2)
					atomic_dec(&rb2->remaining);
				r10_bio = rb2;
				rdev_dec_pending(mrdev, mddev);
				if (mreplace)
					rdev_dec_pending(mreplace, mddev);
				break;
			}
			rdev_dec_pending(mrdev, mddev);
			if (mreplace)
				rdev_dec_pending(mreplace, mddev);
			if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
				/* Only want this if there is elsewhere to
				 * read from. 'j' is currently the first
				 * readable copy.
				 */
				int targets = 1;
				for (; j < conf->copies; j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev &&
					    test_bit(In_sync,
						      &conf->mirrors[d].rdev->flags))
						targets++;
				}
				if (targets == 1)
					r10_bio->devs[0].bio->bi_opf
						&= ~MD_FAILFAST;
			}
		}
		if (biolist == NULL) {
			/* Nothing was scheduled: release the whole r10bio chain */
			while (r10_bio) {
				struct r10bio *rb2 = r10_bio;
				r10_bio = (struct r10bio*) rb2->master_bio;
				rb2->master_bio = NULL;
				put_buf(rb2);
			}
			goto giveup;
		}
	} else {
		/* resync. Schedule a read for every block at this virt offset */
		int count = 0;

		/*
		 * Since curr_resync_completed could probably not update in
		 * time, and we will set cluster_sync_low based on it.
		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
		 * safety reason, which ensures curr_resync_completed is
		 * updated in bitmap_cond_end_sync.
		 */
		md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
					mddev_is_clustered(mddev) &&
					(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));

		if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
					  &sync_blocks, mddev->degraded) &&
		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
						 &mddev->recovery)) {
			/* We can skip this block */
			*skipped = 1;
			return sync_blocks + sectors_skipped;
		}
		if (sync_blocks < max_sync)
			max_sync = sync_blocks;
		r10_bio = raid10_alloc_init_r10buf(conf);
		r10_bio->state = 0;

		r10_bio->mddev = mddev;
		atomic_set(&r10_bio->remaining, 0);
		raise_barrier(conf, 0);
		conf->next_resync = sector_nr;

		r10_bio->master_bio = NULL;
		r10_bio->sector = sector_nr;
		set_bit(R10BIO_IsSync, &r10_bio->state);
		raid10_find_phys(conf, r10_bio);
		r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;

		for (i = 0; i < conf->copies; i++) {
			int d = r10_bio->devs[i].devnum;
			sector_t first_bad, sector;
			int bad_sectors;
			struct md_rdev *rdev;

			if (r10_bio->devs[i].repl_bio)
				r10_bio->devs[i].repl_bio->bi_end_io = NULL;

			bio = r10_bio->devs[i].bio;
			/* Status is cleared again just before submission below */
			bio->bi_status = BLK_STS_IOERR;
			rcu_read_lock();
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
				rcu_read_unlock();
				continue;
			}
			sector = r10_bio->devs[i].addr;
			if (is_badblock(rdev, sector, max_sync,
					&first_bad, &bad_sectors)) {
				if (first_bad > sector)
					max_sync = first_bad - sector;
				else {
					bad_sectors -= (sector - first_bad);
					if (max_sync > bad_sectors)
						max_sync = bad_sectors;
					rcu_read_unlock();
					continue;
				}
			}
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&r10_bio->remaining);
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_read;
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;

			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
				rcu_read_unlock();
				continue;
			}
			atomic_inc(&rdev->nr_pending);

			/* Need to set up for writing to the replacement */
			bio = r10_bio->devs[i].repl_bio;
			bio->bi_status = BLK_STS_IOERR;

			sector = r10_bio->devs[i].addr;
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_write;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;
			rcu_read_unlock();
		}

		/* Fewer than two bios scheduled: drop the references and give up */
		if (count < 2) {
			for (i=0; i<conf->copies; i++) {
				int d = r10_bio->devs[i].devnum;
				if (r10_bio->devs[i].bio->bi_end_io)
					rdev_dec_pending(conf->mirrors[d].rdev,
							 mddev);
				if (r10_bio->devs[i].repl_bio &&
				    r10_bio->devs[i].repl_bio->bi_end_io)
					rdev_dec_pending(
						conf->mirrors[d].replacement,
						mddev);
			}
			put_buf(r10_bio);
			biolist = NULL;
			goto giveup;
		}
	}

	/* Attach pages to every bio on biolist, one page per iteration */
	nr_sectors = 0;
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		for (bio= biolist ; bio ; bio=bio->bi_next) {
			struct resync_pages *rp = get_resync_pages(bio);
			page = resync_fetch_page(rp, page_idx);
			/*
			 * won't fail because the vec table is big enough
			 * to hold all these pages
			 */
			bio_add_page(bio, page, len, 0);
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
	} while (++page_idx < RESYNC_PAGES);
	r10_bio->sectors = nr_sectors;

	if (mddev_is_clustered(mddev) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* It is resync not recovery */
		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
			conf->cluster_sync_low = mddev->curr_resync_completed;
			raid10_set_cluster_sync_high(conf);
			/* Send resync message */
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	} else if (mddev_is_clustered(mddev)) {
		/* This is recovery not resync */
		sector_t sect_va1, sect_va2;
		bool broadcast_msg = false;

		for (i = 0; i < conf->geo.raid_disks; i++) {
			/*
			 * sector_nr is a device address for recovery, so we
			 * need translate it to array address before compare
			 * with cluster_sync_high.
			 */
			sect_va1 = raid10_find_virt(conf, sector_nr, i);

			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
				broadcast_msg = true;
				/*
				 * curr_resync_completed is similar as
				 * sector_nr, so make the translation too.
				 */
				sect_va2 = raid10_find_virt(conf,
					mddev->curr_resync_completed, i);

				if (conf->cluster_sync_low == 0 ||
				    conf->cluster_sync_low > sect_va2)
					conf->cluster_sync_low = sect_va2;
			}
		}
		if (broadcast_msg) {
			raid10_set_cluster_sync_high(conf);
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	}

	/* Submit the reads; bios with any other bi_end_io are not submitted here */
	while (biolist) {
		bio = biolist;
		biolist = biolist->bi_next;

		bio->bi_next = NULL;
		r10_bio = get_resync_r10bio(bio);
		r10_bio->sectors = nr_sectors;

		if (bio->bi_end_io == end_sync_read) {
			md_sync_acct_bio(bio, nr_sectors);
			bio->bi_status = 0;
			submit_bio_noacct(bio);
		}
	}

	if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
		md_done_sync(mddev, sectors_skipped, 1);

	return sectors_skipped + nr_sectors;
 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;

	sectors_skipped += (max_sector - sector_nr);
	chunks_skipped ++;
	sector_nr = max_sector;
	goto skipped;
}

/*
 * raid10_size - array size in sectors for a given per-device size and
 * device count.
 *
 * A zero 'raid_disks' selects the smaller of the current and previous
 * geometry's raid_disks; a zero 'sectors' selects conf->dev_sectors.
 * The computation is done in whole chunks, so the result is a multiple
 * of the chunk size.
 */
static sector_t
raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	sector_t size;
	struct r10conf *conf = mddev->private;

	if (!raid_disks)
		raid_disks = min(conf->geo.raid_disks,
				 conf->prev.raid_disks);
	if (!sectors)
		sectors = conf->dev_sectors;

	size = sectors >> conf->geo.chunk_shift;
	sector_div(size, conf->geo.far_copies);
	size = size * raid_disks;
	sector_div(size, conf->geo.near_copies);

	return size << conf->geo.chunk_shift;
}

static void calc_sectors(struct r10conf *conf, sector_t size)
{
	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->stride
	 */

	size = size >> conf->geo.chunk_shift;
	sector_div(size, conf->geo.far_copies);
	size = size * conf->geo.raid_disks;
	sector_div(size, conf->geo.near_copies);
	/* 'size' is now the number of chunks in the array */
	/* calculate "used chunks per device" */
	size = size * conf->copies;

	/* We need to round up when dividing by raid_disks to
	 * get the stride size.
3937 */ 3938 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3939 3940 conf->dev_sectors = size << conf->geo.chunk_shift; 3941 3942 if (conf->geo.far_offset) 3943 conf->geo.stride = 1 << conf->geo.chunk_shift; 3944 else { 3945 sector_div(size, conf->geo.far_copies); 3946 conf->geo.stride = size << conf->geo.chunk_shift; 3947 } 3948 } 3949 3950 enum geo_type {geo_new, geo_old, geo_start}; 3951 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3952 { 3953 int nc, fc, fo; 3954 int layout, chunk, disks; 3955 switch (new) { 3956 case geo_old: 3957 layout = mddev->layout; 3958 chunk = mddev->chunk_sectors; 3959 disks = mddev->raid_disks - mddev->delta_disks; 3960 break; 3961 case geo_new: 3962 layout = mddev->new_layout; 3963 chunk = mddev->new_chunk_sectors; 3964 disks = mddev->raid_disks; 3965 break; 3966 default: /* avoid 'may be unused' warnings */ 3967 case geo_start: /* new when starting reshape - raid_disks not 3968 * updated yet. */ 3969 layout = mddev->new_layout; 3970 chunk = mddev->new_chunk_sectors; 3971 disks = mddev->raid_disks + mddev->delta_disks; 3972 break; 3973 } 3974 if (layout >> 19) 3975 return -1; 3976 if (chunk < (PAGE_SIZE >> 9) || 3977 !is_power_of_2(chunk)) 3978 return -2; 3979 nc = layout & 255; 3980 fc = (layout >> 8) & 255; 3981 fo = layout & (1<<16); 3982 geo->raid_disks = disks; 3983 geo->near_copies = nc; 3984 geo->far_copies = fc; 3985 geo->far_offset = fo; 3986 switch (layout >> 17) { 3987 case 0: /* original layout. simple but not always optimal */ 3988 geo->far_set_size = disks; 3989 break; 3990 case 1: /* "improved" layout which was buggy. 
Hopefully no-one is 3991 * actually using this, but leave code here just in case.*/ 3992 geo->far_set_size = disks/fc; 3993 WARN(geo->far_set_size < fc, 3994 "This RAID10 layout does not provide data safety - please backup and create new array\n"); 3995 break; 3996 case 2: /* "improved" layout fixed to match documentation */ 3997 geo->far_set_size = fc * nc; 3998 break; 3999 default: /* Not a valid layout */ 4000 return -1; 4001 } 4002 geo->chunk_mask = chunk - 1; 4003 geo->chunk_shift = ffz(~chunk); 4004 return nc*fc; 4005 } 4006 4007 static struct r10conf *setup_conf(struct mddev *mddev) 4008 { 4009 struct r10conf *conf = NULL; 4010 int err = -EINVAL; 4011 struct geom geo; 4012 int copies; 4013 4014 copies = setup_geo(&geo, mddev, geo_new); 4015 4016 if (copies == -2) { 4017 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", 4018 mdname(mddev), PAGE_SIZE); 4019 goto out; 4020 } 4021 4022 if (copies < 2 || copies > mddev->raid_disks) { 4023 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 4024 mdname(mddev), mddev->new_layout); 4025 goto out; 4026 } 4027 4028 err = -ENOMEM; 4029 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 4030 if (!conf) 4031 goto out; 4032 4033 /* FIXME calc properly */ 4034 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), 4035 sizeof(struct raid10_info), 4036 GFP_KERNEL); 4037 if (!conf->mirrors) 4038 goto out; 4039 4040 conf->tmppage = alloc_page(GFP_KERNEL); 4041 if (!conf->tmppage) 4042 goto out; 4043 4044 conf->geo = geo; 4045 conf->copies = copies; 4046 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, 4047 rbio_pool_free, conf); 4048 if (err) 4049 goto out; 4050 4051 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 4052 if (err) 4053 goto out; 4054 4055 calc_sectors(conf, mddev->dev_sectors); 4056 if (mddev->reshape_position == MaxSector) { 4057 conf->prev = conf->geo; 4058 conf->reshape_progress = MaxSector; 4059 } else { 4060 
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 4061 err = -EINVAL; 4062 goto out; 4063 } 4064 conf->reshape_progress = mddev->reshape_position; 4065 if (conf->prev.far_offset) 4066 conf->prev.stride = 1 << conf->prev.chunk_shift; 4067 else 4068 /* far_copies must be 1 */ 4069 conf->prev.stride = conf->dev_sectors; 4070 } 4071 conf->reshape_safe = conf->reshape_progress; 4072 spin_lock_init(&conf->device_lock); 4073 INIT_LIST_HEAD(&conf->retry_list); 4074 INIT_LIST_HEAD(&conf->bio_end_io_list); 4075 4076 seqlock_init(&conf->resync_lock); 4077 init_waitqueue_head(&conf->wait_barrier); 4078 atomic_set(&conf->nr_pending, 0); 4079 4080 err = -ENOMEM; 4081 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 4082 if (!conf->thread) 4083 goto out; 4084 4085 conf->mddev = mddev; 4086 return conf; 4087 4088 out: 4089 if (conf) { 4090 mempool_exit(&conf->r10bio_pool); 4091 kfree(conf->mirrors); 4092 safe_put_page(conf->tmppage); 4093 bioset_exit(&conf->bio_split); 4094 kfree(conf); 4095 } 4096 return ERR_PTR(err); 4097 } 4098 4099 static void raid10_set_io_opt(struct r10conf *conf) 4100 { 4101 int raid_disks = conf->geo.raid_disks; 4102 4103 if (!(conf->geo.raid_disks % conf->geo.near_copies)) 4104 raid_disks /= conf->geo.near_copies; 4105 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 4106 raid_disks); 4107 } 4108 4109 static int raid10_run(struct mddev *mddev) 4110 { 4111 struct r10conf *conf; 4112 int i, disk_idx; 4113 struct raid10_info *disk; 4114 struct md_rdev *rdev; 4115 sector_t size; 4116 sector_t min_offset_diff = 0; 4117 int first = 1; 4118 4119 if (mddev_init_writes_pending(mddev) < 0) 4120 return -ENOMEM; 4121 4122 if (mddev->private == NULL) { 4123 conf = setup_conf(mddev); 4124 if (IS_ERR(conf)) 4125 return PTR_ERR(conf); 4126 mddev->private = conf; 4127 } 4128 conf = mddev->private; 4129 if (!conf) 4130 goto out; 4131 4132 if (mddev_is_clustered(conf->mddev)) { 4133 int fc, fo; 4134 4135 fc = (mddev->layout >> 
8) & 255;
		fo = mddev->layout & (1<<16);
		if (fc > 1 || fo > 0) {
			pr_err("only near layout is supported by clustered"
				" raid10\n");
			goto out_free_conf;
		}
	}

	/* Hand the personality thread over from conf to the mddev */
	mddev->thread = conf->thread;
	conf->thread = NULL;

	if (mddev->queue) {
		blk_queue_max_discard_sectors(mddev->queue,
					      UINT_MAX);
		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
		raid10_set_io_opt(conf);
	}

	/*
	 * Attach every member device to its mirror slot and track the
	 * smallest usable gap between old and new data offsets.
	 */
	rdev_for_each(rdev, mddev) {
		long long diff;

		disk_idx = rdev->raid_disk;
		if (disk_idx < 0)
			continue;
		if (disk_idx >= conf->geo.raid_disks &&
		    disk_idx >= conf->prev.raid_disks)
			continue;
		disk = conf->mirrors + disk_idx;

		if (test_bit(Replacement, &rdev->flags)) {
			/* at most one replacement per slot */
			if (disk->replacement)
				goto out_free_conf;
			disk->replacement = rdev;
		} else {
			/* at most one primary device per slot */
			if (disk->rdev)
				goto out_free_conf;
			disk->rdev = rdev;
		}
		diff = (rdev->new_data_offset - rdev->data_offset);
		if (!mddev->reshape_backwards)
			diff = -diff;
		if (diff < 0)
			diff = 0;
		if (first || diff < min_offset_diff)
			min_offset_diff = diff;

		if (mddev->gendisk)
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);

		disk->head_position = 0;
		first = 0;
	}

	/* need to check that every block has at least one working mirror */
	if (!enough(conf, -1)) {
		pr_err("md/raid10:%s: not enough operational mirrors.\n",
		       mdname(mddev));
		goto out_free_conf;
	}

	if (conf->reshape_progress != MaxSector) {
		/* must ensure that shape change is supported */
		if (conf->geo.far_copies != 1 &&
		    conf->geo.far_offset == 0)
			goto out_free_conf;
		if (conf->prev.far_copies != 1 &&
		    conf->prev.far_offset == 0)
			goto out_free_conf;
	}

	/* Count degraded slots over the larger of the old and new geometry */
	mddev->degraded = 0;
	for (i = 0;
	     i < conf->geo.raid_disks
		     || i < conf->prev.raid_disks;
	     i++) {

		disk = conf->mirrors + i;

		if (!disk->rdev && disk->replacement) {
			/* The replacement is all we have - use it */
			disk->rdev = disk->replacement;
			disk->replacement = NULL;
			clear_bit(Replacement, &disk->rdev->flags);
		}

		if (!disk->rdev ||
		    !test_bit(In_sync, &disk->rdev->flags)) {
			disk->head_position = 0;
			mddev->degraded++;
			if (disk->rdev &&
			    disk->rdev->saved_raid_disk < 0)
				conf->fullsync = 1;
		}

		if (disk->replacement &&
		    !test_bit(In_sync, &disk->replacement->flags) &&
		    disk->replacement->saved_raid_disk < 0) {
			conf->fullsync = 1;
		}

		disk->recovery_disabled = mddev->recovery_disabled - 1;
	}

	if (mddev->recovery_cp != MaxSector)
		pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
			  mdname(mddev));
	pr_info("md/raid10:%s: active with %d out of %d devices\n",
		mdname(mddev), conf->geo.raid_disks - mddev->degraded,
		conf->geo.raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	mddev->dev_sectors = conf->dev_sectors;
	size = raid10_size(mddev, 0, 0);
	md_set_array_sectors(mddev, size);
	mddev->resync_max_sectors = size;
	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

	if (md_integrity_register(mddev))
		goto out_free_conf;

	if (conf->reshape_progress != MaxSector) {
		/* A reshape was in progress: validate it and restart it */
		unsigned long before_length, after_length;

		before_length = ((1 << conf->prev.chunk_shift) *
				 conf->prev.far_copies);
		after_length = ((1 << conf->geo.chunk_shift) *
				conf->geo.far_copies);

		if (max(before_length, after_length) > min_offset_diff) {
			/* This cannot work */
			pr_warn("md/raid10: offset difference not enough to continue reshape\n");
			goto out_free_conf;
		}
		conf->offset_diff = min_offset_diff;

		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
							"reshape");
		if (!mddev->sync_thread)
			goto out_free_conf;
	}

	return 0;

out_free_conf:
	/*
	 * NOTE(review): raid10_free() additionally releases conf->bio_split,
	 * conf->mirrors_old and conf->mirrors_new; this path does not.
	 * Confirm whether any of those can be live when we get here.
	 */
	md_unregister_thread(&mddev->thread);
	mempool_exit(&conf->r10bio_pool);
	safe_put_page(conf->tmppage);
	kfree(conf->mirrors);
	kfree(conf);
	mddev->private = NULL;
out:
	return -EIO;
}

/*
 * Release everything hanging off the r10conf passed as @priv: the r10bio
 * mempool, the temporary page, all three mirror arrays, the split bioset
 * and finally the conf itself.
 */
static void raid10_free(struct mddev *mddev, void *priv)
{
	struct r10conf *conf = priv;

	mempool_exit(&conf->r10bio_pool);
	safe_put_page(conf->tmppage);
	kfree(conf->mirrors);
	kfree(conf->mirrors_old);
	kfree(conf->mirrors_new);
	bioset_exit(&conf->bio_split);
	kfree(conf);
}

/*
 * Quiesce hook: a non-zero @quiesce raises the barrier, zero lowers it
 * again.
 */
static void raid10_quiesce(struct mddev *mddev, int quiesce)
{
	struct r10conf *conf = mddev->private;

	if (quiesce)
		raise_barrier(conf, 0);
	else
		lower_barrier(conf);
}

/*
 * Change the per-device size used by the array to @sectors.
 *
 * Returns 0 on success, -EBUSY while a reshape position is recorded,
 * -EINVAL for 'far' layouts or when an externally managed array size would
 * no longer fit, or the error returned by md_bitmap_resize().
 */
static int raid10_resize(struct mddev *mddev, sector_t sectors)
{
	/* Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the
	 * number of sectors used to be an appropriate multiple
	 * of the chunk size.
	 * For 'offset', this is far_copies*chunksize.
	 * For 'near' the multiplier is the LCM of
	 * near_copies and raid_disks.
	 * So if far_copies > 1 && !far_offset, fail.
	 * Else find LCM(raid_disks, near_copy)*far_copies and
	 * multiply by chunk_size.  Then round to this number.
	 * This is mostly done by raid10_size()
	 */
	struct r10conf *conf = mddev->private;
	sector_t oldsize, size;

	if (mddev->reshape_position != MaxSector)
		return -EBUSY;

	if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
		return -EINVAL;

	oldsize = raid10_size(mddev, 0, 0);
	size = raid10_size(mddev, sectors, 0);
	if (mddev->external_size &&
	    mddev->array_sectors > size)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, size);
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > oldsize) {
		/* growing: the region beyond the old size still needs a sync */
		mddev->recovery_cp = oldsize;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	calc_sectors(conf, sectors);
	mddev->dev_sectors = conf->dev_sectors;
	mddev->resync_max_sectors = size;
	return 0;
}

/*
 * Prepare @mddev for conversion from raid0 to raid10.
 *
 * @size is divided by @devs to obtain the per-device size. The array is
 * re-described as near_copies = 2, far_copies = 1 with twice the number of
 * raid disks, and a new r10conf is built with setup_conf().
 *
 * Returns the new conf, the ERR_PTR produced by setup_conf(), or
 * ERR_PTR(-EINVAL) when the raid0 array is degraded.
 */
static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
{
	struct md_rdev *rdev;
	struct r10conf *conf;

	if (mddev->degraded > 0) {
		pr_warn("md/raid10:%s: Error: degraded raid0!\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}
	sector_div(size, devs);

	/* Set new parameters */
	mddev->new_level = 10;
	/* new layout: far_copies = 1, near_copies = 2 */
	mddev->new_layout = (1<<8) + 2;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->delta_disks = mddev->raid_disks;
	mddev->raid_disks *= 2;
	/* make sure it will be not marked as dirty */
	mddev->recovery_cp = MaxSector;
	mddev->dev_sectors = size;

	conf = setup_conf(mddev);
	if (!IS_ERR(conf)) {
		/* existing devices move to the even-numbered slots */
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk >= 0) {
				rdev->new_raid_disk = rdev->raid_disk * 2;
				rdev->sectors = size;
			}
		WRITE_ONCE(conf->barrier, 1);
	}

	return conf;
}

/*
 * Takeover hook. Only level 0 arrays are accepted; everything else yields
 * ERR_PTR(-EINVAL).
 */
static void *raid10_takeover(struct mddev *mddev)
{
	struct r0conf *raid0_conf;

	/* raid10 can take over:
	 *  raid0 - providing it has no more than one strip zone
	 */
	if (mddev->level == 0) {
		/* for raid0 takeover only one zone is supported */
		raid0_conf = mddev->private;
		if (raid0_conf->nr_strip_zones > 1) {
			pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
				mdname(mddev));
			return ERR_PTR(-EINVAL);
		}
		return raid10_takeover_raid0(mddev,
			raid0_conf->strip_zone->zone_end,
			raid0_conf->strip_zone->nb_dev);
	}
	return ERR_PTR(-EINVAL);
}

/*
 * Validate a requested reshape and pre-allocate the larger mirrors array
 * when disks are being added.
 *
 * Returns 0 if the request is acceptable, -EINVAL if it is not, or -ENOMEM
 * if the new mirrors array cannot be allocated.
 */
static int raid10_check_reshape(struct mddev *mddev)
{
	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size to change if new is generally acceptable,
	 * allow raid_disks to increase, and allow
	 * a switch between 'near' mode and 'offset' mode.
	 */
	struct r10conf *conf = mddev->private;
	struct geom geo;

	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
		return -EINVAL;

	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* mustn't change number of copies */
		return -EINVAL;
	if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
		return -EINVAL;

	if (mddev->array_sectors & geo.chunk_mask)
			/* not factor of array size */
			return -EINVAL;

	if (!enough(conf, -1))
		return -EINVAL;

	/* drop any array left over from an earlier check */
	kfree(conf->mirrors_new);
	conf->mirrors_new = NULL;
	if (mddev->delta_disks > 0) {
		/* allocate new 'mirrors' list */
		conf->mirrors_new =
			kcalloc(mddev->raid_disks + mddev->delta_disks,
				sizeof(struct raid10_info),
				GFP_KERNEL);
		if (!conf->mirrors_new)
			return -ENOMEM;
	}
	return 0;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 *
 * Returns the larger of the degraded counts computed for the 'prev' and
 * the current geometry.
 */
static int calc_degraded(struct r10conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	/* 'prev' section first */
	for (i = 0; i < conf->prev.raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (!test_bit(In_sync, &rdev->flags))
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
			degraded++;
	}
	rcu_read_unlock();
	/* same disk count before and after: one pass is enough */
	if (conf->geo.raid_disks == conf->prev.raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->geo.raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * else it does.
			 */
			if (conf->geo.raid_disks <= conf->prev.raid_disks)
				degraded2++;
		}
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

/*
 * Commit a previously validated reshape request and start the reshape
 * thread.
 *
 * Returns 0 on success, -EBUSY if recovery is already running, -EINVAL if
 * the request cannot be honoured, -EAGAIN if the sync thread cannot be
 * registered, or the error from resizing the bitmap.
 */
static int raid10_start_reshape(struct mddev *mddev)
{
	/* A 'reshape' has been requested.  This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */

	unsigned long before_length, after_length;
	sector_t min_offset_diff = 0;
	int first = 1;
	struct geom new;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	int ret;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (setup_geo(&new, mddev, geo_start) != conf->copies)
		return -EINVAL;

	/*
	 * NOTE(review): after_length is derived from conf->geo, which is only
	 * updated by the setup_geo() call further down; 'new' is used for the
	 * copies check alone.  Confirm whether new.chunk_shift and
	 * new.far_copies were intended here.
	 */
	before_length = ((1 << conf->prev.chunk_shift) *
			 conf->prev.far_copies);
	after_length = ((1 << conf->geo.chunk_shift) *
			conf->geo.far_copies);

	/* Count spares and find the smallest usable data-offset gap */
	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0) {
			long long diff = (rdev->new_data_offset
					  - rdev->data_offset);
			if (!mddev->reshape_backwards)
				diff = -diff;
			if (diff < 0)
				diff = 0;
			if (first || diff < min_offset_diff)
				min_offset_diff = diff;
			first = 0;
		}
	}

	if (max(before_length, after_length) > min_offset_diff)
		return -EINVAL;

	if (spares < mddev->delta_disks)
		return -EINVAL;

	conf->offset_diff = min_offset_diff;
	spin_lock_irq(&conf->device_lock);
	if (conf->mirrors_new) {
		/* switch to the array allocated by raid10_check_reshape() */
		memcpy(conf->mirrors_new, conf->mirrors,
		       sizeof(struct raid10_info)*conf->prev.raid_disks);
		smp_mb();
		kfree(conf->mirrors_old);
		conf->mirrors_old = conf->mirrors;
		conf->mirrors = conf->mirrors_new;
		conf->mirrors_new = NULL;
	}
	setup_geo(&conf->geo, mddev, geo_start);
	smp_mb();
	if (mddev->reshape_backwards) {
		sector_t size = raid10_size(mddev, 0, 0);
		if (size < mddev->array_sectors) {
			spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
				mdname(mddev));
			return -EINVAL;
		}
		mddev->resync_max_sectors = size;
		conf->reshape_progress = size;
	} else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	if (mddev->delta_disks && mddev->bitmap) {
		struct mdp_superblock_1 *sb = NULL;
		sector_t oldsize, newsize;

		oldsize = raid10_size(mddev, 0, 0);
		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);

		if (!mddev_is_clustered(mddev)) {
			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
			if (ret)
				goto abort;
			else
				goto out;
		}

		/* use the superblock of the last working member found */
		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk > -1 &&
			    !test_bit(Faulty, &rdev->flags))
				sb = page_address(rdev->sb_page);
		}

		/*
		 * some node is already performing reshape, and no need to
		 * call md_bitmap_resize again since it should be called when
		 * receiving BITMAP_RESIZE msg
		 */
		if ((sb && (le32_to_cpu(sb->feature_map) &
			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
			goto out;

		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
		if (ret)
			goto abort;

		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
		if (ret) {
			/* roll the local bitmap back to its old size */
			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
			goto abort;
		}
	}
out:
	if (mddev->delta_disks > 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid10_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk >=
					    conf->prev.raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					/* Failure here is OK */
					sysfs_link_rdev(mddev, rdev);
				}
			} else if (rdev->raid_disk >= conf->prev.raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}
	}
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and  post numbers.
	 */
	spin_lock_irq(&conf->device_lock);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irq(&conf->device_lock);
	mddev->raid_disks = conf->geo.raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);

	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		ret = -EAGAIN;
		goto abort;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event();
	return 0;

abort:
	/* Undo the geometry switch and forget the reshape */
	mddev->recovery = 0;
	spin_lock_irq(&conf->device_lock);
	conf->geo = conf->prev;
	mddev->raid_disks = conf->geo.raid_disks;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	mddev->reshape_position = MaxSector;
	spin_unlock_irq(&conf->device_lock);
	return ret;
}

/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
4722 */ 4723 static sector_t last_dev_address(sector_t s, struct geom *geo) 4724 { 4725 s = (s | geo->chunk_mask) + 1; 4726 s >>= geo->chunk_shift; 4727 s *= geo->near_copies; 4728 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4729 s *= geo->far_copies; 4730 s <<= geo->chunk_shift; 4731 return s; 4732 } 4733 4734 /* Calculate the first device-address that could contain 4735 * any block from the chunk that includes the array-address 's'. 4736 * This too will be the start of a chunk 4737 */ 4738 static sector_t first_dev_address(sector_t s, struct geom *geo) 4739 { 4740 s >>= geo->chunk_shift; 4741 s *= geo->near_copies; 4742 sector_div(s, geo->raid_disks); 4743 s *= geo->far_copies; 4744 s <<= geo->chunk_shift; 4745 return s; 4746 } 4747 4748 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4749 int *skipped) 4750 { 4751 /* We simply copy at most one chunk (smallest of old and new) 4752 * at a time, possibly less if that exceeds RESYNC_PAGES, 4753 * or we hit a bad block or something. 4754 * This might mean we pause for normal IO in the middle of 4755 * a chunk, but that is not a problem as mddev->reshape_position 4756 * can record any location. 4757 * 4758 * If we will want to write to a location that isn't 4759 * yet recorded as 'safe' (i.e. in metadata on disk) then 4760 * we need to flush all reshape requests and update the metadata. 4761 * 4762 * When reshaping forwards (e.g. to more devices), we interpret 4763 * 'safe' as the earliest block which might not have been copied 4764 * down yet. We divide this by previous stripe size and multiply 4765 * by previous stripe length to get lowest device offset that we 4766 * cannot write to yet. 4767 * We interpret 'sector_nr' as an address that we want to write to. 4768 * From this we use last_device_address() to find where we might 4769 * write to, and first_device_address on the 'safe' position. 
4770 * If this 'next' write position is after the 'safe' position, 4771 * we must update the metadata to increase the 'safe' position. 4772 * 4773 * When reshaping backwards, we round in the opposite direction 4774 * and perform the reverse test: next write position must not be 4775 * less than current safe position. 4776 * 4777 * In all this the minimum difference in data offsets 4778 * (conf->offset_diff - always positive) allows a bit of slack, 4779 * so next can be after 'safe', but not by more than offset_diff 4780 * 4781 * We need to prepare all the bios here before we start any IO 4782 * to ensure the size we choose is acceptable to all devices. 4783 * The means one for each copy for write-out and an extra one for 4784 * read-in. 4785 * We store the read-in bio in ->master_bio and the others in 4786 * ->devs[x].bio and ->devs[x].repl_bio. 4787 */ 4788 struct r10conf *conf = mddev->private; 4789 struct r10bio *r10_bio; 4790 sector_t next, safe, last; 4791 int max_sectors; 4792 int nr_sectors; 4793 int s; 4794 struct md_rdev *rdev; 4795 int need_flush = 0; 4796 struct bio *blist; 4797 struct bio *bio, *read_bio; 4798 int sectors_done = 0; 4799 struct page **pages; 4800 4801 if (sector_nr == 0) { 4802 /* If restarting in the middle, skip the initial sectors */ 4803 if (mddev->reshape_backwards && 4804 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4805 sector_nr = (raid10_size(mddev, 0, 0) 4806 - conf->reshape_progress); 4807 } else if (!mddev->reshape_backwards && 4808 conf->reshape_progress > 0) 4809 sector_nr = conf->reshape_progress; 4810 if (sector_nr) { 4811 mddev->curr_resync_completed = sector_nr; 4812 sysfs_notify_dirent_safe(mddev->sysfs_completed); 4813 *skipped = 1; 4814 return sector_nr; 4815 } 4816 } 4817 4818 /* We don't use sector_nr to track where we are up to 4819 * as that doesn't work well for ->reshape_backwards. 4820 * So just use ->reshape_progress. 
4821 */ 4822 if (mddev->reshape_backwards) { 4823 /* 'next' is the earliest device address that we might 4824 * write to for this chunk in the new layout 4825 */ 4826 next = first_dev_address(conf->reshape_progress - 1, 4827 &conf->geo); 4828 4829 /* 'safe' is the last device address that we might read from 4830 * in the old layout after a restart 4831 */ 4832 safe = last_dev_address(conf->reshape_safe - 1, 4833 &conf->prev); 4834 4835 if (next + conf->offset_diff < safe) 4836 need_flush = 1; 4837 4838 last = conf->reshape_progress - 1; 4839 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4840 & conf->prev.chunk_mask); 4841 if (sector_nr + RESYNC_SECTORS < last) 4842 sector_nr = last + 1 - RESYNC_SECTORS; 4843 } else { 4844 /* 'next' is after the last device address that we 4845 * might write to for this chunk in the new layout 4846 */ 4847 next = last_dev_address(conf->reshape_progress, &conf->geo); 4848 4849 /* 'safe' is the earliest device address that we might 4850 * read from in the old layout after a restart 4851 */ 4852 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4853 4854 /* Need to update metadata if 'next' might be beyond 'safe' 4855 * as that would possibly corrupt data 4856 */ 4857 if (next > safe + conf->offset_diff) 4858 need_flush = 1; 4859 4860 sector_nr = conf->reshape_progress; 4861 last = sector_nr | (conf->geo.chunk_mask 4862 & conf->prev.chunk_mask); 4863 4864 if (sector_nr + RESYNC_SECTORS <= last) 4865 last = sector_nr + RESYNC_SECTORS - 1; 4866 } 4867 4868 if (need_flush || 4869 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4870 /* Need to update reshape_position in metadata */ 4871 wait_barrier(conf, false); 4872 mddev->reshape_position = conf->reshape_progress; 4873 if (mddev->reshape_backwards) 4874 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4875 - conf->reshape_progress; 4876 else 4877 mddev->curr_resync_completed = conf->reshape_progress; 4878 conf->reshape_checkpoint = jiffies; 4879 
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4880 md_wakeup_thread(mddev->thread); 4881 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 4882 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4883 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4884 allow_barrier(conf); 4885 return sectors_done; 4886 } 4887 conf->reshape_safe = mddev->reshape_position; 4888 allow_barrier(conf); 4889 } 4890 4891 raise_barrier(conf, 0); 4892 read_more: 4893 /* Now schedule reads for blocks from sector_nr to last */ 4894 r10_bio = raid10_alloc_init_r10buf(conf); 4895 r10_bio->state = 0; 4896 raise_barrier(conf, 1); 4897 atomic_set(&r10_bio->remaining, 0); 4898 r10_bio->mddev = mddev; 4899 r10_bio->sector = sector_nr; 4900 set_bit(R10BIO_IsReshape, &r10_bio->state); 4901 r10_bio->sectors = last - sector_nr + 1; 4902 rdev = read_balance(conf, r10_bio, &max_sectors); 4903 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4904 4905 if (!rdev) { 4906 /* Cannot read from here, so need to record bad blocks 4907 * on all the target devices. 4908 */ 4909 // FIXME 4910 mempool_free(r10_bio, &conf->r10buf_pool); 4911 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4912 return sectors_done; 4913 } 4914 4915 read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, 4916 GFP_KERNEL, &mddev->bio_set); 4917 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4918 + rdev->data_offset); 4919 read_bio->bi_private = r10_bio; 4920 read_bio->bi_end_io = end_reshape_read; 4921 r10_bio->master_bio = read_bio; 4922 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4923 4924 /* 4925 * Broadcast RESYNC message to other nodes, so all nodes would not 4926 * write to the region to avoid conflict. 
4927 */ 4928 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { 4929 struct mdp_superblock_1 *sb = NULL; 4930 int sb_reshape_pos = 0; 4931 4932 conf->cluster_sync_low = sector_nr; 4933 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; 4934 sb = page_address(rdev->sb_page); 4935 if (sb) { 4936 sb_reshape_pos = le64_to_cpu(sb->reshape_position); 4937 /* 4938 * Set cluster_sync_low again if next address for array 4939 * reshape is less than cluster_sync_low. Since we can't 4940 * update cluster_sync_low until it has finished reshape. 4941 */ 4942 if (sb_reshape_pos < conf->cluster_sync_low) 4943 conf->cluster_sync_low = sb_reshape_pos; 4944 } 4945 4946 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, 4947 conf->cluster_sync_high); 4948 } 4949 4950 /* Now find the locations in the new layout */ 4951 __raid10_find_phys(&conf->geo, r10_bio); 4952 4953 blist = read_bio; 4954 read_bio->bi_next = NULL; 4955 4956 rcu_read_lock(); 4957 for (s = 0; s < conf->copies*2; s++) { 4958 struct bio *b; 4959 int d = r10_bio->devs[s/2].devnum; 4960 struct md_rdev *rdev2; 4961 if (s&1) { 4962 rdev2 = rcu_dereference(conf->mirrors[d].replacement); 4963 b = r10_bio->devs[s/2].repl_bio; 4964 } else { 4965 rdev2 = rcu_dereference(conf->mirrors[d].rdev); 4966 b = r10_bio->devs[s/2].bio; 4967 } 4968 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4969 continue; 4970 4971 bio_set_dev(b, rdev2->bdev); 4972 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + 4973 rdev2->new_data_offset; 4974 b->bi_end_io = end_reshape_write; 4975 bio_set_op_attrs(b, REQ_OP_WRITE, 0); 4976 b->bi_next = blist; 4977 blist = b; 4978 } 4979 4980 /* Now add as many pages as possible to all of these bios. 
*/ 4981 4982 nr_sectors = 0; 4983 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 4984 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4985 struct page *page = pages[s / (PAGE_SIZE >> 9)]; 4986 int len = (max_sectors - s) << 9; 4987 if (len > PAGE_SIZE) 4988 len = PAGE_SIZE; 4989 for (bio = blist; bio ; bio = bio->bi_next) { 4990 /* 4991 * won't fail because the vec table is big enough 4992 * to hold all these pages 4993 */ 4994 bio_add_page(bio, page, len, 0); 4995 } 4996 sector_nr += len >> 9; 4997 nr_sectors += len >> 9; 4998 } 4999 rcu_read_unlock(); 5000 r10_bio->sectors = nr_sectors; 5001 5002 /* Now submit the read */ 5003 md_sync_acct_bio(read_bio, r10_bio->sectors); 5004 atomic_inc(&r10_bio->remaining); 5005 read_bio->bi_next = NULL; 5006 submit_bio_noacct(read_bio); 5007 sectors_done += nr_sectors; 5008 if (sector_nr <= last) 5009 goto read_more; 5010 5011 lower_barrier(conf); 5012 5013 /* Now that we have done the whole section we can 5014 * update reshape_progress 5015 */ 5016 if (mddev->reshape_backwards) 5017 conf->reshape_progress -= sectors_done; 5018 else 5019 conf->reshape_progress += sectors_done; 5020 5021 return sectors_done; 5022 } 5023 5024 static void end_reshape_request(struct r10bio *r10_bio); 5025 static int handle_reshape_read_error(struct mddev *mddev, 5026 struct r10bio *r10_bio); 5027 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 5028 { 5029 /* Reshape read completed. Hopefully we have a block 5030 * to write out. 5031 * If we got a read error then we do sync 1-page reads from 5032 * elsewhere until we find the data - or give up. 5033 */ 5034 struct r10conf *conf = mddev->private; 5035 int s; 5036 5037 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 5038 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 5039 /* Reshape has been aborted */ 5040 md_done_sync(mddev, r10_bio->sectors, 0); 5041 return; 5042 } 5043 5044 /* We definitely have the data in the pages, schedule the 5045 * writes. 
5046 */ 5047 atomic_set(&r10_bio->remaining, 1); 5048 for (s = 0; s < conf->copies*2; s++) { 5049 struct bio *b; 5050 int d = r10_bio->devs[s/2].devnum; 5051 struct md_rdev *rdev; 5052 rcu_read_lock(); 5053 if (s&1) { 5054 rdev = rcu_dereference(conf->mirrors[d].replacement); 5055 b = r10_bio->devs[s/2].repl_bio; 5056 } else { 5057 rdev = rcu_dereference(conf->mirrors[d].rdev); 5058 b = r10_bio->devs[s/2].bio; 5059 } 5060 if (!rdev || test_bit(Faulty, &rdev->flags)) { 5061 rcu_read_unlock(); 5062 continue; 5063 } 5064 atomic_inc(&rdev->nr_pending); 5065 rcu_read_unlock(); 5066 md_sync_acct_bio(b, r10_bio->sectors); 5067 atomic_inc(&r10_bio->remaining); 5068 b->bi_next = NULL; 5069 submit_bio_noacct(b); 5070 } 5071 end_reshape_request(r10_bio); 5072 } 5073 5074 static void end_reshape(struct r10conf *conf) 5075 { 5076 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 5077 return; 5078 5079 spin_lock_irq(&conf->device_lock); 5080 conf->prev = conf->geo; 5081 md_finish_reshape(conf->mddev); 5082 smp_wmb(); 5083 conf->reshape_progress = MaxSector; 5084 conf->reshape_safe = MaxSector; 5085 spin_unlock_irq(&conf->device_lock); 5086 5087 if (conf->mddev->queue) 5088 raid10_set_io_opt(conf); 5089 conf->fullsync = 0; 5090 } 5091 5092 static void raid10_update_reshape_pos(struct mddev *mddev) 5093 { 5094 struct r10conf *conf = mddev->private; 5095 sector_t lo, hi; 5096 5097 md_cluster_ops->resync_info_get(mddev, &lo, &hi); 5098 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) 5099 || mddev->reshape_position == MaxSector) 5100 conf->reshape_progress = mddev->reshape_position; 5101 else 5102 WARN_ON_ONCE(1); 5103 } 5104 5105 static int handle_reshape_read_error(struct mddev *mddev, 5106 struct r10bio *r10_bio) 5107 { 5108 /* Use sync reads to get the blocks from somewhere else */ 5109 int sectors = r10_bio->sectors; 5110 struct r10conf *conf = mddev->private; 5111 struct r10bio *r10b; 5112 int slot = 0; 5113 int idx = 0; 5114 struct page **pages; 
5115 5116 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); 5117 if (!r10b) { 5118 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5119 return -ENOMEM; 5120 } 5121 5122 /* reshape IOs share pages from .devs[0].bio */ 5123 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 5124 5125 r10b->sector = r10_bio->sector; 5126 __raid10_find_phys(&conf->prev, r10b); 5127 5128 while (sectors) { 5129 int s = sectors; 5130 int success = 0; 5131 int first_slot = slot; 5132 5133 if (s > (PAGE_SIZE >> 9)) 5134 s = PAGE_SIZE >> 9; 5135 5136 rcu_read_lock(); 5137 while (!success) { 5138 int d = r10b->devs[slot].devnum; 5139 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5140 sector_t addr; 5141 if (rdev == NULL || 5142 test_bit(Faulty, &rdev->flags) || 5143 !test_bit(In_sync, &rdev->flags)) 5144 goto failed; 5145 5146 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 5147 atomic_inc(&rdev->nr_pending); 5148 rcu_read_unlock(); 5149 success = sync_page_io(rdev, 5150 addr, 5151 s << 9, 5152 pages[idx], 5153 REQ_OP_READ, false); 5154 rdev_dec_pending(rdev, mddev); 5155 rcu_read_lock(); 5156 if (success) 5157 break; 5158 failed: 5159 slot++; 5160 if (slot >= conf->copies) 5161 slot = 0; 5162 if (slot == first_slot) 5163 break; 5164 } 5165 rcu_read_unlock(); 5166 if (!success) { 5167 /* couldn't read this block, must give up */ 5168 set_bit(MD_RECOVERY_INTR, 5169 &mddev->recovery); 5170 kfree(r10b); 5171 return -EIO; 5172 } 5173 sectors -= s; 5174 idx++; 5175 } 5176 kfree(r10b); 5177 return 0; 5178 } 5179 5180 static void end_reshape_write(struct bio *bio) 5181 { 5182 struct r10bio *r10_bio = get_resync_r10bio(bio); 5183 struct mddev *mddev = r10_bio->mddev; 5184 struct r10conf *conf = mddev->private; 5185 int d; 5186 int slot; 5187 int repl; 5188 struct md_rdev *rdev = NULL; 5189 5190 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 5191 if (repl) 5192 rdev = conf->mirrors[d].replacement; 5193 if (!rdev) { 5194 smp_mb(); 5195 rdev = 
conf->mirrors[d].rdev; 5196 } 5197 5198 if (bio->bi_status) { 5199 /* FIXME should record badblock */ 5200 md_error(mddev, rdev); 5201 } 5202 5203 rdev_dec_pending(rdev, mddev); 5204 end_reshape_request(r10_bio); 5205 } 5206 5207 static void end_reshape_request(struct r10bio *r10_bio) 5208 { 5209 if (!atomic_dec_and_test(&r10_bio->remaining)) 5210 return; 5211 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 5212 bio_put(r10_bio->master_bio); 5213 put_buf(r10_bio); 5214 } 5215 5216 static void raid10_finish_reshape(struct mddev *mddev) 5217 { 5218 struct r10conf *conf = mddev->private; 5219 5220 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5221 return; 5222 5223 if (mddev->delta_disks > 0) { 5224 if (mddev->recovery_cp > mddev->resync_max_sectors) { 5225 mddev->recovery_cp = mddev->resync_max_sectors; 5226 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5227 } 5228 mddev->resync_max_sectors = mddev->array_sectors; 5229 } else { 5230 int d; 5231 rcu_read_lock(); 5232 for (d = conf->geo.raid_disks ; 5233 d < conf->geo.raid_disks - mddev->delta_disks; 5234 d++) { 5235 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5236 if (rdev) 5237 clear_bit(In_sync, &rdev->flags); 5238 rdev = rcu_dereference(conf->mirrors[d].replacement); 5239 if (rdev) 5240 clear_bit(In_sync, &rdev->flags); 5241 } 5242 rcu_read_unlock(); 5243 } 5244 mddev->layout = mddev->new_layout; 5245 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 5246 mddev->reshape_position = MaxSector; 5247 mddev->delta_disks = 0; 5248 mddev->reshape_backwards = 0; 5249 } 5250 5251 static struct md_personality raid10_personality = 5252 { 5253 .name = "raid10", 5254 .level = 10, 5255 .owner = THIS_MODULE, 5256 .make_request = raid10_make_request, 5257 .run = raid10_run, 5258 .free = raid10_free, 5259 .status = raid10_status, 5260 .error_handler = raid10_error, 5261 .hot_add_disk = raid10_add_disk, 5262 .hot_remove_disk= raid10_remove_disk, 5263 .spare_active = raid10_spare_active, 5264 .sync_request 
= raid10_sync_request, 5265 .quiesce = raid10_quiesce, 5266 .size = raid10_size, 5267 .resize = raid10_resize, 5268 .takeover = raid10_takeover, 5269 .check_reshape = raid10_check_reshape, 5270 .start_reshape = raid10_start_reshape, 5271 .finish_reshape = raid10_finish_reshape, 5272 .update_reshape_pos = raid10_update_reshape_pos, 5273 }; 5274 5275 static int __init raid_init(void) 5276 { 5277 return register_md_personality(&raid10_personality); 5278 } 5279 5280 static void raid_exit(void) 5281 { 5282 unregister_md_personality(&raid10_personality); 5283 } 5284 5285 module_init(raid_init); 5286 module_exit(raid_exit); 5287 MODULE_LICENSE("GPL"); 5288 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); 5289 MODULE_ALIAS("md-personality-9"); /* RAID10 */ 5290 MODULE_ALIAS("md-raid10"); 5291 MODULE_ALIAS("md-level-10"); 5292