// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout )
 *    use_far_sets (stored in bit 17 of layout )
 *    use_far_sets_bugfixed (stored in bit 18 of layout )
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.   In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) of each chunk, and each is on a different
 * drive.  near_copies and far_copies must be at least one, and their product
 * is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */

/* Forward declarations for helpers that are used before they are defined. */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

/*
 * Emit a blktrace message prefixed with "raid10 " on the array's request
 * queue; does nothing when (md)->queue is NULL.
 */
#define raid10_log(md, fmt, args...)				\
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

/* Helper code included textually; this must come after raid10_log is defined. */
#include "raid1-10.c"

/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
85 */ 86 static inline struct r10bio *get_resync_r10bio(struct bio *bio) 87 { 88 return get_resync_pages(bio)->raid_bio; 89 } 90 91 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 92 { 93 struct r10conf *conf = data; 94 int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); 95 96 /* allocate a r10bio with room for raid_disks entries in the 97 * bios array */ 98 return kzalloc(size, gfp_flags); 99 } 100 101 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 102 /* amount of memory to reserve for resync requests */ 103 #define RESYNC_WINDOW (1024*1024) 104 /* maximum number of concurrent requests, memory permitting */ 105 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) 106 #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW) 107 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 108 109 /* 110 * When performing a resync, we need to read and compare, so 111 * we need as many pages are there are copies. 112 * When performing a recovery, we need 2 bios, one for read, 113 * one for write (we recover only one drive per r10buf) 114 * 115 */ 116 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 117 { 118 struct r10conf *conf = data; 119 struct r10bio *r10_bio; 120 struct bio *bio; 121 int j; 122 int nalloc, nalloc_rp; 123 struct resync_pages *rps; 124 125 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 126 if (!r10_bio) 127 return NULL; 128 129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 131 nalloc = conf->copies; /* resync */ 132 else 133 nalloc = 2; /* recovery */ 134 135 /* allocate once for all bios */ 136 if (!conf->have_replacement) 137 nalloc_rp = nalloc; 138 else 139 nalloc_rp = nalloc * 2; 140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags); 141 if (!rps) 142 goto out_free_r10bio; 143 144 /* 145 * Allocate bios. 
146 */ 147 for (j = nalloc ; j-- ; ) { 148 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 149 if (!bio) 150 goto out_free_bio; 151 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 152 r10_bio->devs[j].bio = bio; 153 if (!conf->have_replacement) 154 continue; 155 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 156 if (!bio) 157 goto out_free_bio; 158 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 159 r10_bio->devs[j].repl_bio = bio; 160 } 161 /* 162 * Allocate RESYNC_PAGES data pages and attach them 163 * where needed. 164 */ 165 for (j = 0; j < nalloc; j++) { 166 struct bio *rbio = r10_bio->devs[j].repl_bio; 167 struct resync_pages *rp, *rp_repl; 168 169 rp = &rps[j]; 170 if (rbio) 171 rp_repl = &rps[nalloc + j]; 172 173 bio = r10_bio->devs[j].bio; 174 175 if (!j || test_bit(MD_RECOVERY_SYNC, 176 &conf->mddev->recovery)) { 177 if (resync_alloc_pages(rp, gfp_flags)) 178 goto out_free_pages; 179 } else { 180 memcpy(rp, &rps[0], sizeof(*rp)); 181 resync_get_all_pages(rp); 182 } 183 184 rp->raid_bio = r10_bio; 185 bio->bi_private = rp; 186 if (rbio) { 187 memcpy(rp_repl, rp, sizeof(*rp)); 188 rbio->bi_private = rp_repl; 189 } 190 } 191 192 return r10_bio; 193 194 out_free_pages: 195 while (--j >= 0) 196 resync_free_pages(&rps[j]); 197 198 j = 0; 199 out_free_bio: 200 for ( ; j < nalloc; j++) { 201 if (r10_bio->devs[j].bio) 202 bio_uninit(r10_bio->devs[j].bio); 203 kfree(r10_bio->devs[j].bio); 204 if (r10_bio->devs[j].repl_bio) 205 bio_uninit(r10_bio->devs[j].repl_bio); 206 kfree(r10_bio->devs[j].repl_bio); 207 } 208 kfree(rps); 209 out_free_r10bio: 210 rbio_pool_free(r10_bio, conf); 211 return NULL; 212 } 213 214 static void r10buf_pool_free(void *__r10_bio, void *data) 215 { 216 struct r10conf *conf = data; 217 struct r10bio *r10bio = __r10_bio; 218 int j; 219 struct resync_pages *rp = NULL; 220 221 for (j = conf->copies; j--; ) { 222 struct bio *bio = r10bio->devs[j].bio; 223 224 if (bio) { 225 rp = get_resync_pages(bio); 226 resync_free_pages(rp); 227 
bio_uninit(bio); 228 kfree(bio); 229 } 230 231 bio = r10bio->devs[j].repl_bio; 232 if (bio) { 233 bio_uninit(bio); 234 kfree(bio); 235 } 236 } 237 238 /* resync pages array stored in the 1st bio's .bi_private */ 239 kfree(rp); 240 241 rbio_pool_free(r10bio, conf); 242 } 243 244 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 245 { 246 int i; 247 248 for (i = 0; i < conf->geo.raid_disks; i++) { 249 struct bio **bio = & r10_bio->devs[i].bio; 250 if (!BIO_SPECIAL(*bio)) 251 bio_put(*bio); 252 *bio = NULL; 253 bio = &r10_bio->devs[i].repl_bio; 254 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) 255 bio_put(*bio); 256 *bio = NULL; 257 } 258 } 259 260 static void free_r10bio(struct r10bio *r10_bio) 261 { 262 struct r10conf *conf = r10_bio->mddev->private; 263 264 put_all_bios(conf, r10_bio); 265 mempool_free(r10_bio, &conf->r10bio_pool); 266 } 267 268 static void put_buf(struct r10bio *r10_bio) 269 { 270 struct r10conf *conf = r10_bio->mddev->private; 271 272 mempool_free(r10_bio, &conf->r10buf_pool); 273 274 lower_barrier(conf); 275 } 276 277 static void reschedule_retry(struct r10bio *r10_bio) 278 { 279 unsigned long flags; 280 struct mddev *mddev = r10_bio->mddev; 281 struct r10conf *conf = mddev->private; 282 283 spin_lock_irqsave(&conf->device_lock, flags); 284 list_add(&r10_bio->retry_list, &conf->retry_list); 285 conf->nr_queued ++; 286 spin_unlock_irqrestore(&conf->device_lock, flags); 287 288 /* wake up frozen array... */ 289 wake_up(&conf->wait_barrier); 290 291 md_wakeup_thread(mddev->thread); 292 } 293 294 /* 295 * raid_end_bio_io() is called when we have finished servicing a mirrored 296 * operation and are ready to return a success/failure code to the buffer 297 * cache layer. 
298 */ 299 static void raid_end_bio_io(struct r10bio *r10_bio) 300 { 301 struct bio *bio = r10_bio->master_bio; 302 struct r10conf *conf = r10_bio->mddev->private; 303 304 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 305 bio->bi_status = BLK_STS_IOERR; 306 307 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 308 bio_end_io_acct(bio, r10_bio->start_time); 309 bio_endio(bio); 310 /* 311 * Wake up any possible resync thread that waits for the device 312 * to go idle. 313 */ 314 allow_barrier(conf); 315 316 free_r10bio(r10_bio); 317 } 318 319 /* 320 * Update disk head position estimator based on IRQ completion info. 321 */ 322 static inline void update_head_pos(int slot, struct r10bio *r10_bio) 323 { 324 struct r10conf *conf = r10_bio->mddev->private; 325 326 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 327 r10_bio->devs[slot].addr + (r10_bio->sectors); 328 } 329 330 /* 331 * Find the disk number which triggered given bio 332 */ 333 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 334 struct bio *bio, int *slotp, int *replp) 335 { 336 int slot; 337 int repl = 0; 338 339 for (slot = 0; slot < conf->geo.raid_disks; slot++) { 340 if (r10_bio->devs[slot].bio == bio) 341 break; 342 if (r10_bio->devs[slot].repl_bio == bio) { 343 repl = 1; 344 break; 345 } 346 } 347 348 update_head_pos(slot, r10_bio); 349 350 if (slotp) 351 *slotp = slot; 352 if (replp) 353 *replp = repl; 354 return r10_bio->devs[slot].devnum; 355 } 356 357 static void raid10_end_read_request(struct bio *bio) 358 { 359 int uptodate = !bio->bi_status; 360 struct r10bio *r10_bio = bio->bi_private; 361 int slot; 362 struct md_rdev *rdev; 363 struct r10conf *conf = r10_bio->mddev->private; 364 365 slot = r10_bio->read_slot; 366 rdev = r10_bio->devs[slot].rdev; 367 /* 368 * this branch is our 'one mirror IO has finished' event handler: 369 */ 370 update_head_pos(slot, r10_bio); 371 372 if (uptodate) { 373 /* 374 * Set R10BIO_Uptodate in our master bio, so that 375 * we 
will return a good error code to the higher 376 * levels even if IO on some other mirrored buffer fails. 377 * 378 * The 'master' represents the composite IO operation to 379 * user-side. So if something waits for IO, then it will 380 * wait for the 'master' bio. 381 */ 382 set_bit(R10BIO_Uptodate, &r10_bio->state); 383 } else { 384 /* If all other devices that store this block have 385 * failed, we want to return the error upwards rather 386 * than fail the last device. Here we redefine 387 * "uptodate" to mean "Don't want to retry" 388 */ 389 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), 390 rdev->raid_disk)) 391 uptodate = 1; 392 } 393 if (uptodate) { 394 raid_end_bio_io(r10_bio); 395 rdev_dec_pending(rdev, conf->mddev); 396 } else { 397 /* 398 * oops, read error - keep the refcount on the rdev 399 */ 400 pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n", 401 mdname(conf->mddev), 402 rdev->bdev, 403 (unsigned long long)r10_bio->sector); 404 set_bit(R10BIO_ReadError, &r10_bio->state); 405 reschedule_retry(r10_bio); 406 } 407 } 408 409 static void close_write(struct r10bio *r10_bio) 410 { 411 /* clear the bitmap if all writes complete successfully */ 412 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 413 r10_bio->sectors, 414 !test_bit(R10BIO_Degraded, &r10_bio->state), 415 0); 416 md_write_end(r10_bio->mddev); 417 } 418 419 static void one_write_done(struct r10bio *r10_bio) 420 { 421 if (atomic_dec_and_test(&r10_bio->remaining)) { 422 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 423 reschedule_retry(r10_bio); 424 else { 425 close_write(r10_bio); 426 if (test_bit(R10BIO_MadeGood, &r10_bio->state)) 427 reschedule_retry(r10_bio); 428 else 429 raid_end_bio_io(r10_bio); 430 } 431 } 432 } 433 434 static void raid10_end_write_request(struct bio *bio) 435 { 436 struct r10bio *r10_bio = bio->bi_private; 437 int dev; 438 int dec_rdev = 1; 439 struct r10conf *conf = r10_bio->mddev->private; 440 int slot, repl; 441 struct 
md_rdev *rdev = NULL; 442 struct bio *to_put = NULL; 443 bool discard_error; 444 445 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; 446 447 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 448 449 if (repl) 450 rdev = conf->mirrors[dev].replacement; 451 if (!rdev) { 452 smp_rmb(); 453 repl = 0; 454 rdev = conf->mirrors[dev].rdev; 455 } 456 /* 457 * this branch is our 'one mirror IO has finished' event handler: 458 */ 459 if (bio->bi_status && !discard_error) { 460 if (repl) 461 /* Never record new bad blocks to replacement, 462 * just fail it. 463 */ 464 md_error(rdev->mddev, rdev); 465 else { 466 set_bit(WriteErrorSeen, &rdev->flags); 467 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 468 set_bit(MD_RECOVERY_NEEDED, 469 &rdev->mddev->recovery); 470 471 dec_rdev = 0; 472 if (test_bit(FailFast, &rdev->flags) && 473 (bio->bi_opf & MD_FAILFAST)) { 474 md_error(rdev->mddev, rdev); 475 } 476 477 /* 478 * When the device is faulty, it is not necessary to 479 * handle write error. 480 */ 481 if (!test_bit(Faulty, &rdev->flags)) 482 set_bit(R10BIO_WriteError, &r10_bio->state); 483 else { 484 /* Fail the request */ 485 set_bit(R10BIO_Degraded, &r10_bio->state); 486 r10_bio->devs[slot].bio = NULL; 487 to_put = bio; 488 dec_rdev = 1; 489 } 490 } 491 } else { 492 /* 493 * Set R10BIO_Uptodate in our master bio, so that 494 * we will return a good error code for to the higher 495 * levels even if IO on some other mirrored buffer fails. 496 * 497 * The 'master' represents the composite IO operation to 498 * user-side. So if something waits for IO, then it will 499 * wait for the 'master' bio. 500 */ 501 sector_t first_bad; 502 int bad_sectors; 503 504 /* 505 * Do not set R10BIO_Uptodate if the current device is 506 * rebuilding or Faulty. 
This is because we cannot use 507 * such device for properly reading the data back (we could 508 * potentially use it, if the current write would have felt 509 * before rdev->recovery_offset, but for simplicity we don't 510 * check this here. 511 */ 512 if (test_bit(In_sync, &rdev->flags) && 513 !test_bit(Faulty, &rdev->flags)) 514 set_bit(R10BIO_Uptodate, &r10_bio->state); 515 516 /* Maybe we can clear some bad blocks. */ 517 if (is_badblock(rdev, 518 r10_bio->devs[slot].addr, 519 r10_bio->sectors, 520 &first_bad, &bad_sectors) && !discard_error) { 521 bio_put(bio); 522 if (repl) 523 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; 524 else 525 r10_bio->devs[slot].bio = IO_MADE_GOOD; 526 dec_rdev = 0; 527 set_bit(R10BIO_MadeGood, &r10_bio->state); 528 } 529 } 530 531 /* 532 * 533 * Let's see if all mirrored write operations have finished 534 * already. 535 */ 536 one_write_done(r10_bio); 537 if (dec_rdev) 538 rdev_dec_pending(rdev, conf->mddev); 539 if (to_put) 540 bio_put(to_put); 541 } 542 543 /* 544 * RAID10 layout manager 545 * As well as the chunksize and raid_disks count, there are two 546 * parameters: near_copies and far_copies. 547 * near_copies * far_copies must be <= raid_disks. 548 * Normally one of these will be 1. 549 * If both are 1, we get raid0. 550 * If near_copies == raid_disks, we get raid1. 551 * 552 * Chunks are laid out in raid0 style with near_copies copies of the 553 * first chunk, followed by near_copies copies of the next chunk and 554 * so on. 555 * If far_copies > 1, then after 1/far_copies of the array has been assigned 556 * as described above, we start again with a device offset of near_copies. 557 * So we effectively have another copy of the whole array further down all 558 * the drives, but with blocks on different drives. 559 * With this layout, and block is never stored twice on the one device. 560 * 561 * raid10_find_phys finds the sector offset of a given virtual sector 562 * on each device that it is on. 
563 * 564 * raid10_find_virt does the reverse mapping, from a device and a 565 * sector offset to a virtual address 566 */ 567 568 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 569 { 570 int n,f; 571 sector_t sector; 572 sector_t chunk; 573 sector_t stripe; 574 int dev; 575 int slot = 0; 576 int last_far_set_start, last_far_set_size; 577 578 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 579 last_far_set_start *= geo->far_set_size; 580 581 last_far_set_size = geo->far_set_size; 582 last_far_set_size += (geo->raid_disks % geo->far_set_size); 583 584 /* now calculate first sector/dev */ 585 chunk = r10bio->sector >> geo->chunk_shift; 586 sector = r10bio->sector & geo->chunk_mask; 587 588 chunk *= geo->near_copies; 589 stripe = chunk; 590 dev = sector_div(stripe, geo->raid_disks); 591 if (geo->far_offset) 592 stripe *= geo->far_copies; 593 594 sector += stripe << geo->chunk_shift; 595 596 /* and calculate all the others */ 597 for (n = 0; n < geo->near_copies; n++) { 598 int d = dev; 599 int set; 600 sector_t s = sector; 601 r10bio->devs[slot].devnum = d; 602 r10bio->devs[slot].addr = s; 603 slot++; 604 605 for (f = 1; f < geo->far_copies; f++) { 606 set = d / geo->far_set_size; 607 d += geo->near_copies; 608 609 if ((geo->raid_disks % geo->far_set_size) && 610 (d > last_far_set_start)) { 611 d -= last_far_set_start; 612 d %= last_far_set_size; 613 d += last_far_set_start; 614 } else { 615 d %= geo->far_set_size; 616 d += geo->far_set_size * set; 617 } 618 s += geo->stride; 619 r10bio->devs[slot].devnum = d; 620 r10bio->devs[slot].addr = s; 621 slot++; 622 } 623 dev++; 624 if (dev >= geo->raid_disks) { 625 dev = 0; 626 sector += (geo->chunk_mask + 1); 627 } 628 } 629 } 630 631 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 632 { 633 struct geom *geo = &conf->geo; 634 635 if (conf->reshape_progress != MaxSector && 636 ((r10bio->sector >= conf->reshape_progress) != 637 conf->mddev->reshape_backwards)) 
{ 638 set_bit(R10BIO_Previous, &r10bio->state); 639 geo = &conf->prev; 640 } else 641 clear_bit(R10BIO_Previous, &r10bio->state); 642 643 __raid10_find_phys(geo, r10bio); 644 } 645 646 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 647 { 648 sector_t offset, chunk, vchunk; 649 /* Never use conf->prev as this is only called during resync 650 * or recovery, so reshape isn't happening 651 */ 652 struct geom *geo = &conf->geo; 653 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; 654 int far_set_size = geo->far_set_size; 655 int last_far_set_start; 656 657 if (geo->raid_disks % geo->far_set_size) { 658 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 659 last_far_set_start *= geo->far_set_size; 660 661 if (dev >= last_far_set_start) { 662 far_set_size = geo->far_set_size; 663 far_set_size += (geo->raid_disks % geo->far_set_size); 664 far_set_start = last_far_set_start; 665 } 666 } 667 668 offset = sector & geo->chunk_mask; 669 if (geo->far_offset) { 670 int fc; 671 chunk = sector >> geo->chunk_shift; 672 fc = sector_div(chunk, geo->far_copies); 673 dev -= fc * geo->near_copies; 674 if (dev < far_set_start) 675 dev += far_set_size; 676 } else { 677 while (sector >= geo->stride) { 678 sector -= geo->stride; 679 if (dev < (geo->near_copies + far_set_start)) 680 dev += far_set_size - geo->near_copies; 681 else 682 dev -= geo->near_copies; 683 } 684 chunk = sector >> geo->chunk_shift; 685 } 686 vchunk = chunk * geo->raid_disks + dev; 687 sector_div(vchunk, geo->near_copies); 688 return (vchunk << geo->chunk_shift) + offset; 689 } 690 691 /* 692 * This routine returns the disk from which the requested read should 693 * be done. There is a per-array 'next expected sequential IO' sector 694 * number - if this matches on the next IO then we use the last disk. 
* There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */

/*
 * read_balance() - pick the device that should service a read.
 * @conf:        array configuration
 * @r10_bio:     request; devs[] is filled in here by raid10_find_phys() and
 *               read_slot is set to the chosen slot on success
 * @max_sectors: out: number of sectors that can be read from the chosen
 *               device before hitting a known bad block
 *
 * Returns the chosen rdev with nr_pending incremented, or NULL if no
 * slot is readable.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
	int do_balance;
	int best_dist_slot, best_pending_slot;
	bool has_nonrot_disk = false;
	unsigned int min_pending;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	/* rdev pointers are fetched with rcu_dereference() below */
	rcu_read_lock();
	best_dist_slot = -1;
	min_pending = UINT_MAX;
	best_dist_rdev = NULL;
	best_pending_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if ((conf->mddev->recovery_cp < MaxSector
	     && (this_sector + sectors >= conf->next_resync)) ||
	    (mddev_is_clustered(conf->mddev) &&
	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
					    this_sector + sectors)))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;
		unsigned int pending;
		bool nonrot;

		/* this slot has been marked as unusable for this request */
		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		/*
		 * Prefer the replacement, but fall back to the original
		 * device if the replacement is missing, Faulty, or not yet
		 * recovered up to the end of this request.
		 */
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags))
			continue;
		/* a device still being rebuilt is only usable below recovery_offset */
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				/* the range starts good: remember the longest good prefix */
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_dist_slot = slot;
					best_dist_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		/* not balancing: take this first fully readable slot */
		if (!do_balance)
			break;

		/* among non-rotational devices, track the one with the fewest pending IOs */
		nonrot = bdev_nonrot(rdev->bdev);
		has_nonrot_disk |= nonrot;
		pending = atomic_read(&rdev->nr_pending);
		if (min_pending > pending && nonrot) {
			min_pending = pending;
			best_pending_slot = slot;
			best_pending_rdev = rdev;
		}

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);

		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_dist_slot = slot;
			best_dist_rdev = rdev;
		}
	}
	/*
	 * The loop ran to completion (no 'break' forced a slot): pick the
	 * least-busy non-rotational device if one was seen, otherwise the
	 * best slot by distance.  best_dist_slot is -1 when nothing qualified.
	 */
	if (slot >= conf->copies) {
		if (has_nonrot_disk) {
			slot = best_pending_slot;
			rdev = best_pending_rdev;
		} else {
			slot = best_dist_slot;
			rdev = best_dist_rdev;
		}
	}

	if (slot >= 0) {
		/* take the reference before leaving the RCU read-side section */
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}

/*
 * flush_pending_writes() - submit the writes queued on
 * conf->pending_bio_list.
 *
 * The list is detached under device_lock and then processed with the
 * lock dropped.  Does nothing but take and drop the lock if the list is
 * empty.
 */
static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct blk_plug plug;
		struct bio *bio;

		bio = bio_list_get(&conf->pending_bio_list);
		spin_unlock_irq(&conf->device_lock);

		/*
		 * As this is called in a wait_event() loop (see freeze_array),
		 * current->state might be TASK_UNINTERRUPTIBLE which will
		 * cause a warning when we prepare to wait again.  As it is
		 * rare that this path is taken, it is perfectly safe to force
		 * us to go around the wait_event() loop again, so the warning
		 * is a false-positive. Silence the warning by resetting
		 * thread state
		 */
		__set_current_state(TASK_RUNNING);

		blk_start_plug(&plug);
		/* flush any pending bitmap writes to disk
		 * before proceeding w/ I/O */
		md_bitmap_unplug(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) { /* submit pending writes */
			struct bio *next = bio->bi_next;
			/* while queued, bi_bdev holds the target md_rdev pointer */
			struct md_rdev *rdev = (void*)bio->bi_bdev;
			bio->bi_next = NULL;
			bio_set_dev(bio, rdev->bdev);
			if (test_bit(Faulty, &rdev->flags)) {
				bio_io_error(bio);
			} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
					    !bdev_max_discard_sectors(bio->bi_bdev)))
				/* Just ignore it */
				bio_endio(bio);
			else
				submit_bio_noacct(bio);
			bio = next;
		}
		blk_finish_plug(&plug);
	} else
		spin_unlock_irq(&conf->device_lock);
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
*/

/*
 * raise_barrier() - block new regular IO and wait for pending IO to drain.
 * @force: if non-zero, do not wait for waiting regular IO first; only
 *         valid when a barrier is already raised (see the BUG_ON).
 *
 * Also waits until fewer than RESYNC_DEPTH barriers are raised.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);

	spin_unlock_irq(&conf->resync_lock);
}

/* Drop one barrier count and wake everyone sleeping on wait_barrier. */
static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

/*
 * wait_barrier() - register a regular IO, waiting for any barrier to drop.
 * @nowait: if true, do not sleep when a barrier is raised.
 *
 * Returns true with nr_pending incremented, or false (nr_pending
 * untouched) when a barrier is raised and @nowait is set.
 */
static bool wait_barrier(struct r10conf *conf, bool nowait)
{
	bool ret = true;

	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		struct bio_list *bio_list = current->bio_list;
		conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
		/* Return false when nowait flag is set */
		if (nowait) {
			ret = false;
		} else {
			raid10_log(conf->mddev, "wait barrier");
			wait_event_lock_irq(conf->wait_barrier,
					    !conf->barrier ||
					    (atomic_read(&conf->nr_pending) &&
					     bio_list &&
					     (!bio_list_empty(&bio_list[0]) ||
					      !bio_list_empty(&bio_list[1]))) ||
					     /* move on if recovery thread is
					      * blocked by us
					      */
					     (conf->mddev->thread->tsk == current &&
					      test_bit(MD_RECOVERY_RUNNING,
						       &conf->mddev->recovery) &&
					      conf->nr_queued > 0),
					    conf->resync_lock);
		}
		conf->nr_waiting--;
		/* raise_barrier() sleeps until nr_waiting reaches zero */
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	/* Only increment nr_pending when we wait */
	if (ret)
		atomic_inc(&conf->nr_pending);
	spin_unlock_irq(&conf->resync_lock);
	return ret;
}

/*
 * Drop one nr_pending count.  Waiters are woken when it reaches zero,
 * or on every call while an array freeze is pending.
 */
static void allow_barrier(struct r10conf *conf)
{
	if ((atomic_dec_and_test(&conf->nr_pending)) ||
			(conf->array_freeze_pending))
		wake_up(&conf->wait_barrier);
}

/*
 * freeze_array() - quiesce the array.
 * @extra: number of pending requests owned by the caller, which will not
 *         complete or be queued while it waits here.
 *
 * Pair with unfreeze_array().
 */
static void freeze_array(struct r10conf *conf, int extra)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+extra
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->array_freeze_pending++;
	conf->barrier++;
	conf->nr_waiting++;
	/* flush_pending_writes() is the command run with the lock dropped before each sleep */
	wait_event_lock_irq_cmd(conf->wait_barrier,
				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
				conf->resync_lock,
				flush_pending_writes(conf));

	conf->array_freeze_pending--;
	spin_unlock_irq(&conf->resync_lock);
}

/* Undo freeze_array(): drop the barrier and nr_waiting counts, wake waiters. */
static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

/*
 * Return the data offset to use on @rdev for @r10_bio: the new offset
 * only while a reshape is running and the request was not mapped with
 * the previous geometry (R10BIO_Previous), otherwise the current one.
 */
static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

/*
 * raid10_unplug() - block-plug callback that releases plugged writes.
 * @cb:            plug callback embedded in a struct raid1_plug_cb
 * @from_schedule: true when called because the task is about to schedule
 *
 * When called from schedule, or while current->bio_list is set, the
 * writes are moved to conf->pending_bio_list and left to the md thread.
 * Otherwise they are submitted directly.  Frees @cb's container either way.
 */
static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule || current->bio_list) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly. */
	bio = bio_list_get(&plug->pending);
	md_bitmap_unplug(mddev->bitmap);
	wake_up(&conf->wait_barrier);

	while (bio) { /* submit pending writes */
		struct bio *next = bio->bi_next;
		/* while queued, bi_bdev holds the target md_rdev pointer */
		struct md_rdev *rdev = (void*)bio->bi_bdev;
		bio->bi_next = NULL;
		bio_set_dev(bio, rdev->bdev);
		if (test_bit(Faulty, &rdev->flags)) {
			bio_io_error(bio);
		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
				    !bdev_max_discard_sectors(bio->bi_bdev)))
			/* Just ignore it */
			bio_endio(bio);
		else
			submit_bio_noacct(bio);
		bio = next;
	}
	kfree(plug);
}

/*
 * 1. Register the new request and wait if the reconstruction thread has put
 * up a bar for new requests. Continue immediately if no resync is active
 * currently.
 * 2. If IO spans the reshape position.  Need to wait for reshape to pass.
 *
 * Returns true with the request registered (nr_pending held).  Returns
 * false, after completing @bio with a would-block error, when the bio has
 * REQ_NOWAIT set and waiting would have been necessary.
 */
static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
				 struct bio *bio, sector_t sectors)
{
	/* Bail out if REQ_NOWAIT is set for the bio */
	if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
		bio_wouldblock_error(bio);
		return false;
	}
	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    bio->bi_iter.bi_sector < conf->reshape_progress &&
	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
		/* give up our nr_pending count while we sleep */
		allow_barrier(conf);
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return false;
		}
		raid10_log(conf->mddev, "wait reshape");
		wait_event(conf->wait_barrier,
			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
			   conf->reshape_progress >= bio->bi_iter.bi_sector +
			   sectors);
		wait_barrier(conf, false);
	}
	return true;
}

static void raid10_read_request(struct mddev *mddev, struct bio *bio,
				struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	struct bio *read_bio;
	const int op = bio_op(bio);
1140 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); 1141 int max_sectors; 1142 struct md_rdev *rdev; 1143 char b[BDEVNAME_SIZE]; 1144 int slot = r10_bio->read_slot; 1145 struct md_rdev *err_rdev = NULL; 1146 gfp_t gfp = GFP_NOIO; 1147 1148 if (slot >= 0 && r10_bio->devs[slot].rdev) { 1149 /* 1150 * This is an error retry, but we cannot 1151 * safely dereference the rdev in the r10_bio, 1152 * we must use the one in conf. 1153 * If it has already been disconnected (unlikely) 1154 * we lose the device name in error messages. 1155 */ 1156 int disk; 1157 /* 1158 * As we are blocking raid10, it is a little safer to 1159 * use __GFP_HIGH. 1160 */ 1161 gfp = GFP_NOIO | __GFP_HIGH; 1162 1163 rcu_read_lock(); 1164 disk = r10_bio->devs[slot].devnum; 1165 err_rdev = rcu_dereference(conf->mirrors[disk].rdev); 1166 if (err_rdev) 1167 bdevname(err_rdev->bdev, b); 1168 else { 1169 strcpy(b, "???"); 1170 /* This never gets dereferenced */ 1171 err_rdev = r10_bio->devs[slot].rdev; 1172 } 1173 rcu_read_unlock(); 1174 } 1175 1176 if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) 1177 return; 1178 rdev = read_balance(conf, r10_bio, &max_sectors); 1179 if (!rdev) { 1180 if (err_rdev) { 1181 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", 1182 mdname(mddev), b, 1183 (unsigned long long)r10_bio->sector); 1184 } 1185 raid_end_bio_io(r10_bio); 1186 return; 1187 } 1188 if (err_rdev) 1189 pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", 1190 mdname(mddev), 1191 rdev->bdev, 1192 (unsigned long long)r10_bio->sector); 1193 if (max_sectors < bio_sectors(bio)) { 1194 struct bio *split = bio_split(bio, max_sectors, 1195 gfp, &conf->bio_split); 1196 bio_chain(split, bio); 1197 allow_barrier(conf); 1198 submit_bio_noacct(bio); 1199 wait_barrier(conf, false); 1200 bio = split; 1201 r10_bio->master_bio = bio; 1202 r10_bio->sectors = max_sectors; 1203 } 1204 slot = r10_bio->read_slot; 1205 1206 if 
(blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1207 r10_bio->start_time = bio_start_io_acct(bio); 1208 read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 1209 1210 r10_bio->devs[slot].bio = read_bio; 1211 r10_bio->devs[slot].rdev = rdev; 1212 1213 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + 1214 choose_data_offset(r10_bio, rdev); 1215 read_bio->bi_end_io = raid10_end_read_request; 1216 bio_set_op_attrs(read_bio, op, do_sync); 1217 if (test_bit(FailFast, &rdev->flags) && 1218 test_bit(R10BIO_FailFast, &r10_bio->state)) 1219 read_bio->bi_opf |= MD_FAILFAST; 1220 read_bio->bi_private = r10_bio; 1221 1222 if (mddev->gendisk) 1223 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1224 r10_bio->sector); 1225 submit_bio_noacct(read_bio); 1226 return; 1227 } 1228 1229 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, 1230 struct bio *bio, bool replacement, 1231 int n_copy) 1232 { 1233 const int op = bio_op(bio); 1234 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); 1235 const unsigned long do_fua = (bio->bi_opf & REQ_FUA); 1236 unsigned long flags; 1237 struct blk_plug_cb *cb; 1238 struct raid1_plug_cb *plug = NULL; 1239 struct r10conf *conf = mddev->private; 1240 struct md_rdev *rdev; 1241 int devnum = r10_bio->devs[n_copy].devnum; 1242 struct bio *mbio; 1243 1244 if (replacement) { 1245 rdev = conf->mirrors[devnum].replacement; 1246 if (rdev == NULL) { 1247 /* Replacement just got moved to main 'rdev' */ 1248 smp_mb(); 1249 rdev = conf->mirrors[devnum].rdev; 1250 } 1251 } else 1252 rdev = conf->mirrors[devnum].rdev; 1253 1254 mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); 1255 if (replacement) 1256 r10_bio->devs[n_copy].repl_bio = mbio; 1257 else 1258 r10_bio->devs[n_copy].bio = mbio; 1259 1260 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1261 choose_data_offset(r10_bio, rdev)); 1262 mbio->bi_end_io = raid10_end_write_request; 1263 bio_set_op_attrs(mbio, op, 
do_sync | do_fua); 1264 if (!replacement && test_bit(FailFast, 1265 &conf->mirrors[devnum].rdev->flags) 1266 && enough(conf, devnum)) 1267 mbio->bi_opf |= MD_FAILFAST; 1268 mbio->bi_private = r10_bio; 1269 1270 if (conf->mddev->gendisk) 1271 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 1272 r10_bio->sector); 1273 /* flush_pending_writes() needs access to the rdev so...*/ 1274 mbio->bi_bdev = (void *)rdev; 1275 1276 atomic_inc(&r10_bio->remaining); 1277 1278 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1279 if (cb) 1280 plug = container_of(cb, struct raid1_plug_cb, cb); 1281 else 1282 plug = NULL; 1283 if (plug) { 1284 bio_list_add(&plug->pending, mbio); 1285 } else { 1286 spin_lock_irqsave(&conf->device_lock, flags); 1287 bio_list_add(&conf->pending_bio_list, mbio); 1288 spin_unlock_irqrestore(&conf->device_lock, flags); 1289 md_wakeup_thread(mddev->thread); 1290 } 1291 } 1292 1293 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1294 { 1295 int i; 1296 struct r10conf *conf = mddev->private; 1297 struct md_rdev *blocked_rdev; 1298 1299 retry_wait: 1300 blocked_rdev = NULL; 1301 rcu_read_lock(); 1302 for (i = 0; i < conf->copies; i++) { 1303 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1304 struct md_rdev *rrdev = rcu_dereference( 1305 conf->mirrors[i].replacement); 1306 if (rdev == rrdev) 1307 rrdev = NULL; 1308 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1309 atomic_inc(&rdev->nr_pending); 1310 blocked_rdev = rdev; 1311 break; 1312 } 1313 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1314 atomic_inc(&rrdev->nr_pending); 1315 blocked_rdev = rrdev; 1316 break; 1317 } 1318 1319 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1320 sector_t first_bad; 1321 sector_t dev_sector = r10_bio->devs[i].addr; 1322 int bad_sectors; 1323 int is_bad; 1324 1325 /* 1326 * Discard request doesn't care the write result 1327 * so it doesn't need to wait blocked disk here. 
1328 */ 1329 if (!r10_bio->sectors) 1330 continue; 1331 1332 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 1333 &first_bad, &bad_sectors); 1334 if (is_bad < 0) { 1335 /* 1336 * Mustn't write here until the bad block 1337 * is acknowledged 1338 */ 1339 atomic_inc(&rdev->nr_pending); 1340 set_bit(BlockedBadBlocks, &rdev->flags); 1341 blocked_rdev = rdev; 1342 break; 1343 } 1344 } 1345 } 1346 rcu_read_unlock(); 1347 1348 if (unlikely(blocked_rdev)) { 1349 /* Have to wait for this device to get unblocked, then retry */ 1350 allow_barrier(conf); 1351 raid10_log(conf->mddev, "%s wait rdev %d blocked", 1352 __func__, blocked_rdev->raid_disk); 1353 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1354 wait_barrier(conf, false); 1355 goto retry_wait; 1356 } 1357 } 1358 1359 static void raid10_write_request(struct mddev *mddev, struct bio *bio, 1360 struct r10bio *r10_bio) 1361 { 1362 struct r10conf *conf = mddev->private; 1363 int i; 1364 sector_t sectors; 1365 int max_sectors; 1366 1367 if ((mddev_is_clustered(mddev) && 1368 md_cluster_ops->area_resyncing(mddev, WRITE, 1369 bio->bi_iter.bi_sector, 1370 bio_end_sector(bio)))) { 1371 DEFINE_WAIT(w); 1372 /* Bail out if REQ_NOWAIT is set for the bio */ 1373 if (bio->bi_opf & REQ_NOWAIT) { 1374 bio_wouldblock_error(bio); 1375 return; 1376 } 1377 for (;;) { 1378 prepare_to_wait(&conf->wait_barrier, 1379 &w, TASK_IDLE); 1380 if (!md_cluster_ops->area_resyncing(mddev, WRITE, 1381 bio->bi_iter.bi_sector, bio_end_sector(bio))) 1382 break; 1383 schedule(); 1384 } 1385 finish_wait(&conf->wait_barrier, &w); 1386 } 1387 1388 sectors = r10_bio->sectors; 1389 if (!regular_request_wait(mddev, conf, bio, sectors)) 1390 return; 1391 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1392 (mddev->reshape_backwards 1393 ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && 1394 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) 1395 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && 1396 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1397 /* Need to update reshape_position in metadata */ 1398 mddev->reshape_position = conf->reshape_progress; 1399 set_mask_bits(&mddev->sb_flags, 0, 1400 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 1401 md_wakeup_thread(mddev->thread); 1402 if (bio->bi_opf & REQ_NOWAIT) { 1403 allow_barrier(conf); 1404 bio_wouldblock_error(bio); 1405 return; 1406 } 1407 raid10_log(conf->mddev, "wait reshape metadata"); 1408 wait_event(mddev->sb_wait, 1409 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 1410 1411 conf->reshape_safe = mddev->reshape_position; 1412 } 1413 1414 /* first select target devices under rcu_lock and 1415 * inc refcount on their rdev. Record them by setting 1416 * bios[x] to bio 1417 * If there are known/acknowledged bad blocks on any device 1418 * on which we have seen a write error, we want to avoid 1419 * writing to those blocks. This potentially requires several 1420 * writes to write around the bad blocks. Each set of writes 1421 * gets its own r10_bio with a set of bios attached. 
1422 */ 1423 1424 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1425 raid10_find_phys(conf, r10_bio); 1426 1427 wait_blocked_dev(mddev, r10_bio); 1428 1429 rcu_read_lock(); 1430 max_sectors = r10_bio->sectors; 1431 1432 for (i = 0; i < conf->copies; i++) { 1433 int d = r10_bio->devs[i].devnum; 1434 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1435 struct md_rdev *rrdev = rcu_dereference( 1436 conf->mirrors[d].replacement); 1437 if (rdev == rrdev) 1438 rrdev = NULL; 1439 if (rdev && (test_bit(Faulty, &rdev->flags))) 1440 rdev = NULL; 1441 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1442 rrdev = NULL; 1443 1444 r10_bio->devs[i].bio = NULL; 1445 r10_bio->devs[i].repl_bio = NULL; 1446 1447 if (!rdev && !rrdev) { 1448 set_bit(R10BIO_Degraded, &r10_bio->state); 1449 continue; 1450 } 1451 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1452 sector_t first_bad; 1453 sector_t dev_sector = r10_bio->devs[i].addr; 1454 int bad_sectors; 1455 int is_bad; 1456 1457 is_bad = is_badblock(rdev, dev_sector, max_sectors, 1458 &first_bad, &bad_sectors); 1459 if (is_bad && first_bad <= dev_sector) { 1460 /* Cannot write here at all */ 1461 bad_sectors -= (dev_sector - first_bad); 1462 if (bad_sectors < max_sectors) 1463 /* Mustn't write more than bad_sectors 1464 * to other devices yet 1465 */ 1466 max_sectors = bad_sectors; 1467 /* We don't set R10BIO_Degraded as that 1468 * only applies if the disk is missing, 1469 * so it might be re-added, and we want to 1470 * know to recover this chunk. 1471 * In this case the device is here, and the 1472 * fact that this chunk is not in-sync is 1473 * recorded in the bad block log. 
1474 */ 1475 continue; 1476 } 1477 if (is_bad) { 1478 int good_sectors = first_bad - dev_sector; 1479 if (good_sectors < max_sectors) 1480 max_sectors = good_sectors; 1481 } 1482 } 1483 if (rdev) { 1484 r10_bio->devs[i].bio = bio; 1485 atomic_inc(&rdev->nr_pending); 1486 } 1487 if (rrdev) { 1488 r10_bio->devs[i].repl_bio = bio; 1489 atomic_inc(&rrdev->nr_pending); 1490 } 1491 } 1492 rcu_read_unlock(); 1493 1494 if (max_sectors < r10_bio->sectors) 1495 r10_bio->sectors = max_sectors; 1496 1497 if (r10_bio->sectors < bio_sectors(bio)) { 1498 struct bio *split = bio_split(bio, r10_bio->sectors, 1499 GFP_NOIO, &conf->bio_split); 1500 bio_chain(split, bio); 1501 allow_barrier(conf); 1502 submit_bio_noacct(bio); 1503 wait_barrier(conf, false); 1504 bio = split; 1505 r10_bio->master_bio = bio; 1506 } 1507 1508 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1509 r10_bio->start_time = bio_start_io_acct(bio); 1510 atomic_set(&r10_bio->remaining, 1); 1511 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1512 1513 for (i = 0; i < conf->copies; i++) { 1514 if (r10_bio->devs[i].bio) 1515 raid10_write_one_disk(mddev, r10_bio, bio, false, i); 1516 if (r10_bio->devs[i].repl_bio) 1517 raid10_write_one_disk(mddev, r10_bio, bio, true, i); 1518 } 1519 one_write_done(r10_bio); 1520 } 1521 1522 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) 1523 { 1524 struct r10conf *conf = mddev->private; 1525 struct r10bio *r10_bio; 1526 1527 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1528 1529 r10_bio->master_bio = bio; 1530 r10_bio->sectors = sectors; 1531 1532 r10_bio->mddev = mddev; 1533 r10_bio->sector = bio->bi_iter.bi_sector; 1534 r10_bio->state = 0; 1535 r10_bio->read_slot = -1; 1536 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 1537 conf->geo.raid_disks); 1538 1539 if (bio_data_dir(bio) == READ) 1540 raid10_read_request(mddev, bio, r10_bio); 1541 else 1542 raid10_write_request(mddev, bio, r10_bio); 1543 } 1544 
1545 static void raid_end_discard_bio(struct r10bio *r10bio) 1546 { 1547 struct r10conf *conf = r10bio->mddev->private; 1548 struct r10bio *first_r10bio; 1549 1550 while (atomic_dec_and_test(&r10bio->remaining)) { 1551 1552 allow_barrier(conf); 1553 1554 if (!test_bit(R10BIO_Discard, &r10bio->state)) { 1555 first_r10bio = (struct r10bio *)r10bio->master_bio; 1556 free_r10bio(r10bio); 1557 r10bio = first_r10bio; 1558 } else { 1559 md_write_end(r10bio->mddev); 1560 bio_endio(r10bio->master_bio); 1561 free_r10bio(r10bio); 1562 break; 1563 } 1564 } 1565 } 1566 1567 static void raid10_end_discard_request(struct bio *bio) 1568 { 1569 struct r10bio *r10_bio = bio->bi_private; 1570 struct r10conf *conf = r10_bio->mddev->private; 1571 struct md_rdev *rdev = NULL; 1572 int dev; 1573 int slot, repl; 1574 1575 /* 1576 * We don't care the return value of discard bio 1577 */ 1578 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 1579 set_bit(R10BIO_Uptodate, &r10_bio->state); 1580 1581 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1582 if (repl) 1583 rdev = conf->mirrors[dev].replacement; 1584 if (!rdev) { 1585 /* 1586 * raid10_remove_disk uses smp_mb to make sure rdev is set to 1587 * replacement before setting replacement to NULL. It can read 1588 * rdev first without barrier protect even replacment is NULL 1589 */ 1590 smp_rmb(); 1591 rdev = conf->mirrors[dev].rdev; 1592 } 1593 1594 raid_end_discard_bio(r10_bio); 1595 rdev_dec_pending(rdev, conf->mddev); 1596 } 1597 1598 /* 1599 * There are some limitations to handle discard bio 1600 * 1st, the discard size is bigger than stripe_size*2. 
1601 * 2st, if the discard bio spans reshape progress, we use the old way to 1602 * handle discard bio 1603 */ 1604 static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) 1605 { 1606 struct r10conf *conf = mddev->private; 1607 struct geom *geo = &conf->geo; 1608 int far_copies = geo->far_copies; 1609 bool first_copy = true; 1610 struct r10bio *r10_bio, *first_r10bio; 1611 struct bio *split; 1612 int disk; 1613 sector_t chunk; 1614 unsigned int stripe_size; 1615 unsigned int stripe_data_disks; 1616 sector_t split_size; 1617 sector_t bio_start, bio_end; 1618 sector_t first_stripe_index, last_stripe_index; 1619 sector_t start_disk_offset; 1620 unsigned int start_disk_index; 1621 sector_t end_disk_offset; 1622 unsigned int end_disk_index; 1623 unsigned int remainder; 1624 1625 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1626 return -EAGAIN; 1627 1628 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { 1629 bio_wouldblock_error(bio); 1630 return 0; 1631 } 1632 wait_barrier(conf, false); 1633 1634 /* 1635 * Check reshape again to avoid reshape happens after checking 1636 * MD_RECOVERY_RESHAPE and before wait_barrier 1637 */ 1638 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1639 goto out; 1640 1641 if (geo->near_copies) 1642 stripe_data_disks = geo->raid_disks / geo->near_copies + 1643 geo->raid_disks % geo->near_copies; 1644 else 1645 stripe_data_disks = geo->raid_disks; 1646 1647 stripe_size = stripe_data_disks << geo->chunk_shift; 1648 1649 bio_start = bio->bi_iter.bi_sector; 1650 bio_end = bio_end_sector(bio); 1651 1652 /* 1653 * Maybe one discard bio is smaller than strip size or across one 1654 * stripe and discard region is larger than one stripe size. For far 1655 * offset layout, if the discard region is not aligned with stripe 1656 * size, there is hole when we submit discard bio to member disk. 
1657 * For simplicity, we only handle discard bio which discard region 1658 * is bigger than stripe_size * 2 1659 */ 1660 if (bio_sectors(bio) < stripe_size*2) 1661 goto out; 1662 1663 /* 1664 * Keep bio aligned with strip size. 1665 */ 1666 div_u64_rem(bio_start, stripe_size, &remainder); 1667 if (remainder) { 1668 split_size = stripe_size - remainder; 1669 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1670 bio_chain(split, bio); 1671 allow_barrier(conf); 1672 /* Resend the fist split part */ 1673 submit_bio_noacct(split); 1674 wait_barrier(conf, false); 1675 } 1676 div_u64_rem(bio_end, stripe_size, &remainder); 1677 if (remainder) { 1678 split_size = bio_sectors(bio) - remainder; 1679 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1680 bio_chain(split, bio); 1681 allow_barrier(conf); 1682 /* Resend the second split part */ 1683 submit_bio_noacct(bio); 1684 bio = split; 1685 wait_barrier(conf, false); 1686 } 1687 1688 bio_start = bio->bi_iter.bi_sector; 1689 bio_end = bio_end_sector(bio); 1690 1691 /* 1692 * Raid10 uses chunk as the unit to store data. It's similar like raid0. 1693 * One stripe contains the chunks from all member disk (one chunk from 1694 * one disk at the same HBA address). 
For layout detail, see 'man md 4' 1695 */ 1696 chunk = bio_start >> geo->chunk_shift; 1697 chunk *= geo->near_copies; 1698 first_stripe_index = chunk; 1699 start_disk_index = sector_div(first_stripe_index, geo->raid_disks); 1700 if (geo->far_offset) 1701 first_stripe_index *= geo->far_copies; 1702 start_disk_offset = (bio_start & geo->chunk_mask) + 1703 (first_stripe_index << geo->chunk_shift); 1704 1705 chunk = bio_end >> geo->chunk_shift; 1706 chunk *= geo->near_copies; 1707 last_stripe_index = chunk; 1708 end_disk_index = sector_div(last_stripe_index, geo->raid_disks); 1709 if (geo->far_offset) 1710 last_stripe_index *= geo->far_copies; 1711 end_disk_offset = (bio_end & geo->chunk_mask) + 1712 (last_stripe_index << geo->chunk_shift); 1713 1714 retry_discard: 1715 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1716 r10_bio->mddev = mddev; 1717 r10_bio->state = 0; 1718 r10_bio->sectors = 0; 1719 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); 1720 wait_blocked_dev(mddev, r10_bio); 1721 1722 /* 1723 * For far layout it needs more than one r10bio to cover all regions. 1724 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio 1725 * to record the discard bio. Other r10bio->master_bio record the first 1726 * r10bio. The first r10bio only release after all other r10bios finish. 1727 * The discard bio returns only first r10bio finishes 1728 */ 1729 if (first_copy) { 1730 r10_bio->master_bio = bio; 1731 set_bit(R10BIO_Discard, &r10_bio->state); 1732 first_copy = false; 1733 first_r10bio = r10_bio; 1734 } else 1735 r10_bio->master_bio = (struct bio *)first_r10bio; 1736 1737 /* 1738 * first select target devices under rcu_lock and 1739 * inc refcount on their rdev. 
Record them by setting 1740 * bios[x] to bio 1741 */ 1742 rcu_read_lock(); 1743 for (disk = 0; disk < geo->raid_disks; disk++) { 1744 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1745 struct md_rdev *rrdev = rcu_dereference( 1746 conf->mirrors[disk].replacement); 1747 1748 r10_bio->devs[disk].bio = NULL; 1749 r10_bio->devs[disk].repl_bio = NULL; 1750 1751 if (rdev && (test_bit(Faulty, &rdev->flags))) 1752 rdev = NULL; 1753 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1754 rrdev = NULL; 1755 if (!rdev && !rrdev) 1756 continue; 1757 1758 if (rdev) { 1759 r10_bio->devs[disk].bio = bio; 1760 atomic_inc(&rdev->nr_pending); 1761 } 1762 if (rrdev) { 1763 r10_bio->devs[disk].repl_bio = bio; 1764 atomic_inc(&rrdev->nr_pending); 1765 } 1766 } 1767 rcu_read_unlock(); 1768 1769 atomic_set(&r10_bio->remaining, 1); 1770 for (disk = 0; disk < geo->raid_disks; disk++) { 1771 sector_t dev_start, dev_end; 1772 struct bio *mbio, *rbio = NULL; 1773 1774 /* 1775 * Now start to calculate the start and end address for each disk. 1776 * The space between dev_start and dev_end is the discard region. 1777 * 1778 * For dev_start, it needs to consider three conditions: 1779 * 1st, the disk is before start_disk, you can imagine the disk in 1780 * the next stripe. So the dev_start is the start address of next 1781 * stripe. 
1782 * 2st, the disk is after start_disk, it means the disk is at the 1783 * same stripe of first disk 1784 * 3st, the first disk itself, we can use start_disk_offset directly 1785 */ 1786 if (disk < start_disk_index) 1787 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; 1788 else if (disk > start_disk_index) 1789 dev_start = first_stripe_index * mddev->chunk_sectors; 1790 else 1791 dev_start = start_disk_offset; 1792 1793 if (disk < end_disk_index) 1794 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 1795 else if (disk > end_disk_index) 1796 dev_end = last_stripe_index * mddev->chunk_sectors; 1797 else 1798 dev_end = end_disk_offset; 1799 1800 /* 1801 * It only handles discard bio which size is >= stripe size, so 1802 * dev_end > dev_start all the time. 1803 * It doesn't need to use rcu lock to get rdev here. We already 1804 * add rdev->nr_pending in the first loop. 1805 */ 1806 if (r10_bio->devs[disk].bio) { 1807 struct md_rdev *rdev = conf->mirrors[disk].rdev; 1808 mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1809 &mddev->bio_set); 1810 mbio->bi_end_io = raid10_end_discard_request; 1811 mbio->bi_private = r10_bio; 1812 r10_bio->devs[disk].bio = mbio; 1813 r10_bio->devs[disk].devnum = disk; 1814 atomic_inc(&r10_bio->remaining); 1815 md_submit_discard_bio(mddev, rdev, mbio, 1816 dev_start + choose_data_offset(r10_bio, rdev), 1817 dev_end - dev_start); 1818 bio_endio(mbio); 1819 } 1820 if (r10_bio->devs[disk].repl_bio) { 1821 struct md_rdev *rrdev = conf->mirrors[disk].replacement; 1822 rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1823 &mddev->bio_set); 1824 rbio->bi_end_io = raid10_end_discard_request; 1825 rbio->bi_private = r10_bio; 1826 r10_bio->devs[disk].repl_bio = rbio; 1827 r10_bio->devs[disk].devnum = disk; 1828 atomic_inc(&r10_bio->remaining); 1829 md_submit_discard_bio(mddev, rrdev, rbio, 1830 dev_start + choose_data_offset(r10_bio, rrdev), 1831 dev_end - dev_start); 1832 bio_endio(rbio); 1833 } 1834 } 1835 1836 if 
(!geo->far_offset && --far_copies) { 1837 first_stripe_index += geo->stride >> geo->chunk_shift; 1838 start_disk_offset += geo->stride; 1839 last_stripe_index += geo->stride >> geo->chunk_shift; 1840 end_disk_offset += geo->stride; 1841 atomic_inc(&first_r10bio->remaining); 1842 raid_end_discard_bio(r10_bio); 1843 wait_barrier(conf, false); 1844 goto retry_discard; 1845 } 1846 1847 raid_end_discard_bio(r10_bio); 1848 1849 return 0; 1850 out: 1851 allow_barrier(conf); 1852 return -EAGAIN; 1853 } 1854 1855 static bool raid10_make_request(struct mddev *mddev, struct bio *bio) 1856 { 1857 struct r10conf *conf = mddev->private; 1858 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1859 int chunk_sects = chunk_mask + 1; 1860 int sectors = bio_sectors(bio); 1861 1862 if (unlikely(bio->bi_opf & REQ_PREFLUSH) 1863 && md_flush_request(mddev, bio)) 1864 return true; 1865 1866 if (!md_write_start(mddev, bio)) 1867 return false; 1868 1869 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1870 if (!raid10_handle_discard(mddev, bio)) 1871 return true; 1872 1873 /* 1874 * If this request crosses a chunk boundary, we need to split 1875 * it. 
1876 */ 1877 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + 1878 sectors > chunk_sects 1879 && (conf->geo.near_copies < conf->geo.raid_disks 1880 || conf->prev.near_copies < 1881 conf->prev.raid_disks))) 1882 sectors = chunk_sects - 1883 (bio->bi_iter.bi_sector & 1884 (chunk_sects - 1)); 1885 __make_request(mddev, bio, sectors); 1886 1887 /* In case raid10d snuck in to freeze_array */ 1888 wake_up(&conf->wait_barrier); 1889 return true; 1890 } 1891 1892 static void raid10_status(struct seq_file *seq, struct mddev *mddev) 1893 { 1894 struct r10conf *conf = mddev->private; 1895 int i; 1896 1897 if (conf->geo.near_copies < conf->geo.raid_disks) 1898 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1899 if (conf->geo.near_copies > 1) 1900 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1901 if (conf->geo.far_copies > 1) { 1902 if (conf->geo.far_offset) 1903 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1904 else 1905 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1906 if (conf->geo.far_set_size != conf->geo.raid_disks) 1907 seq_printf(seq, " %d devices per set", conf->geo.far_set_size); 1908 } 1909 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1910 conf->geo.raid_disks - mddev->degraded); 1911 rcu_read_lock(); 1912 for (i = 0; i < conf->geo.raid_disks; i++) { 1913 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1914 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1915 } 1916 rcu_read_unlock(); 1917 seq_printf(seq, "]"); 1918 } 1919 1920 /* check if there are enough drives for 1921 * every block to appear on atleast one. 1922 * Don't consider the device numbered 'ignore' 1923 * as we might be about to remove it. 
1924 */ 1925 static int _enough(struct r10conf *conf, int previous, int ignore) 1926 { 1927 int first = 0; 1928 int has_enough = 0; 1929 int disks, ncopies; 1930 if (previous) { 1931 disks = conf->prev.raid_disks; 1932 ncopies = conf->prev.near_copies; 1933 } else { 1934 disks = conf->geo.raid_disks; 1935 ncopies = conf->geo.near_copies; 1936 } 1937 1938 rcu_read_lock(); 1939 do { 1940 int n = conf->copies; 1941 int cnt = 0; 1942 int this = first; 1943 while (n--) { 1944 struct md_rdev *rdev; 1945 if (this != ignore && 1946 (rdev = rcu_dereference(conf->mirrors[this].rdev)) && 1947 test_bit(In_sync, &rdev->flags)) 1948 cnt++; 1949 this = (this+1) % disks; 1950 } 1951 if (cnt == 0) 1952 goto out; 1953 first = (first + ncopies) % disks; 1954 } while (first != 0); 1955 has_enough = 1; 1956 out: 1957 rcu_read_unlock(); 1958 return has_enough; 1959 } 1960 1961 static int enough(struct r10conf *conf, int ignore) 1962 { 1963 /* when calling 'enough', both 'prev' and 'geo' must 1964 * be stable. 1965 * This is ensured if ->reconfig_mutex or ->device_lock 1966 * is held. 1967 */ 1968 return _enough(conf, 0, ignore) && 1969 _enough(conf, 1, ignore); 1970 } 1971 1972 /** 1973 * raid10_error() - RAID10 error handler. 1974 * @mddev: affected md device. 1975 * @rdev: member device to fail. 1976 * 1977 * The routine acknowledges &rdev failure and determines new @mddev state. 1978 * If it failed, then: 1979 * - &MD_BROKEN flag is set in &mddev->flags. 1980 * Otherwise, it must be degraded: 1981 * - recovery is interrupted. 1982 * - &mddev->degraded is bumped. 1983 1984 * @rdev is marked as &Faulty excluding case when array is failed and 1985 * &mddev->fail_last_dev is off. 
*/
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	unsigned long flags;

	/* Every state change below is made with conf->device_lock held. */
	spin_lock_irqsave(&conf->device_lock, flags);

	/*
	 * enough() is only consulted for a device that is still In_sync.
	 * When it reports false the array is flagged MD_BROKEN and, unless
	 * fail_last_dev is set, we return without marking rdev Faulty.
	 */
	if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
		set_bit(MD_BROKEN, &mddev->flags);

		if (!mddev->fail_last_dev) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			return;
		}
	}
	/* Count the device as missing only on the In_sync -> !In_sync edge. */
	if (test_and_clear_bit(In_sync, &rdev->flags))
		mddev->degraded++;

	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	spin_unlock_irqrestore(&conf->device_lock, flags);
	pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
		"md/raid10:%s: Operation continuing on %d devices.\n",
		mdname(mddev), rdev->bdev,
		mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

/*
 * Dump the array geometry and the state of every populated ->rdev slot
 * through pr_debug().  A NULL conf is reported and otherwise ignored.
 * Replacement devices are not printed.
 */
static void print_conf(struct r10conf *conf)
{
	int i;
	struct md_rdev *rdev;

	pr_debug("RAID10 conf printout:\n");
	if (!conf) {
		pr_debug("(!conf)\n");
		return;
	}
	/* wd: raid_disks minus mddev->degraded, rd: raid_disks */
	pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
		 conf->geo.raid_disks);

	/* This is only called with ->reconfig_mutex held, so
	 * rcu protection of rdev is not needed */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		rdev = conf->mirrors[i].rdev;
		if (rdev)
			/* wo: 1 if not In_sync, o: 1 if not Faulty */
			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
				 i, !test_bit(In_sync, &rdev->flags),
				 !test_bit(Faulty, &rdev->flags),
				 rdev->bdev);
	}
}

/*
 * Pass through the barrier once (wait_barrier() immediately followed by
 * allow_barrier()) and then tear down the r10buf mempool.
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf, false);
	allow_barrier(conf);

	mempool_exit(&conf->r10buf_pool);
}

/*
 * Mark fully recovered devices In_sync.
 *
 * For each slot a recovered, non-Faulty replacement takes precedence over
 * the original rdev; when a replacement becomes active the device it
 * replaces is marked Faulty.  mddev->degraded is reduced by the number of
 * slots that gained an In_sync device.
 *
 * Returns that number.
 */
static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active */
			/*
			 * The slot only counts as newly in-sync when the
			 * original was absent or was not In_sync itself.
			 */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && tmp->rdev->recovery_offset == MaxSector
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	/* Only the degraded counter itself is updated under device_lock. */
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}

/*
 * Attach @rdev to the array, either in an empty slot or as the replacement
 * for a device flagged WantReplacement.
 *
 * Returns 0 on success, -EBUSY while mddev->recovery_cp < MaxSector,
 * -EINVAL when rdev has no saved slot and _enough(conf, 1, -1) fails,
 * -ENXIO when md_integrity_add_rdev() fails, and -EEXIST when no
 * candidate slot accepted the device.
 */
static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror;
	int first = 0;
	int last = conf->geo.raid_disks - 1;

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
		return -EINVAL;

	if (md_integrity_add_rdev(rdev, mddev))
		return -ENXIO;

	/* A preset raid_disk restricts the search to that single slot. */
	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/* Start at the previously used slot when it is in range and empty. */
	if (rdev->saved_raid_disk >= first &&
	    rdev->saved_raid_disk < conf->geo.raid_disks &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		struct raid10_info *p = &conf->mirrors[mirror];
		/* Skip slots whose recovery_disabled matches the array's. */
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			/*
			 * Occupied slot: usable only as a replacement, and
			 * only if the occupant wants one and has none yet.
			 */
			if (!test_bit(WantReplacement, &p->rdev->flags) ||
			    p->replacement != NULL)
				continue;
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = mirror;
			err = 0;
			if (mddev->gendisk)
				disk_stack_limits(mddev->gendisk, rdev->bdev,
						  rdev->data_offset << 9);
			conf->fullsync = 1;
			/* Publish last, after rdev has been fully set up. */
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}

		if (mddev->gendisk)
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);

		p->head_position = 0;
		/* Make the slot differ from mddev->recovery_disabled. */
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		/* Landing in a different slot than before forces a full sync. */
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		/* Publish last, after rdev has been fully set up. */
		rcu_assign_pointer(p->rdev, rdev);
		break;
	}

	print_conf(conf);
	return err;
}

/*
 * Detach @rdev from the slot given by rdev->raid_disk.
 *
 * Returns 0 when the device was removed or is not attached to that slot,
 * -EBUSY when it is In_sync, still has pending I/O, or is a non-faulty
 * device that must be kept; otherwise the result of
 * md_integrity_register().
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	/* Work out which of the two pointers in the slot refers to rdev. */
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
		/*
		 * Wait for RCU readers that may still see the old pointer,
		 * then re-check that none of them took a reference.
		 */
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			*rdevp = rdev;
			goto abort;
		}
	}
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		/* Promote the replacement into the primary position. */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither -- if they are careful.
			   */
		p->replacement = NULL;
	}

	clear_bit(WantReplacement, &rdev->flags);
	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}

/*
 * Common completion work for resync, recovery and reshape reads.
 * @d is the index into conf->mirrors of the device that was read.
 */
static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
	struct r10conf *conf = r10_bio->mddev->private;

	if (!bio->bi_status)
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}

/*
 * bi_end_io for resync/recovery reads: look up the owning r10bio and the
 * device the bio was issued to, then run the common completion path.
 */
static void end_sync_read(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct r10conf *conf = r10_bio->mddev->private;
	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	__end_sync_read(r10_bio, bio, d);
}

/*
 * bi_end_io for reshape reads.  The r10bio comes straight from bi_private
 * and r10_bio->read_slot is passed as the device index.
 */
static void end_reshape_read(struct bio *bio)
{
	/* reshape read bio isn't allocated from r10buf_pool */
	struct r10bio *r10_bio = bio->bi_private;

	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
}

/*
 * Drop one reference on @r10_bio.  Each time a count reaches zero the
 * r10bio is either handed to reschedule_retry() (MadeGood or WriteError
 * set) or released with put_buf(), and the walk continues with the r10bio
 * linked through ->master_bio.  The r10bio whose ->master_bio is NULL ends
 * the chain and reports completion through md_done_sync().
 */
static void end_sync_request(struct r10bio *r10_bio)
{
	struct mddev *mddev = r10_bio->mddev;

	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			/* Sample the size first: r10_bio may be released below. */
			sector_t s = r10_bio->sectors;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			md_done_sync(mddev, s, 1);
			break;
		} else {
			/* Fetch the link first: r10_bio may be released below. */
			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
}

/*
 * bi_end_io for resync/recovery writes.
 *
 * A failed write to a replacement fails that device; a failed write to the
 * original device flags it WriteErrorSeen/WantReplacement and marks the
 * r10bio R10BIO_WriteError.  A successful write over a range that
 * is_badblock() reports marks the r10bio R10BIO_MadeGood.
 */
static void end_sync_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	sector_t first_bad;
	int bad_sectors;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	/* repl is used below to choose between ->replacement and ->rdev. */
	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	else
		rdev = conf->mirrors[d].rdev;

	if (bio->bi_status) {
		if (repl)
			md_error(mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else if (is_badblock(rdev,
			     r10_bio->devs[slot].addr,
			     r10_bio->sectors,
			     &first_bad, &bad_sectors))
		set_bit(R10BIO_MadeGood, &r10_bio->state);

	rdev_dec_pending(rdev, mddev);

	end_sync_request(r10_bio);
}

/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
2345 */ 2346 /* 2347 * We check if all blocks are in-sync and only write to blocks that 2348 * aren't in sync 2349 */ 2350 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2351 { 2352 struct r10conf *conf = mddev->private; 2353 int i, first; 2354 struct bio *tbio, *fbio; 2355 int vcnt; 2356 struct page **tpages, **fpages; 2357 2358 atomic_set(&r10_bio->remaining, 1); 2359 2360 /* find the first device with a block */ 2361 for (i=0; i<conf->copies; i++) 2362 if (!r10_bio->devs[i].bio->bi_status) 2363 break; 2364 2365 if (i == conf->copies) 2366 goto done; 2367 2368 first = i; 2369 fbio = r10_bio->devs[i].bio; 2370 fbio->bi_iter.bi_size = r10_bio->sectors << 9; 2371 fbio->bi_iter.bi_idx = 0; 2372 fpages = get_resync_pages(fbio)->pages; 2373 2374 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 2375 /* now find blocks with errors */ 2376 for (i=0 ; i < conf->copies ; i++) { 2377 int j, d; 2378 struct md_rdev *rdev; 2379 struct resync_pages *rp; 2380 2381 tbio = r10_bio->devs[i].bio; 2382 2383 if (tbio->bi_end_io != end_sync_read) 2384 continue; 2385 if (i == first) 2386 continue; 2387 2388 tpages = get_resync_pages(tbio)->pages; 2389 d = r10_bio->devs[i].devnum; 2390 rdev = conf->mirrors[d].rdev; 2391 if (!r10_bio->devs[i].bio->bi_status) { 2392 /* We know that the bi_io_vec layout is the same for 2393 * both 'first' and 'i', so we just compare them. 2394 * All vec entries are PAGE_SIZE; 2395 */ 2396 int sectors = r10_bio->sectors; 2397 for (j = 0; j < vcnt; j++) { 2398 int len = PAGE_SIZE; 2399 if (sectors < (len / 512)) 2400 len = sectors * 512; 2401 if (memcmp(page_address(fpages[j]), 2402 page_address(tpages[j]), 2403 len)) 2404 break; 2405 sectors -= len/512; 2406 } 2407 if (j == vcnt) 2408 continue; 2409 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2410 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2411 /* Don't fix anything. 
*/ 2412 continue; 2413 } else if (test_bit(FailFast, &rdev->flags)) { 2414 /* Just give up on this device */ 2415 md_error(rdev->mddev, rdev); 2416 continue; 2417 } 2418 /* Ok, we need to write this bio, either to correct an 2419 * inconsistency or to correct an unreadable block. 2420 * First we need to fixup bv_offset, bv_len and 2421 * bi_vecs, as the read request might have corrupted these 2422 */ 2423 rp = get_resync_pages(tbio); 2424 bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); 2425 2426 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); 2427 2428 rp->raid_bio = r10_bio; 2429 tbio->bi_private = rp; 2430 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; 2431 tbio->bi_end_io = end_sync_write; 2432 2433 bio_copy_data(tbio, fbio); 2434 2435 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2436 atomic_inc(&r10_bio->remaining); 2437 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2438 2439 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) 2440 tbio->bi_opf |= MD_FAILFAST; 2441 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; 2442 submit_bio_noacct(tbio); 2443 } 2444 2445 /* Now write out to any replacement devices 2446 * that are active 2447 */ 2448 for (i = 0; i < conf->copies; i++) { 2449 int d; 2450 2451 tbio = r10_bio->devs[i].repl_bio; 2452 if (!tbio || !tbio->bi_end_io) 2453 continue; 2454 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 2455 && r10_bio->devs[i].bio != fbio) 2456 bio_copy_data(tbio, fbio); 2457 d = r10_bio->devs[i].devnum; 2458 atomic_inc(&r10_bio->remaining); 2459 md_sync_acct(conf->mirrors[d].replacement->bdev, 2460 bio_sectors(tbio)); 2461 submit_bio_noacct(tbio); 2462 } 2463 2464 done: 2465 if (atomic_dec_and_test(&r10_bio->remaining)) { 2466 md_done_sync(mddev, r10_bio->sectors, 1); 2467 put_buf(r10_bio); 2468 } 2469 } 2470 2471 /* 2472 * Now for the recovery code. 2473 * Recovery happens across physical sectors. 
2474 * We recover all non-is_sync drives by finding the virtual address of 2475 * each, and then choose a working drive that also has that virt address. 2476 * There is a separate r10_bio for each non-in_sync drive. 2477 * Only the first two slots are in use. The first for reading, 2478 * The second for writing. 2479 * 2480 */ 2481 static void fix_recovery_read_error(struct r10bio *r10_bio) 2482 { 2483 /* We got a read error during recovery. 2484 * We repeat the read in smaller page-sized sections. 2485 * If a read succeeds, write it to the new device or record 2486 * a bad block if we cannot. 2487 * If a read fails, record a bad block on both old and 2488 * new devices. 2489 */ 2490 struct mddev *mddev = r10_bio->mddev; 2491 struct r10conf *conf = mddev->private; 2492 struct bio *bio = r10_bio->devs[0].bio; 2493 sector_t sect = 0; 2494 int sectors = r10_bio->sectors; 2495 int idx = 0; 2496 int dr = r10_bio->devs[0].devnum; 2497 int dw = r10_bio->devs[1].devnum; 2498 struct page **pages = get_resync_pages(bio)->pages; 2499 2500 while (sectors) { 2501 int s = sectors; 2502 struct md_rdev *rdev; 2503 sector_t addr; 2504 int ok; 2505 2506 if (s > (PAGE_SIZE>>9)) 2507 s = PAGE_SIZE >> 9; 2508 2509 rdev = conf->mirrors[dr].rdev; 2510 addr = r10_bio->devs[0].addr + sect, 2511 ok = sync_page_io(rdev, 2512 addr, 2513 s << 9, 2514 pages[idx], 2515 REQ_OP_READ, 0, false); 2516 if (ok) { 2517 rdev = conf->mirrors[dw].rdev; 2518 addr = r10_bio->devs[1].addr + sect; 2519 ok = sync_page_io(rdev, 2520 addr, 2521 s << 9, 2522 pages[idx], 2523 REQ_OP_WRITE, 0, false); 2524 if (!ok) { 2525 set_bit(WriteErrorSeen, &rdev->flags); 2526 if (!test_and_set_bit(WantReplacement, 2527 &rdev->flags)) 2528 set_bit(MD_RECOVERY_NEEDED, 2529 &rdev->mddev->recovery); 2530 } 2531 } 2532 if (!ok) { 2533 /* We don't worry if we cannot set a bad block - 2534 * it really is bad so there is no loss in not 2535 * recording it yet 2536 */ 2537 rdev_set_badblocks(rdev, addr, s, 0); 2538 2539 if (rdev != 
conf->mirrors[dw].rdev) { 2540 /* need bad block on destination too */ 2541 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2542 addr = r10_bio->devs[1].addr + sect; 2543 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2544 if (!ok) { 2545 /* just abort the recovery */ 2546 pr_notice("md/raid10:%s: recovery aborted due to read error\n", 2547 mdname(mddev)); 2548 2549 conf->mirrors[dw].recovery_disabled 2550 = mddev->recovery_disabled; 2551 set_bit(MD_RECOVERY_INTR, 2552 &mddev->recovery); 2553 break; 2554 } 2555 } 2556 } 2557 2558 sectors -= s; 2559 sect += s; 2560 idx++; 2561 } 2562 } 2563 2564 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2565 { 2566 struct r10conf *conf = mddev->private; 2567 int d; 2568 struct bio *wbio, *wbio2; 2569 2570 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2571 fix_recovery_read_error(r10_bio); 2572 end_sync_request(r10_bio); 2573 return; 2574 } 2575 2576 /* 2577 * share the pages with the first bio 2578 * and submit the write request 2579 */ 2580 d = r10_bio->devs[1].devnum; 2581 wbio = r10_bio->devs[1].bio; 2582 wbio2 = r10_bio->devs[1].repl_bio; 2583 /* Need to test wbio2->bi_end_io before we call 2584 * submit_bio_noacct as if the former is NULL, 2585 * the latter is free to free wbio2. 2586 */ 2587 if (wbio2 && !wbio2->bi_end_io) 2588 wbio2 = NULL; 2589 if (wbio->bi_end_io) { 2590 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2591 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2592 submit_bio_noacct(wbio); 2593 } 2594 if (wbio2) { 2595 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2596 md_sync_acct(conf->mirrors[d].replacement->bdev, 2597 bio_sectors(wbio2)); 2598 submit_bio_noacct(wbio2); 2599 } 2600 } 2601 2602 /* 2603 * Used by fix_read_error() to decay the per rdev read_errors. 2604 * We halve the read error count for every hour that has elapsed 2605 * since the last recorded read error. 
2606 * 2607 */ 2608 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2609 { 2610 long cur_time_mon; 2611 unsigned long hours_since_last; 2612 unsigned int read_errors = atomic_read(&rdev->read_errors); 2613 2614 cur_time_mon = ktime_get_seconds(); 2615 2616 if (rdev->last_read_error == 0) { 2617 /* first time we've seen a read error */ 2618 rdev->last_read_error = cur_time_mon; 2619 return; 2620 } 2621 2622 hours_since_last = (long)(cur_time_mon - 2623 rdev->last_read_error) / 3600; 2624 2625 rdev->last_read_error = cur_time_mon; 2626 2627 /* 2628 * if hours_since_last is > the number of bits in read_errors 2629 * just set read errors to 0. We do this to avoid 2630 * overflowing the shift of read_errors by hours_since_last. 2631 */ 2632 if (hours_since_last >= 8 * sizeof(read_errors)) 2633 atomic_set(&rdev->read_errors, 0); 2634 else 2635 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2636 } 2637 2638 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2639 int sectors, struct page *page, int rw) 2640 { 2641 sector_t first_bad; 2642 int bad_sectors; 2643 2644 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2645 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) 2646 return -1; 2647 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false)) 2648 /* success */ 2649 return 1; 2650 if (rw == WRITE) { 2651 set_bit(WriteErrorSeen, &rdev->flags); 2652 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2653 set_bit(MD_RECOVERY_NEEDED, 2654 &rdev->mddev->recovery); 2655 } 2656 /* need to record an error - either for the block or the device */ 2657 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2658 md_error(rdev->mddev, rdev); 2659 return 0; 2660 } 2661 2662 /* 2663 * This is a kernel thread which: 2664 * 2665 * 1. Retries failed read operations on working mirrors. 2666 * 2. Updates the raid superblock when problems encounter. 2667 * 3. 
Performs writes following reads for array synchronising. 2668 */ 2669 2670 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2671 { 2672 int sect = 0; /* Offset from r10_bio->sector */ 2673 int sectors = r10_bio->sectors; 2674 struct md_rdev *rdev; 2675 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2676 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2677 2678 /* still own a reference to this rdev, so it cannot 2679 * have been cleared recently. 2680 */ 2681 rdev = conf->mirrors[d].rdev; 2682 2683 if (test_bit(Faulty, &rdev->flags)) 2684 /* drive has already been failed, just ignore any 2685 more fix_read_error() attempts */ 2686 return; 2687 2688 check_decay_read_errors(mddev, rdev); 2689 atomic_inc(&rdev->read_errors); 2690 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2691 pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", 2692 mdname(mddev), rdev->bdev, 2693 atomic_read(&rdev->read_errors), max_read_errors); 2694 pr_notice("md/raid10:%s: %pg: Failing raid device\n", 2695 mdname(mddev), rdev->bdev); 2696 md_error(mddev, rdev); 2697 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2698 return; 2699 } 2700 2701 while(sectors) { 2702 int s = sectors; 2703 int sl = r10_bio->read_slot; 2704 int success = 0; 2705 int start; 2706 2707 if (s > (PAGE_SIZE>>9)) 2708 s = PAGE_SIZE >> 9; 2709 2710 rcu_read_lock(); 2711 do { 2712 sector_t first_bad; 2713 int bad_sectors; 2714 2715 d = r10_bio->devs[sl].devnum; 2716 rdev = rcu_dereference(conf->mirrors[d].rdev); 2717 if (rdev && 2718 test_bit(In_sync, &rdev->flags) && 2719 !test_bit(Faulty, &rdev->flags) && 2720 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2721 &first_bad, &bad_sectors) == 0) { 2722 atomic_inc(&rdev->nr_pending); 2723 rcu_read_unlock(); 2724 success = sync_page_io(rdev, 2725 r10_bio->devs[sl].addr + 2726 sect, 2727 s<<9, 2728 conf->tmppage, 2729 REQ_OP_READ, 0, false); 2730 
rdev_dec_pending(rdev, mddev); 2731 rcu_read_lock(); 2732 if (success) 2733 break; 2734 } 2735 sl++; 2736 if (sl == conf->copies) 2737 sl = 0; 2738 } while (!success && sl != r10_bio->read_slot); 2739 rcu_read_unlock(); 2740 2741 if (!success) { 2742 /* Cannot read from anywhere, just mark the block 2743 * as bad on the first device to discourage future 2744 * reads. 2745 */ 2746 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2747 rdev = conf->mirrors[dn].rdev; 2748 2749 if (!rdev_set_badblocks( 2750 rdev, 2751 r10_bio->devs[r10_bio->read_slot].addr 2752 + sect, 2753 s, 0)) { 2754 md_error(mddev, rdev); 2755 r10_bio->devs[r10_bio->read_slot].bio 2756 = IO_BLOCKED; 2757 } 2758 break; 2759 } 2760 2761 start = sl; 2762 /* write it back and re-read */ 2763 rcu_read_lock(); 2764 while (sl != r10_bio->read_slot) { 2765 if (sl==0) 2766 sl = conf->copies; 2767 sl--; 2768 d = r10_bio->devs[sl].devnum; 2769 rdev = rcu_dereference(conf->mirrors[d].rdev); 2770 if (!rdev || 2771 test_bit(Faulty, &rdev->flags) || 2772 !test_bit(In_sync, &rdev->flags)) 2773 continue; 2774 2775 atomic_inc(&rdev->nr_pending); 2776 rcu_read_unlock(); 2777 if (r10_sync_page_io(rdev, 2778 r10_bio->devs[sl].addr + 2779 sect, 2780 s, conf->tmppage, WRITE) 2781 == 0) { 2782 /* Well, this device is dead */ 2783 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", 2784 mdname(mddev), s, 2785 (unsigned long long)( 2786 sect + 2787 choose_data_offset(r10_bio, 2788 rdev)), 2789 rdev->bdev); 2790 pr_notice("md/raid10:%s: %pg: failing drive\n", 2791 mdname(mddev), 2792 rdev->bdev); 2793 } 2794 rdev_dec_pending(rdev, mddev); 2795 rcu_read_lock(); 2796 } 2797 sl = start; 2798 while (sl != r10_bio->read_slot) { 2799 if (sl==0) 2800 sl = conf->copies; 2801 sl--; 2802 d = r10_bio->devs[sl].devnum; 2803 rdev = rcu_dereference(conf->mirrors[d].rdev); 2804 if (!rdev || 2805 test_bit(Faulty, &rdev->flags) || 2806 !test_bit(In_sync, &rdev->flags)) 2807 continue; 2808 2809 
atomic_inc(&rdev->nr_pending); 2810 rcu_read_unlock(); 2811 switch (r10_sync_page_io(rdev, 2812 r10_bio->devs[sl].addr + 2813 sect, 2814 s, conf->tmppage, 2815 READ)) { 2816 case 0: 2817 /* Well, this device is dead */ 2818 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", 2819 mdname(mddev), s, 2820 (unsigned long long)( 2821 sect + 2822 choose_data_offset(r10_bio, rdev)), 2823 rdev->bdev); 2824 pr_notice("md/raid10:%s: %pg: failing drive\n", 2825 mdname(mddev), 2826 rdev->bdev); 2827 break; 2828 case 1: 2829 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", 2830 mdname(mddev), s, 2831 (unsigned long long)( 2832 sect + 2833 choose_data_offset(r10_bio, rdev)), 2834 rdev->bdev); 2835 atomic_add(s, &rdev->corrected_errors); 2836 } 2837 2838 rdev_dec_pending(rdev, mddev); 2839 rcu_read_lock(); 2840 } 2841 rcu_read_unlock(); 2842 2843 sectors -= s; 2844 sect += s; 2845 } 2846 } 2847 2848 static int narrow_write_error(struct r10bio *r10_bio, int i) 2849 { 2850 struct bio *bio = r10_bio->master_bio; 2851 struct mddev *mddev = r10_bio->mddev; 2852 struct r10conf *conf = mddev->private; 2853 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2854 /* bio has the data to be written to slot 'i' where 2855 * we just recently had a write error. 2856 * We repeatedly clone the bio and trim down to one block, 2857 * then try the write. Where the write fails we record 2858 * a bad block. 2859 * It is conceivable that the bio doesn't exactly align with 2860 * blocks. We must handle this. 2861 * 2862 * We currently own a reference to the rdev. 
2863 */ 2864 2865 int block_sectors; 2866 sector_t sector; 2867 int sectors; 2868 int sect_to_write = r10_bio->sectors; 2869 int ok = 1; 2870 2871 if (rdev->badblocks.shift < 0) 2872 return 0; 2873 2874 block_sectors = roundup(1 << rdev->badblocks.shift, 2875 bdev_logical_block_size(rdev->bdev) >> 9); 2876 sector = r10_bio->sector; 2877 sectors = ((r10_bio->sector + block_sectors) 2878 & ~(sector_t)(block_sectors - 1)) 2879 - sector; 2880 2881 while (sect_to_write) { 2882 struct bio *wbio; 2883 sector_t wsector; 2884 if (sectors > sect_to_write) 2885 sectors = sect_to_write; 2886 /* Write at 'sector' for 'sectors' */ 2887 wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 2888 &mddev->bio_set); 2889 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); 2890 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); 2891 wbio->bi_iter.bi_sector = wsector + 2892 choose_data_offset(r10_bio, rdev); 2893 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2894 2895 if (submit_bio_wait(wbio) < 0) 2896 /* Failure! */ 2897 ok = rdev_set_badblocks(rdev, wsector, 2898 sectors, 0) 2899 && ok; 2900 2901 bio_put(wbio); 2902 sect_to_write -= sectors; 2903 sector += sectors; 2904 sectors = block_sectors; 2905 } 2906 return ok; 2907 } 2908 2909 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2910 { 2911 int slot = r10_bio->read_slot; 2912 struct bio *bio; 2913 struct r10conf *conf = mddev->private; 2914 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2915 2916 /* we got a read error. Maybe the drive is bad. Maybe just 2917 * the block and we can fix it. 2918 * We freeze all other IO, and try reading the block from 2919 * other devices. When we find one, we re-write 2920 * and check it that fixes the read error. 2921 * This is all done synchronously while the array is 2922 * frozen. 
2923 */ 2924 bio = r10_bio->devs[slot].bio; 2925 bio_put(bio); 2926 r10_bio->devs[slot].bio = NULL; 2927 2928 if (mddev->ro) 2929 r10_bio->devs[slot].bio = IO_BLOCKED; 2930 else if (!test_bit(FailFast, &rdev->flags)) { 2931 freeze_array(conf, 1); 2932 fix_read_error(conf, mddev, r10_bio); 2933 unfreeze_array(conf); 2934 } else 2935 md_error(mddev, rdev); 2936 2937 rdev_dec_pending(rdev, mddev); 2938 allow_barrier(conf); 2939 r10_bio->state = 0; 2940 raid10_read_request(mddev, r10_bio->master_bio, r10_bio); 2941 } 2942 2943 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2944 { 2945 /* Some sort of write request has finished and it 2946 * succeeded in writing where we thought there was a 2947 * bad block. So forget the bad block. 2948 * Or possibly if failed and we need to record 2949 * a bad block. 2950 */ 2951 int m; 2952 struct md_rdev *rdev; 2953 2954 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2955 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2956 for (m = 0; m < conf->copies; m++) { 2957 int dev = r10_bio->devs[m].devnum; 2958 rdev = conf->mirrors[dev].rdev; 2959 if (r10_bio->devs[m].bio == NULL || 2960 r10_bio->devs[m].bio->bi_end_io == NULL) 2961 continue; 2962 if (!r10_bio->devs[m].bio->bi_status) { 2963 rdev_clear_badblocks( 2964 rdev, 2965 r10_bio->devs[m].addr, 2966 r10_bio->sectors, 0); 2967 } else { 2968 if (!rdev_set_badblocks( 2969 rdev, 2970 r10_bio->devs[m].addr, 2971 r10_bio->sectors, 0)) 2972 md_error(conf->mddev, rdev); 2973 } 2974 rdev = conf->mirrors[dev].replacement; 2975 if (r10_bio->devs[m].repl_bio == NULL || 2976 r10_bio->devs[m].repl_bio->bi_end_io == NULL) 2977 continue; 2978 2979 if (!r10_bio->devs[m].repl_bio->bi_status) { 2980 rdev_clear_badblocks( 2981 rdev, 2982 r10_bio->devs[m].addr, 2983 r10_bio->sectors, 0); 2984 } else { 2985 if (!rdev_set_badblocks( 2986 rdev, 2987 r10_bio->devs[m].addr, 2988 r10_bio->sectors, 0)) 2989 md_error(conf->mddev, rdev); 2990 } 2991 } 2992 put_buf(r10_bio); 2993 
} else { 2994 bool fail = false; 2995 for (m = 0; m < conf->copies; m++) { 2996 int dev = r10_bio->devs[m].devnum; 2997 struct bio *bio = r10_bio->devs[m].bio; 2998 rdev = conf->mirrors[dev].rdev; 2999 if (bio == IO_MADE_GOOD) { 3000 rdev_clear_badblocks( 3001 rdev, 3002 r10_bio->devs[m].addr, 3003 r10_bio->sectors, 0); 3004 rdev_dec_pending(rdev, conf->mddev); 3005 } else if (bio != NULL && bio->bi_status) { 3006 fail = true; 3007 if (!narrow_write_error(r10_bio, m)) { 3008 md_error(conf->mddev, rdev); 3009 set_bit(R10BIO_Degraded, 3010 &r10_bio->state); 3011 } 3012 rdev_dec_pending(rdev, conf->mddev); 3013 } 3014 bio = r10_bio->devs[m].repl_bio; 3015 rdev = conf->mirrors[dev].replacement; 3016 if (rdev && bio == IO_MADE_GOOD) { 3017 rdev_clear_badblocks( 3018 rdev, 3019 r10_bio->devs[m].addr, 3020 r10_bio->sectors, 0); 3021 rdev_dec_pending(rdev, conf->mddev); 3022 } 3023 } 3024 if (fail) { 3025 spin_lock_irq(&conf->device_lock); 3026 list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 3027 conf->nr_queued++; 3028 spin_unlock_irq(&conf->device_lock); 3029 /* 3030 * In case freeze_array() is waiting for condition 3031 * nr_pending == nr_queued + extra to be true. 
3032 */ 3033 wake_up(&conf->wait_barrier); 3034 md_wakeup_thread(conf->mddev->thread); 3035 } else { 3036 if (test_bit(R10BIO_WriteError, 3037 &r10_bio->state)) 3038 close_write(r10_bio); 3039 raid_end_bio_io(r10_bio); 3040 } 3041 } 3042 } 3043 3044 static void raid10d(struct md_thread *thread) 3045 { 3046 struct mddev *mddev = thread->mddev; 3047 struct r10bio *r10_bio; 3048 unsigned long flags; 3049 struct r10conf *conf = mddev->private; 3050 struct list_head *head = &conf->retry_list; 3051 struct blk_plug plug; 3052 3053 md_check_recovery(mddev); 3054 3055 if (!list_empty_careful(&conf->bio_end_io_list) && 3056 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3057 LIST_HEAD(tmp); 3058 spin_lock_irqsave(&conf->device_lock, flags); 3059 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3060 while (!list_empty(&conf->bio_end_io_list)) { 3061 list_move(conf->bio_end_io_list.prev, &tmp); 3062 conf->nr_queued--; 3063 } 3064 } 3065 spin_unlock_irqrestore(&conf->device_lock, flags); 3066 while (!list_empty(&tmp)) { 3067 r10_bio = list_first_entry(&tmp, struct r10bio, 3068 retry_list); 3069 list_del(&r10_bio->retry_list); 3070 if (mddev->degraded) 3071 set_bit(R10BIO_Degraded, &r10_bio->state); 3072 3073 if (test_bit(R10BIO_WriteError, 3074 &r10_bio->state)) 3075 close_write(r10_bio); 3076 raid_end_bio_io(r10_bio); 3077 } 3078 } 3079 3080 blk_start_plug(&plug); 3081 for (;;) { 3082 3083 flush_pending_writes(conf); 3084 3085 spin_lock_irqsave(&conf->device_lock, flags); 3086 if (list_empty(head)) { 3087 spin_unlock_irqrestore(&conf->device_lock, flags); 3088 break; 3089 } 3090 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 3091 list_del(head->prev); 3092 conf->nr_queued--; 3093 spin_unlock_irqrestore(&conf->device_lock, flags); 3094 3095 mddev = r10_bio->mddev; 3096 conf = mddev->private; 3097 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 3098 test_bit(R10BIO_WriteError, &r10_bio->state)) 3099 handle_write_completed(conf, r10_bio); 3100 else 
if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 3101 reshape_request_write(mddev, r10_bio); 3102 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 3103 sync_request_write(mddev, r10_bio); 3104 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 3105 recovery_request_write(mddev, r10_bio); 3106 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 3107 handle_read_error(mddev, r10_bio); 3108 else 3109 WARN_ON_ONCE(1); 3110 3111 cond_resched(); 3112 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 3113 md_check_recovery(mddev); 3114 } 3115 blk_finish_plug(&plug); 3116 } 3117 3118 static int init_resync(struct r10conf *conf) 3119 { 3120 int ret, buffs, i; 3121 3122 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 3123 BUG_ON(mempool_initialized(&conf->r10buf_pool)); 3124 conf->have_replacement = 0; 3125 for (i = 0; i < conf->geo.raid_disks; i++) 3126 if (conf->mirrors[i].replacement) 3127 conf->have_replacement = 1; 3128 ret = mempool_init(&conf->r10buf_pool, buffs, 3129 r10buf_pool_alloc, r10buf_pool_free, conf); 3130 if (ret) 3131 return ret; 3132 conf->next_resync = 0; 3133 return 0; 3134 } 3135 3136 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) 3137 { 3138 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); 3139 struct rsync_pages *rp; 3140 struct bio *bio; 3141 int nalloc; 3142 int i; 3143 3144 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 3145 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 3146 nalloc = conf->copies; /* resync */ 3147 else 3148 nalloc = 2; /* recovery */ 3149 3150 for (i = 0; i < nalloc; i++) { 3151 bio = r10bio->devs[i].bio; 3152 rp = bio->bi_private; 3153 bio_reset(bio, NULL, 0); 3154 bio->bi_private = rp; 3155 bio = r10bio->devs[i].repl_bio; 3156 if (bio) { 3157 rp = bio->bi_private; 3158 bio_reset(bio, NULL, 0); 3159 bio->bi_private = rp; 3160 } 3161 } 3162 return r10bio; 3163 } 3164 3165 /* 3166 * Set cluster_sync_high since we need other nodes to add the 3167 * range 
[cluster_sync_low, cluster_sync_high] to suspend list.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
	int stripe_chunks;
	sector_t window;

	/*
	 * A "stripe" here is one pass across all member devices, which
	 * takes raid_disks / near_copies chunks.  Sizing the window this
	 * way keeps it from growing linearly with raid_disks when
	 * near_copies is close to raid_disks, which would suspend a far
	 * larger IO window than necessary.  When raid_disks is not a
	 * multiple of near_copies one more chunk is added so that the
	 * whole "stripe" is covered.
	 */
	stripe_chunks = conf->geo.raid_disks / conf->geo.near_copies;
	if (conf->geo.raid_disks % conf->geo.near_copies != 0)
		stripe_chunks += 1;

	window = stripe_chunks * conf->mddev->chunk_sectors;

	/*
	 * At least use a 32M window to align with raid1's resync window
	 */
	if (CLUSTER_RESYNC_WINDOW_SECTORS > window)
		window = CLUSTER_RESYNC_WINDOW_SECTORS;

	conf->cluster_sync_high = conf->cluster_sync_low + window;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
3217 * For recovery, we iterate over physical addresses, read a good 3218 * value for each non-in_sync drive, and over-write. 3219 * 3220 * So, for recovery we may have several outstanding complex requests for a 3221 * given address, one for each out-of-sync device. We model this by allocating 3222 * a number of r10_bio structures, one for each out-of-sync device. 3223 * As we setup these structures, we collect all bio's together into a list 3224 * which we then process collectively to add pages, and then process again 3225 * to pass to submit_bio_noacct. 3226 * 3227 * The r10_bio structures are linked using a borrowed master_bio pointer. 3228 * This link is counted in ->remaining. When the r10_bio that points to NULL 3229 * has its remaining count decremented to 0, the whole complex operation 3230 * is complete. 3231 * 3232 */ 3233 3234 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, 3235 int *skipped) 3236 { 3237 struct r10conf *conf = mddev->private; 3238 struct r10bio *r10_bio; 3239 struct bio *biolist = NULL, *bio; 3240 sector_t max_sector, nr_sectors; 3241 int i; 3242 int max_sync; 3243 sector_t sync_blocks; 3244 sector_t sectors_skipped = 0; 3245 int chunks_skipped = 0; 3246 sector_t chunk_mask = conf->geo.chunk_mask; 3247 int page_idx = 0; 3248 3249 if (!mempool_initialized(&conf->r10buf_pool)) 3250 if (init_resync(conf)) 3251 return 0; 3252 3253 /* 3254 * Allow skipping a full rebuild for incremental assembly 3255 * of a clean array, like RAID1 does. 
3256 */ 3257 if (mddev->bitmap == NULL && 3258 mddev->recovery_cp == MaxSector && 3259 mddev->reshape_position == MaxSector && 3260 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 3261 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 3262 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 3263 conf->fullsync == 0) { 3264 *skipped = 1; 3265 return mddev->dev_sectors - sector_nr; 3266 } 3267 3268 skipped: 3269 max_sector = mddev->dev_sectors; 3270 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 3271 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3272 max_sector = mddev->resync_max_sectors; 3273 if (sector_nr >= max_sector) { 3274 conf->cluster_sync_low = 0; 3275 conf->cluster_sync_high = 0; 3276 3277 /* If we aborted, we need to abort the 3278 * sync on the 'current' bitmap chucks (there can 3279 * be several when recovering multiple devices). 3280 * as we may have started syncing it but not finished. 3281 * We can find the current address in 3282 * mddev->curr_resync, but for recovery, 3283 * we need to convert that to several 3284 * virtual addresses. 3285 */ 3286 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3287 end_reshape(conf); 3288 close_sync(conf); 3289 return 0; 3290 } 3291 3292 if (mddev->curr_resync < max_sector) { /* aborted */ 3293 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3294 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3295 &sync_blocks, 1); 3296 else for (i = 0; i < conf->geo.raid_disks; i++) { 3297 sector_t sect = 3298 raid10_find_virt(conf, mddev->curr_resync, i); 3299 md_bitmap_end_sync(mddev->bitmap, sect, 3300 &sync_blocks, 1); 3301 } 3302 } else { 3303 /* completed sync */ 3304 if ((!mddev->bitmap || conf->fullsync) 3305 && conf->have_replacement 3306 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3307 /* Completed a full sync so the replacements 3308 * are now fully recovered. 
3309 */ 3310 rcu_read_lock(); 3311 for (i = 0; i < conf->geo.raid_disks; i++) { 3312 struct md_rdev *rdev = 3313 rcu_dereference(conf->mirrors[i].replacement); 3314 if (rdev) 3315 rdev->recovery_offset = MaxSector; 3316 } 3317 rcu_read_unlock(); 3318 } 3319 conf->fullsync = 0; 3320 } 3321 md_bitmap_close_sync(mddev->bitmap); 3322 close_sync(conf); 3323 *skipped = 1; 3324 return sectors_skipped; 3325 } 3326 3327 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3328 return reshape_request(mddev, sector_nr, skipped); 3329 3330 if (chunks_skipped >= conf->geo.raid_disks) { 3331 /* if there has been nothing to do on any drive, 3332 * then there is nothing to do at all.. 3333 */ 3334 *skipped = 1; 3335 return (max_sector - sector_nr) + sectors_skipped; 3336 } 3337 3338 if (max_sector > mddev->resync_max) 3339 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 3340 3341 /* make sure whole request will fit in a chunk - if chunks 3342 * are meaningful 3343 */ 3344 if (conf->geo.near_copies < conf->geo.raid_disks && 3345 max_sector > (sector_nr | chunk_mask)) 3346 max_sector = (sector_nr | chunk_mask) + 1; 3347 3348 /* 3349 * If there is non-resync activity waiting for a turn, then let it 3350 * though before starting on this new sync request. 3351 */ 3352 if (conf->nr_waiting) 3353 schedule_timeout_uninterruptible(1); 3354 3355 /* Again, very different code for resync and recovery. 3356 * Both must result in an r10bio with a list of bios that 3357 * have bi_end_io, bi_sector, bi_bdev set, 3358 * and bi_private set to the r10bio. 3359 * For recovery, we may actually create several r10bios 3360 * with 2 bios in each, that correspond to the bios in the main one. 3361 * In this case, the subordinate r10bios link back through a 3362 * borrowed master_bio pointer, and the counter in the master 3363 * includes a ref from each subordinate. 
3364 */ 3365 /* First, we decide what to do and set ->bi_end_io 3366 * To end_sync_read if we want to read, and 3367 * end_sync_write if we will want to write. 3368 */ 3369 3370 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 3371 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3372 /* recovery... the complicated one */ 3373 int j; 3374 r10_bio = NULL; 3375 3376 for (i = 0 ; i < conf->geo.raid_disks; i++) { 3377 int still_degraded; 3378 struct r10bio *rb2; 3379 sector_t sect; 3380 int must_sync; 3381 int any_working; 3382 int need_recover = 0; 3383 int need_replace = 0; 3384 struct raid10_info *mirror = &conf->mirrors[i]; 3385 struct md_rdev *mrdev, *mreplace; 3386 3387 rcu_read_lock(); 3388 mrdev = rcu_dereference(mirror->rdev); 3389 mreplace = rcu_dereference(mirror->replacement); 3390 3391 if (mrdev != NULL && 3392 !test_bit(Faulty, &mrdev->flags) && 3393 !test_bit(In_sync, &mrdev->flags)) 3394 need_recover = 1; 3395 if (mreplace != NULL && 3396 !test_bit(Faulty, &mreplace->flags)) 3397 need_replace = 1; 3398 3399 if (!need_recover && !need_replace) { 3400 rcu_read_unlock(); 3401 continue; 3402 } 3403 3404 still_degraded = 0; 3405 /* want to reconstruct this device */ 3406 rb2 = r10_bio; 3407 sect = raid10_find_virt(conf, sector_nr, i); 3408 if (sect >= mddev->resync_max_sectors) { 3409 /* last stripe is not complete - don't 3410 * try to recover this sector. 
3411 */ 3412 rcu_read_unlock(); 3413 continue; 3414 } 3415 if (mreplace && test_bit(Faulty, &mreplace->flags)) 3416 mreplace = NULL; 3417 /* Unless we are doing a full sync, or a replacement 3418 * we only need to recover the block if it is set in 3419 * the bitmap 3420 */ 3421 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3422 &sync_blocks, 1); 3423 if (sync_blocks < max_sync) 3424 max_sync = sync_blocks; 3425 if (!must_sync && 3426 mreplace == NULL && 3427 !conf->fullsync) { 3428 /* yep, skip the sync_blocks here, but don't assume 3429 * that there will never be anything to do here 3430 */ 3431 chunks_skipped = -1; 3432 rcu_read_unlock(); 3433 continue; 3434 } 3435 atomic_inc(&mrdev->nr_pending); 3436 if (mreplace) 3437 atomic_inc(&mreplace->nr_pending); 3438 rcu_read_unlock(); 3439 3440 r10_bio = raid10_alloc_init_r10buf(conf); 3441 r10_bio->state = 0; 3442 raise_barrier(conf, rb2 != NULL); 3443 atomic_set(&r10_bio->remaining, 0); 3444 3445 r10_bio->master_bio = (struct bio*)rb2; 3446 if (rb2) 3447 atomic_inc(&rb2->remaining); 3448 r10_bio->mddev = mddev; 3449 set_bit(R10BIO_IsRecover, &r10_bio->state); 3450 r10_bio->sector = sect; 3451 3452 raid10_find_phys(conf, r10_bio); 3453 3454 /* Need to check if the array will still be 3455 * degraded 3456 */ 3457 rcu_read_lock(); 3458 for (j = 0; j < conf->geo.raid_disks; j++) { 3459 struct md_rdev *rdev = rcu_dereference( 3460 conf->mirrors[j].rdev); 3461 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3462 still_degraded = 1; 3463 break; 3464 } 3465 } 3466 3467 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3468 &sync_blocks, still_degraded); 3469 3470 any_working = 0; 3471 for (j=0; j<conf->copies;j++) { 3472 int k; 3473 int d = r10_bio->devs[j].devnum; 3474 sector_t from_addr, to_addr; 3475 struct md_rdev *rdev = 3476 rcu_dereference(conf->mirrors[d].rdev); 3477 sector_t sector, first_bad; 3478 int bad_sectors; 3479 if (!rdev || 3480 !test_bit(In_sync, &rdev->flags)) 3481 continue; 3482 /* This 
is where we read from */ 3483 any_working = 1; 3484 sector = r10_bio->devs[j].addr; 3485 3486 if (is_badblock(rdev, sector, max_sync, 3487 &first_bad, &bad_sectors)) { 3488 if (first_bad > sector) 3489 max_sync = first_bad - sector; 3490 else { 3491 bad_sectors -= (sector 3492 - first_bad); 3493 if (max_sync > bad_sectors) 3494 max_sync = bad_sectors; 3495 continue; 3496 } 3497 } 3498 bio = r10_bio->devs[0].bio; 3499 bio->bi_next = biolist; 3500 biolist = bio; 3501 bio->bi_end_io = end_sync_read; 3502 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3503 if (test_bit(FailFast, &rdev->flags)) 3504 bio->bi_opf |= MD_FAILFAST; 3505 from_addr = r10_bio->devs[j].addr; 3506 bio->bi_iter.bi_sector = from_addr + 3507 rdev->data_offset; 3508 bio_set_dev(bio, rdev->bdev); 3509 atomic_inc(&rdev->nr_pending); 3510 /* and we write to 'i' (if not in_sync) */ 3511 3512 for (k=0; k<conf->copies; k++) 3513 if (r10_bio->devs[k].devnum == i) 3514 break; 3515 BUG_ON(k == conf->copies); 3516 to_addr = r10_bio->devs[k].addr; 3517 r10_bio->devs[0].devnum = d; 3518 r10_bio->devs[0].addr = from_addr; 3519 r10_bio->devs[1].devnum = i; 3520 r10_bio->devs[1].addr = to_addr; 3521 3522 if (need_recover) { 3523 bio = r10_bio->devs[1].bio; 3524 bio->bi_next = biolist; 3525 biolist = bio; 3526 bio->bi_end_io = end_sync_write; 3527 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3528 bio->bi_iter.bi_sector = to_addr 3529 + mrdev->data_offset; 3530 bio_set_dev(bio, mrdev->bdev); 3531 atomic_inc(&r10_bio->remaining); 3532 } else 3533 r10_bio->devs[1].bio->bi_end_io = NULL; 3534 3535 /* and maybe write to replacement */ 3536 bio = r10_bio->devs[1].repl_bio; 3537 if (bio) 3538 bio->bi_end_io = NULL; 3539 /* Note: if need_replace, then bio 3540 * cannot be NULL as r10buf_pool_alloc will 3541 * have allocated it. 
3542 */ 3543 if (!need_replace) 3544 break; 3545 bio->bi_next = biolist; 3546 biolist = bio; 3547 bio->bi_end_io = end_sync_write; 3548 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3549 bio->bi_iter.bi_sector = to_addr + 3550 mreplace->data_offset; 3551 bio_set_dev(bio, mreplace->bdev); 3552 atomic_inc(&r10_bio->remaining); 3553 break; 3554 } 3555 rcu_read_unlock(); 3556 if (j == conf->copies) { 3557 /* Cannot recover, so abort the recovery or 3558 * record a bad block */ 3559 if (any_working) { 3560 /* problem is that there are bad blocks 3561 * on other device(s) 3562 */ 3563 int k; 3564 for (k = 0; k < conf->copies; k++) 3565 if (r10_bio->devs[k].devnum == i) 3566 break; 3567 if (!test_bit(In_sync, 3568 &mrdev->flags) 3569 && !rdev_set_badblocks( 3570 mrdev, 3571 r10_bio->devs[k].addr, 3572 max_sync, 0)) 3573 any_working = 0; 3574 if (mreplace && 3575 !rdev_set_badblocks( 3576 mreplace, 3577 r10_bio->devs[k].addr, 3578 max_sync, 0)) 3579 any_working = 0; 3580 } 3581 if (!any_working) { 3582 if (!test_and_set_bit(MD_RECOVERY_INTR, 3583 &mddev->recovery)) 3584 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", 3585 mdname(mddev)); 3586 mirror->recovery_disabled 3587 = mddev->recovery_disabled; 3588 } 3589 put_buf(r10_bio); 3590 if (rb2) 3591 atomic_dec(&rb2->remaining); 3592 r10_bio = rb2; 3593 rdev_dec_pending(mrdev, mddev); 3594 if (mreplace) 3595 rdev_dec_pending(mreplace, mddev); 3596 break; 3597 } 3598 rdev_dec_pending(mrdev, mddev); 3599 if (mreplace) 3600 rdev_dec_pending(mreplace, mddev); 3601 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { 3602 /* Only want this if there is elsewhere to 3603 * read from. 'j' is currently the first 3604 * readable copy. 
3605 */ 3606 int targets = 1; 3607 for (; j < conf->copies; j++) { 3608 int d = r10_bio->devs[j].devnum; 3609 if (conf->mirrors[d].rdev && 3610 test_bit(In_sync, 3611 &conf->mirrors[d].rdev->flags)) 3612 targets++; 3613 } 3614 if (targets == 1) 3615 r10_bio->devs[0].bio->bi_opf 3616 &= ~MD_FAILFAST; 3617 } 3618 } 3619 if (biolist == NULL) { 3620 while (r10_bio) { 3621 struct r10bio *rb2 = r10_bio; 3622 r10_bio = (struct r10bio*) rb2->master_bio; 3623 rb2->master_bio = NULL; 3624 put_buf(rb2); 3625 } 3626 goto giveup; 3627 } 3628 } else { 3629 /* resync. Schedule a read for every block at this virt offset */ 3630 int count = 0; 3631 3632 /* 3633 * Since curr_resync_completed could probably not update in 3634 * time, and we will set cluster_sync_low based on it. 3635 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for 3636 * safety reason, which ensures curr_resync_completed is 3637 * updated in bitmap_cond_end_sync. 3638 */ 3639 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 3640 mddev_is_clustered(mddev) && 3641 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 3642 3643 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 3644 &sync_blocks, mddev->degraded) && 3645 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3646 &mddev->recovery)) { 3647 /* We can skip this block */ 3648 *skipped = 1; 3649 return sync_blocks + sectors_skipped; 3650 } 3651 if (sync_blocks < max_sync) 3652 max_sync = sync_blocks; 3653 r10_bio = raid10_alloc_init_r10buf(conf); 3654 r10_bio->state = 0; 3655 3656 r10_bio->mddev = mddev; 3657 atomic_set(&r10_bio->remaining, 0); 3658 raise_barrier(conf, 0); 3659 conf->next_resync = sector_nr; 3660 3661 r10_bio->master_bio = NULL; 3662 r10_bio->sector = sector_nr; 3663 set_bit(R10BIO_IsSync, &r10_bio->state); 3664 raid10_find_phys(conf, r10_bio); 3665 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3666 3667 for (i = 0; i < conf->copies; i++) { 3668 int d = r10_bio->devs[i].devnum; 3669 sector_t first_bad, sector; 
3670 int bad_sectors; 3671 struct md_rdev *rdev; 3672 3673 if (r10_bio->devs[i].repl_bio) 3674 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3675 3676 bio = r10_bio->devs[i].bio; 3677 bio->bi_status = BLK_STS_IOERR; 3678 rcu_read_lock(); 3679 rdev = rcu_dereference(conf->mirrors[d].rdev); 3680 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3681 rcu_read_unlock(); 3682 continue; 3683 } 3684 sector = r10_bio->devs[i].addr; 3685 if (is_badblock(rdev, sector, max_sync, 3686 &first_bad, &bad_sectors)) { 3687 if (first_bad > sector) 3688 max_sync = first_bad - sector; 3689 else { 3690 bad_sectors -= (sector - first_bad); 3691 if (max_sync > bad_sectors) 3692 max_sync = bad_sectors; 3693 rcu_read_unlock(); 3694 continue; 3695 } 3696 } 3697 atomic_inc(&rdev->nr_pending); 3698 atomic_inc(&r10_bio->remaining); 3699 bio->bi_next = biolist; 3700 biolist = bio; 3701 bio->bi_end_io = end_sync_read; 3702 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3703 if (test_bit(FailFast, &rdev->flags)) 3704 bio->bi_opf |= MD_FAILFAST; 3705 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3706 bio_set_dev(bio, rdev->bdev); 3707 count++; 3708 3709 rdev = rcu_dereference(conf->mirrors[d].replacement); 3710 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3711 rcu_read_unlock(); 3712 continue; 3713 } 3714 atomic_inc(&rdev->nr_pending); 3715 3716 /* Need to set up for writing to the replacement */ 3717 bio = r10_bio->devs[i].repl_bio; 3718 bio->bi_status = BLK_STS_IOERR; 3719 3720 sector = r10_bio->devs[i].addr; 3721 bio->bi_next = biolist; 3722 biolist = bio; 3723 bio->bi_end_io = end_sync_write; 3724 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3725 if (test_bit(FailFast, &rdev->flags)) 3726 bio->bi_opf |= MD_FAILFAST; 3727 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3728 bio_set_dev(bio, rdev->bdev); 3729 count++; 3730 rcu_read_unlock(); 3731 } 3732 3733 if (count < 2) { 3734 for (i=0; i<conf->copies; i++) { 3735 int d = r10_bio->devs[i].devnum; 3736 if 
(r10_bio->devs[i].bio->bi_end_io) 3737 rdev_dec_pending(conf->mirrors[d].rdev, 3738 mddev); 3739 if (r10_bio->devs[i].repl_bio && 3740 r10_bio->devs[i].repl_bio->bi_end_io) 3741 rdev_dec_pending( 3742 conf->mirrors[d].replacement, 3743 mddev); 3744 } 3745 put_buf(r10_bio); 3746 biolist = NULL; 3747 goto giveup; 3748 } 3749 } 3750 3751 nr_sectors = 0; 3752 if (sector_nr + max_sync < max_sector) 3753 max_sector = sector_nr + max_sync; 3754 do { 3755 struct page *page; 3756 int len = PAGE_SIZE; 3757 if (sector_nr + (len>>9) > max_sector) 3758 len = (max_sector - sector_nr) << 9; 3759 if (len == 0) 3760 break; 3761 for (bio= biolist ; bio ; bio=bio->bi_next) { 3762 struct resync_pages *rp = get_resync_pages(bio); 3763 page = resync_fetch_page(rp, page_idx); 3764 /* 3765 * won't fail because the vec table is big enough 3766 * to hold all these pages 3767 */ 3768 bio_add_page(bio, page, len, 0); 3769 } 3770 nr_sectors += len>>9; 3771 sector_nr += len>>9; 3772 } while (++page_idx < RESYNC_PAGES); 3773 r10_bio->sectors = nr_sectors; 3774 3775 if (mddev_is_clustered(mddev) && 3776 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3777 /* It is resync not recovery */ 3778 if (conf->cluster_sync_high < sector_nr + nr_sectors) { 3779 conf->cluster_sync_low = mddev->curr_resync_completed; 3780 raid10_set_cluster_sync_high(conf); 3781 /* Send resync message */ 3782 md_cluster_ops->resync_info_update(mddev, 3783 conf->cluster_sync_low, 3784 conf->cluster_sync_high); 3785 } 3786 } else if (mddev_is_clustered(mddev)) { 3787 /* This is recovery not resync */ 3788 sector_t sect_va1, sect_va2; 3789 bool broadcast_msg = false; 3790 3791 for (i = 0; i < conf->geo.raid_disks; i++) { 3792 /* 3793 * sector_nr is a device address for recovery, so we 3794 * need translate it to array address before compare 3795 * with cluster_sync_high. 
3796 */ 3797 sect_va1 = raid10_find_virt(conf, sector_nr, i); 3798 3799 if (conf->cluster_sync_high < sect_va1 + nr_sectors) { 3800 broadcast_msg = true; 3801 /* 3802 * curr_resync_completed is similar as 3803 * sector_nr, so make the translation too. 3804 */ 3805 sect_va2 = raid10_find_virt(conf, 3806 mddev->curr_resync_completed, i); 3807 3808 if (conf->cluster_sync_low == 0 || 3809 conf->cluster_sync_low > sect_va2) 3810 conf->cluster_sync_low = sect_va2; 3811 } 3812 } 3813 if (broadcast_msg) { 3814 raid10_set_cluster_sync_high(conf); 3815 md_cluster_ops->resync_info_update(mddev, 3816 conf->cluster_sync_low, 3817 conf->cluster_sync_high); 3818 } 3819 } 3820 3821 while (biolist) { 3822 bio = biolist; 3823 biolist = biolist->bi_next; 3824 3825 bio->bi_next = NULL; 3826 r10_bio = get_resync_r10bio(bio); 3827 r10_bio->sectors = nr_sectors; 3828 3829 if (bio->bi_end_io == end_sync_read) { 3830 md_sync_acct_bio(bio, nr_sectors); 3831 bio->bi_status = 0; 3832 submit_bio_noacct(bio); 3833 } 3834 } 3835 3836 if (sectors_skipped) 3837 /* pretend they weren't skipped, it makes 3838 * no important difference in this case 3839 */ 3840 md_done_sync(mddev, sectors_skipped, 1); 3841 3842 return sectors_skipped + nr_sectors; 3843 giveup: 3844 /* There is nowhere to write, so all non-sync 3845 * drives must be failed or in resync, all drives 3846 * have a bad block, so try the next chunk... 
3847 */ 3848 if (sector_nr + max_sync < max_sector) 3849 max_sector = sector_nr + max_sync; 3850 3851 sectors_skipped += (max_sector - sector_nr); 3852 chunks_skipped ++; 3853 sector_nr = max_sector; 3854 goto skipped; 3855 } 3856 3857 static sector_t 3858 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 3859 { 3860 sector_t size; 3861 struct r10conf *conf = mddev->private; 3862 3863 if (!raid_disks) 3864 raid_disks = min(conf->geo.raid_disks, 3865 conf->prev.raid_disks); 3866 if (!sectors) 3867 sectors = conf->dev_sectors; 3868 3869 size = sectors >> conf->geo.chunk_shift; 3870 sector_div(size, conf->geo.far_copies); 3871 size = size * raid_disks; 3872 sector_div(size, conf->geo.near_copies); 3873 3874 return size << conf->geo.chunk_shift; 3875 } 3876 3877 static void calc_sectors(struct r10conf *conf, sector_t size) 3878 { 3879 /* Calculate the number of sectors-per-device that will 3880 * actually be used, and set conf->dev_sectors and 3881 * conf->stride 3882 */ 3883 3884 size = size >> conf->geo.chunk_shift; 3885 sector_div(size, conf->geo.far_copies); 3886 size = size * conf->geo.raid_disks; 3887 sector_div(size, conf->geo.near_copies); 3888 /* 'size' is now the number of chunks in the array */ 3889 /* calculate "used chunks per device" */ 3890 size = size * conf->copies; 3891 3892 /* We need to round up when dividing by raid_disks to 3893 * get the stride size. 
3894 */ 3895 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3896 3897 conf->dev_sectors = size << conf->geo.chunk_shift; 3898 3899 if (conf->geo.far_offset) 3900 conf->geo.stride = 1 << conf->geo.chunk_shift; 3901 else { 3902 sector_div(size, conf->geo.far_copies); 3903 conf->geo.stride = size << conf->geo.chunk_shift; 3904 } 3905 } 3906 3907 enum geo_type {geo_new, geo_old, geo_start}; 3908 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3909 { 3910 int nc, fc, fo; 3911 int layout, chunk, disks; 3912 switch (new) { 3913 case geo_old: 3914 layout = mddev->layout; 3915 chunk = mddev->chunk_sectors; 3916 disks = mddev->raid_disks - mddev->delta_disks; 3917 break; 3918 case geo_new: 3919 layout = mddev->new_layout; 3920 chunk = mddev->new_chunk_sectors; 3921 disks = mddev->raid_disks; 3922 break; 3923 default: /* avoid 'may be unused' warnings */ 3924 case geo_start: /* new when starting reshape - raid_disks not 3925 * updated yet. */ 3926 layout = mddev->new_layout; 3927 chunk = mddev->new_chunk_sectors; 3928 disks = mddev->raid_disks + mddev->delta_disks; 3929 break; 3930 } 3931 if (layout >> 19) 3932 return -1; 3933 if (chunk < (PAGE_SIZE >> 9) || 3934 !is_power_of_2(chunk)) 3935 return -2; 3936 nc = layout & 255; 3937 fc = (layout >> 8) & 255; 3938 fo = layout & (1<<16); 3939 geo->raid_disks = disks; 3940 geo->near_copies = nc; 3941 geo->far_copies = fc; 3942 geo->far_offset = fo; 3943 switch (layout >> 17) { 3944 case 0: /* original layout. simple but not always optimal */ 3945 geo->far_set_size = disks; 3946 break; 3947 case 1: /* "improved" layout which was buggy. 
Hopefully no-one is 3948 * actually using this, but leave code here just in case.*/ 3949 geo->far_set_size = disks/fc; 3950 WARN(geo->far_set_size < fc, 3951 "This RAID10 layout does not provide data safety - please backup and create new array\n"); 3952 break; 3953 case 2: /* "improved" layout fixed to match documentation */ 3954 geo->far_set_size = fc * nc; 3955 break; 3956 default: /* Not a valid layout */ 3957 return -1; 3958 } 3959 geo->chunk_mask = chunk - 1; 3960 geo->chunk_shift = ffz(~chunk); 3961 return nc*fc; 3962 } 3963 3964 static struct r10conf *setup_conf(struct mddev *mddev) 3965 { 3966 struct r10conf *conf = NULL; 3967 int err = -EINVAL; 3968 struct geom geo; 3969 int copies; 3970 3971 copies = setup_geo(&geo, mddev, geo_new); 3972 3973 if (copies == -2) { 3974 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", 3975 mdname(mddev), PAGE_SIZE); 3976 goto out; 3977 } 3978 3979 if (copies < 2 || copies > mddev->raid_disks) { 3980 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3981 mdname(mddev), mddev->new_layout); 3982 goto out; 3983 } 3984 3985 err = -ENOMEM; 3986 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 3987 if (!conf) 3988 goto out; 3989 3990 /* FIXME calc properly */ 3991 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), 3992 sizeof(struct raid10_info), 3993 GFP_KERNEL); 3994 if (!conf->mirrors) 3995 goto out; 3996 3997 conf->tmppage = alloc_page(GFP_KERNEL); 3998 if (!conf->tmppage) 3999 goto out; 4000 4001 conf->geo = geo; 4002 conf->copies = copies; 4003 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, 4004 rbio_pool_free, conf); 4005 if (err) 4006 goto out; 4007 4008 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 4009 if (err) 4010 goto out; 4011 4012 calc_sectors(conf, mddev->dev_sectors); 4013 if (mddev->reshape_position == MaxSector) { 4014 conf->prev = conf->geo; 4015 conf->reshape_progress = MaxSector; 4016 } else { 4017 
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 4018 err = -EINVAL; 4019 goto out; 4020 } 4021 conf->reshape_progress = mddev->reshape_position; 4022 if (conf->prev.far_offset) 4023 conf->prev.stride = 1 << conf->prev.chunk_shift; 4024 else 4025 /* far_copies must be 1 */ 4026 conf->prev.stride = conf->dev_sectors; 4027 } 4028 conf->reshape_safe = conf->reshape_progress; 4029 spin_lock_init(&conf->device_lock); 4030 INIT_LIST_HEAD(&conf->retry_list); 4031 INIT_LIST_HEAD(&conf->bio_end_io_list); 4032 4033 spin_lock_init(&conf->resync_lock); 4034 init_waitqueue_head(&conf->wait_barrier); 4035 atomic_set(&conf->nr_pending, 0); 4036 4037 err = -ENOMEM; 4038 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 4039 if (!conf->thread) 4040 goto out; 4041 4042 conf->mddev = mddev; 4043 return conf; 4044 4045 out: 4046 if (conf) { 4047 mempool_exit(&conf->r10bio_pool); 4048 kfree(conf->mirrors); 4049 safe_put_page(conf->tmppage); 4050 bioset_exit(&conf->bio_split); 4051 kfree(conf); 4052 } 4053 return ERR_PTR(err); 4054 } 4055 4056 static void raid10_set_io_opt(struct r10conf *conf) 4057 { 4058 int raid_disks = conf->geo.raid_disks; 4059 4060 if (!(conf->geo.raid_disks % conf->geo.near_copies)) 4061 raid_disks /= conf->geo.near_copies; 4062 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 4063 raid_disks); 4064 } 4065 4066 static int raid10_run(struct mddev *mddev) 4067 { 4068 struct r10conf *conf; 4069 int i, disk_idx; 4070 struct raid10_info *disk; 4071 struct md_rdev *rdev; 4072 sector_t size; 4073 sector_t min_offset_diff = 0; 4074 int first = 1; 4075 4076 if (mddev_init_writes_pending(mddev) < 0) 4077 return -ENOMEM; 4078 4079 if (mddev->private == NULL) { 4080 conf = setup_conf(mddev); 4081 if (IS_ERR(conf)) 4082 return PTR_ERR(conf); 4083 mddev->private = conf; 4084 } 4085 conf = mddev->private; 4086 if (!conf) 4087 goto out; 4088 4089 if (mddev_is_clustered(conf->mddev)) { 4090 int fc, fo; 4091 4092 fc = (mddev->layout 
>> 8) & 255; 4093 fo = mddev->layout & (1<<16); 4094 if (fc > 1 || fo > 0) { 4095 pr_err("only near layout is supported by clustered" 4096 " raid10\n"); 4097 goto out_free_conf; 4098 } 4099 } 4100 4101 mddev->thread = conf->thread; 4102 conf->thread = NULL; 4103 4104 if (mddev->queue) { 4105 blk_queue_max_discard_sectors(mddev->queue, 4106 UINT_MAX); 4107 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 4108 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 4109 raid10_set_io_opt(conf); 4110 } 4111 4112 rdev_for_each(rdev, mddev) { 4113 long long diff; 4114 4115 disk_idx = rdev->raid_disk; 4116 if (disk_idx < 0) 4117 continue; 4118 if (disk_idx >= conf->geo.raid_disks && 4119 disk_idx >= conf->prev.raid_disks) 4120 continue; 4121 disk = conf->mirrors + disk_idx; 4122 4123 if (test_bit(Replacement, &rdev->flags)) { 4124 if (disk->replacement) 4125 goto out_free_conf; 4126 disk->replacement = rdev; 4127 } else { 4128 if (disk->rdev) 4129 goto out_free_conf; 4130 disk->rdev = rdev; 4131 } 4132 diff = (rdev->new_data_offset - rdev->data_offset); 4133 if (!mddev->reshape_backwards) 4134 diff = -diff; 4135 if (diff < 0) 4136 diff = 0; 4137 if (first || diff < min_offset_diff) 4138 min_offset_diff = diff; 4139 4140 if (mddev->gendisk) 4141 disk_stack_limits(mddev->gendisk, rdev->bdev, 4142 rdev->data_offset << 9); 4143 4144 disk->head_position = 0; 4145 first = 0; 4146 } 4147 4148 /* need to check that every block has at least one working mirror */ 4149 if (!enough(conf, -1)) { 4150 pr_err("md/raid10:%s: not enough operational mirrors.\n", 4151 mdname(mddev)); 4152 goto out_free_conf; 4153 } 4154 4155 if (conf->reshape_progress != MaxSector) { 4156 /* must ensure that shape change is supported */ 4157 if (conf->geo.far_copies != 1 && 4158 conf->geo.far_offset == 0) 4159 goto out_free_conf; 4160 if (conf->prev.far_copies != 1 && 4161 conf->prev.far_offset == 0) 4162 goto out_free_conf; 4163 } 4164 4165 mddev->degraded = 0; 4166 for (i = 0; 4167 i < 
conf->geo.raid_disks 4168 || i < conf->prev.raid_disks; 4169 i++) { 4170 4171 disk = conf->mirrors + i; 4172 4173 if (!disk->rdev && disk->replacement) { 4174 /* The replacement is all we have - use it */ 4175 disk->rdev = disk->replacement; 4176 disk->replacement = NULL; 4177 clear_bit(Replacement, &disk->rdev->flags); 4178 } 4179 4180 if (!disk->rdev || 4181 !test_bit(In_sync, &disk->rdev->flags)) { 4182 disk->head_position = 0; 4183 mddev->degraded++; 4184 if (disk->rdev && 4185 disk->rdev->saved_raid_disk < 0) 4186 conf->fullsync = 1; 4187 } 4188 4189 if (disk->replacement && 4190 !test_bit(In_sync, &disk->replacement->flags) && 4191 disk->replacement->saved_raid_disk < 0) { 4192 conf->fullsync = 1; 4193 } 4194 4195 disk->recovery_disabled = mddev->recovery_disabled - 1; 4196 } 4197 4198 if (mddev->recovery_cp != MaxSector) 4199 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", 4200 mdname(mddev)); 4201 pr_info("md/raid10:%s: active with %d out of %d devices\n", 4202 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 4203 conf->geo.raid_disks); 4204 /* 4205 * Ok, everything is just fine now 4206 */ 4207 mddev->dev_sectors = conf->dev_sectors; 4208 size = raid10_size(mddev, 0, 0); 4209 md_set_array_sectors(mddev, size); 4210 mddev->resync_max_sectors = size; 4211 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 4212 4213 if (md_integrity_register(mddev)) 4214 goto out_free_conf; 4215 4216 if (conf->reshape_progress != MaxSector) { 4217 unsigned long before_length, after_length; 4218 4219 before_length = ((1 << conf->prev.chunk_shift) * 4220 conf->prev.far_copies); 4221 after_length = ((1 << conf->geo.chunk_shift) * 4222 conf->geo.far_copies); 4223 4224 if (max(before_length, after_length) > min_offset_diff) { 4225 /* This cannot work */ 4226 pr_warn("md/raid10: offset difference not enough to continue reshape\n"); 4227 goto out_free_conf; 4228 } 4229 conf->offset_diff = min_offset_diff; 4230 4231 clear_bit(MD_RECOVERY_SYNC, 
&mddev->recovery); 4232 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4233 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4234 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4235 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4236 "reshape"); 4237 if (!mddev->sync_thread) 4238 goto out_free_conf; 4239 } 4240 4241 return 0; 4242 4243 out_free_conf: 4244 md_unregister_thread(&mddev->thread); 4245 mempool_exit(&conf->r10bio_pool); 4246 safe_put_page(conf->tmppage); 4247 kfree(conf->mirrors); 4248 kfree(conf); 4249 mddev->private = NULL; 4250 out: 4251 return -EIO; 4252 } 4253 4254 static void raid10_free(struct mddev *mddev, void *priv) 4255 { 4256 struct r10conf *conf = priv; 4257 4258 mempool_exit(&conf->r10bio_pool); 4259 safe_put_page(conf->tmppage); 4260 kfree(conf->mirrors); 4261 kfree(conf->mirrors_old); 4262 kfree(conf->mirrors_new); 4263 bioset_exit(&conf->bio_split); 4264 kfree(conf); 4265 } 4266 4267 static void raid10_quiesce(struct mddev *mddev, int quiesce) 4268 { 4269 struct r10conf *conf = mddev->private; 4270 4271 if (quiesce) 4272 raise_barrier(conf, 0); 4273 else 4274 lower_barrier(conf); 4275 } 4276 4277 static int raid10_resize(struct mddev *mddev, sector_t sectors) 4278 { 4279 /* Resize of 'far' arrays is not supported. 4280 * For 'near' and 'offset' arrays we can set the 4281 * number of sectors used to be an appropriate multiple 4282 * of the chunk size. 4283 * For 'offset', this is far_copies*chunksize. 4284 * For 'near' the multiplier is the LCM of 4285 * near_copies and raid_disks. 4286 * So if far_copies > 1 && !far_offset, fail. 4287 * Else find LCM(raid_disks, near_copy)*far_copies and 4288 * multiply by chunk_size. Then round to this number. 
4289 * This is mostly done by raid10_size() 4290 */ 4291 struct r10conf *conf = mddev->private; 4292 sector_t oldsize, size; 4293 4294 if (mddev->reshape_position != MaxSector) 4295 return -EBUSY; 4296 4297 if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 4298 return -EINVAL; 4299 4300 oldsize = raid10_size(mddev, 0, 0); 4301 size = raid10_size(mddev, sectors, 0); 4302 if (mddev->external_size && 4303 mddev->array_sectors > size) 4304 return -EINVAL; 4305 if (mddev->bitmap) { 4306 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); 4307 if (ret) 4308 return ret; 4309 } 4310 md_set_array_sectors(mddev, size); 4311 if (sectors > mddev->dev_sectors && 4312 mddev->recovery_cp > oldsize) { 4313 mddev->recovery_cp = oldsize; 4314 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4315 } 4316 calc_sectors(conf, sectors); 4317 mddev->dev_sectors = conf->dev_sectors; 4318 mddev->resync_max_sectors = size; 4319 return 0; 4320 } 4321 4322 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) 4323 { 4324 struct md_rdev *rdev; 4325 struct r10conf *conf; 4326 4327 if (mddev->degraded > 0) { 4328 pr_warn("md/raid10:%s: Error: degraded raid0!\n", 4329 mdname(mddev)); 4330 return ERR_PTR(-EINVAL); 4331 } 4332 sector_div(size, devs); 4333 4334 /* Set new parameters */ 4335 mddev->new_level = 10; 4336 /* new layout: far_copies = 1, near_copies = 2 */ 4337 mddev->new_layout = (1<<8) + 2; 4338 mddev->new_chunk_sectors = mddev->chunk_sectors; 4339 mddev->delta_disks = mddev->raid_disks; 4340 mddev->raid_disks *= 2; 4341 /* make sure it will be not marked as dirty */ 4342 mddev->recovery_cp = MaxSector; 4343 mddev->dev_sectors = size; 4344 4345 conf = setup_conf(mddev); 4346 if (!IS_ERR(conf)) { 4347 rdev_for_each(rdev, mddev) 4348 if (rdev->raid_disk >= 0) { 4349 rdev->new_raid_disk = rdev->raid_disk * 2; 4350 rdev->sectors = size; 4351 } 4352 conf->barrier = 1; 4353 } 4354 4355 return conf; 4356 } 4357 4358 static void *raid10_takeover(struct mddev 
*mddev) 4359 { 4360 struct r0conf *raid0_conf; 4361 4362 /* raid10 can take over: 4363 * raid0 - providing it has only two drives 4364 */ 4365 if (mddev->level == 0) { 4366 /* for raid0 takeover only one zone is supported */ 4367 raid0_conf = mddev->private; 4368 if (raid0_conf->nr_strip_zones > 1) { 4369 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", 4370 mdname(mddev)); 4371 return ERR_PTR(-EINVAL); 4372 } 4373 return raid10_takeover_raid0(mddev, 4374 raid0_conf->strip_zone->zone_end, 4375 raid0_conf->strip_zone->nb_dev); 4376 } 4377 return ERR_PTR(-EINVAL); 4378 } 4379 4380 static int raid10_check_reshape(struct mddev *mddev) 4381 { 4382 /* Called when there is a request to change 4383 * - layout (to ->new_layout) 4384 * - chunk size (to ->new_chunk_sectors) 4385 * - raid_disks (by delta_disks) 4386 * or when trying to restart a reshape that was ongoing. 4387 * 4388 * We need to validate the request and possibly allocate 4389 * space if that might be an issue later. 4390 * 4391 * Currently we reject any reshape of a 'far' mode array, 4392 * allow chunk size to change if new is generally acceptable, 4393 * allow raid_disks to increase, and allow 4394 * a switch between 'near' mode and 'offset' mode. 
4395 */ 4396 struct r10conf *conf = mddev->private; 4397 struct geom geo; 4398 4399 if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 4400 return -EINVAL; 4401 4402 if (setup_geo(&geo, mddev, geo_start) != conf->copies) 4403 /* mustn't change number of copies */ 4404 return -EINVAL; 4405 if (geo.far_copies > 1 && !geo.far_offset) 4406 /* Cannot switch to 'far' mode */ 4407 return -EINVAL; 4408 4409 if (mddev->array_sectors & geo.chunk_mask) 4410 /* not factor of array size */ 4411 return -EINVAL; 4412 4413 if (!enough(conf, -1)) 4414 return -EINVAL; 4415 4416 kfree(conf->mirrors_new); 4417 conf->mirrors_new = NULL; 4418 if (mddev->delta_disks > 0) { 4419 /* allocate new 'mirrors' list */ 4420 conf->mirrors_new = 4421 kcalloc(mddev->raid_disks + mddev->delta_disks, 4422 sizeof(struct raid10_info), 4423 GFP_KERNEL); 4424 if (!conf->mirrors_new) 4425 return -ENOMEM; 4426 } 4427 return 0; 4428 } 4429 4430 /* 4431 * Need to check if array has failed when deciding whether to: 4432 * - start an array 4433 * - remove non-faulty devices 4434 * - add a spare 4435 * - allow a reshape 4436 * This determination is simple when no reshape is happening. 4437 * However if there is a reshape, we need to carefully check 4438 * both the before and after sections. 4439 * This is because some failed devices may only affect one 4440 * of the two sections, and some non-in_sync devices may 4441 * be insync in the section most affected by failed devices. 4442 */ 4443 static int calc_degraded(struct r10conf *conf) 4444 { 4445 int degraded, degraded2; 4446 int i; 4447 4448 rcu_read_lock(); 4449 degraded = 0; 4450 /* 'prev' section first */ 4451 for (i = 0; i < conf->prev.raid_disks; i++) { 4452 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4453 if (!rdev || test_bit(Faulty, &rdev->flags)) 4454 degraded++; 4455 else if (!test_bit(In_sync, &rdev->flags)) 4456 /* When we can reduce the number of devices in 4457 * an array, this might not contribute to 4458 * 'degraded'. 
It does now. 4459 */ 4460 degraded++; 4461 } 4462 rcu_read_unlock(); 4463 if (conf->geo.raid_disks == conf->prev.raid_disks) 4464 return degraded; 4465 rcu_read_lock(); 4466 degraded2 = 0; 4467 for (i = 0; i < conf->geo.raid_disks; i++) { 4468 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4469 if (!rdev || test_bit(Faulty, &rdev->flags)) 4470 degraded2++; 4471 else if (!test_bit(In_sync, &rdev->flags)) { 4472 /* If reshape is increasing the number of devices, 4473 * this section has already been recovered, so 4474 * it doesn't contribute to degraded. 4475 * else it does. 4476 */ 4477 if (conf->geo.raid_disks <= conf->prev.raid_disks) 4478 degraded2++; 4479 } 4480 } 4481 rcu_read_unlock(); 4482 if (degraded2 > degraded) 4483 return degraded2; 4484 return degraded; 4485 } 4486 4487 static int raid10_start_reshape(struct mddev *mddev) 4488 { 4489 /* A 'reshape' has been requested. This commits 4490 * the various 'new' fields and sets MD_RECOVER_RESHAPE 4491 * This also checks if there are enough spares and adds them 4492 * to the array. 4493 * We currently require enough spares to make the final 4494 * array non-degraded. We also require that the difference 4495 * between old and new data_offset - on each device - is 4496 * enough that we never risk over-writing. 
4497 */ 4498 4499 unsigned long before_length, after_length; 4500 sector_t min_offset_diff = 0; 4501 int first = 1; 4502 struct geom new; 4503 struct r10conf *conf = mddev->private; 4504 struct md_rdev *rdev; 4505 int spares = 0; 4506 int ret; 4507 4508 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4509 return -EBUSY; 4510 4511 if (setup_geo(&new, mddev, geo_start) != conf->copies) 4512 return -EINVAL; 4513 4514 before_length = ((1 << conf->prev.chunk_shift) * 4515 conf->prev.far_copies); 4516 after_length = ((1 << conf->geo.chunk_shift) * 4517 conf->geo.far_copies); 4518 4519 rdev_for_each(rdev, mddev) { 4520 if (!test_bit(In_sync, &rdev->flags) 4521 && !test_bit(Faulty, &rdev->flags)) 4522 spares++; 4523 if (rdev->raid_disk >= 0) { 4524 long long diff = (rdev->new_data_offset 4525 - rdev->data_offset); 4526 if (!mddev->reshape_backwards) 4527 diff = -diff; 4528 if (diff < 0) 4529 diff = 0; 4530 if (first || diff < min_offset_diff) 4531 min_offset_diff = diff; 4532 first = 0; 4533 } 4534 } 4535 4536 if (max(before_length, after_length) > min_offset_diff) 4537 return -EINVAL; 4538 4539 if (spares < mddev->delta_disks) 4540 return -EINVAL; 4541 4542 conf->offset_diff = min_offset_diff; 4543 spin_lock_irq(&conf->device_lock); 4544 if (conf->mirrors_new) { 4545 memcpy(conf->mirrors_new, conf->mirrors, 4546 sizeof(struct raid10_info)*conf->prev.raid_disks); 4547 smp_mb(); 4548 kfree(conf->mirrors_old); 4549 conf->mirrors_old = conf->mirrors; 4550 conf->mirrors = conf->mirrors_new; 4551 conf->mirrors_new = NULL; 4552 } 4553 setup_geo(&conf->geo, mddev, geo_start); 4554 smp_mb(); 4555 if (mddev->reshape_backwards) { 4556 sector_t size = raid10_size(mddev, 0, 0); 4557 if (size < mddev->array_sectors) { 4558 spin_unlock_irq(&conf->device_lock); 4559 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", 4560 mdname(mddev)); 4561 return -EINVAL; 4562 } 4563 mddev->resync_max_sectors = size; 4564 conf->reshape_progress = size; 4565 } else 4566 
conf->reshape_progress = 0; 4567 conf->reshape_safe = conf->reshape_progress; 4568 spin_unlock_irq(&conf->device_lock); 4569 4570 if (mddev->delta_disks && mddev->bitmap) { 4571 struct mdp_superblock_1 *sb = NULL; 4572 sector_t oldsize, newsize; 4573 4574 oldsize = raid10_size(mddev, 0, 0); 4575 newsize = raid10_size(mddev, 0, conf->geo.raid_disks); 4576 4577 if (!mddev_is_clustered(mddev)) { 4578 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4579 if (ret) 4580 goto abort; 4581 else 4582 goto out; 4583 } 4584 4585 rdev_for_each(rdev, mddev) { 4586 if (rdev->raid_disk > -1 && 4587 !test_bit(Faulty, &rdev->flags)) 4588 sb = page_address(rdev->sb_page); 4589 } 4590 4591 /* 4592 * some node is already performing reshape, and no need to 4593 * call md_bitmap_resize again since it should be called when 4594 * receiving BITMAP_RESIZE msg 4595 */ 4596 if ((sb && (le32_to_cpu(sb->feature_map) & 4597 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) 4598 goto out; 4599 4600 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4601 if (ret) 4602 goto abort; 4603 4604 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); 4605 if (ret) { 4606 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); 4607 goto abort; 4608 } 4609 } 4610 out: 4611 if (mddev->delta_disks > 0) { 4612 rdev_for_each(rdev, mddev) 4613 if (rdev->raid_disk < 0 && 4614 !test_bit(Faulty, &rdev->flags)) { 4615 if (raid10_add_disk(mddev, rdev) == 0) { 4616 if (rdev->raid_disk >= 4617 conf->prev.raid_disks) 4618 set_bit(In_sync, &rdev->flags); 4619 else 4620 rdev->recovery_offset = 0; 4621 4622 /* Failure here is OK */ 4623 sysfs_link_rdev(mddev, rdev); 4624 } 4625 } else if (rdev->raid_disk >= conf->prev.raid_disks 4626 && !test_bit(Faulty, &rdev->flags)) { 4627 /* This is a spare that was manually added */ 4628 set_bit(In_sync, &rdev->flags); 4629 } 4630 } 4631 /* When a reshape changes the number of devices, 4632 * ->degraded is measured against the larger of the 4633 * pre and post numbers. 
4634 */ 4635 spin_lock_irq(&conf->device_lock); 4636 mddev->degraded = calc_degraded(conf); 4637 spin_unlock_irq(&conf->device_lock); 4638 mddev->raid_disks = conf->geo.raid_disks; 4639 mddev->reshape_position = conf->reshape_progress; 4640 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4641 4642 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4643 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4644 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4645 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4646 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4647 4648 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4649 "reshape"); 4650 if (!mddev->sync_thread) { 4651 ret = -EAGAIN; 4652 goto abort; 4653 } 4654 conf->reshape_checkpoint = jiffies; 4655 md_wakeup_thread(mddev->sync_thread); 4656 md_new_event(); 4657 return 0; 4658 4659 abort: 4660 mddev->recovery = 0; 4661 spin_lock_irq(&conf->device_lock); 4662 conf->geo = conf->prev; 4663 mddev->raid_disks = conf->geo.raid_disks; 4664 rdev_for_each(rdev, mddev) 4665 rdev->new_data_offset = rdev->data_offset; 4666 smp_wmb(); 4667 conf->reshape_progress = MaxSector; 4668 conf->reshape_safe = MaxSector; 4669 mddev->reshape_position = MaxSector; 4670 spin_unlock_irq(&conf->device_lock); 4671 return ret; 4672 } 4673 4674 /* Calculate the last device-address that could contain 4675 * any block from the chunk that includes the array-address 's' 4676 * and report the next address. 4677 * i.e. the address returned will be chunk-aligned and after 4678 * any data that is in the chunk containing 's'. 
4679 */ 4680 static sector_t last_dev_address(sector_t s, struct geom *geo) 4681 { 4682 s = (s | geo->chunk_mask) + 1; 4683 s >>= geo->chunk_shift; 4684 s *= geo->near_copies; 4685 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4686 s *= geo->far_copies; 4687 s <<= geo->chunk_shift; 4688 return s; 4689 } 4690 4691 /* Calculate the first device-address that could contain 4692 * any block from the chunk that includes the array-address 's'. 4693 * This too will be the start of a chunk 4694 */ 4695 static sector_t first_dev_address(sector_t s, struct geom *geo) 4696 { 4697 s >>= geo->chunk_shift; 4698 s *= geo->near_copies; 4699 sector_div(s, geo->raid_disks); 4700 s *= geo->far_copies; 4701 s <<= geo->chunk_shift; 4702 return s; 4703 } 4704 4705 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4706 int *skipped) 4707 { 4708 /* We simply copy at most one chunk (smallest of old and new) 4709 * at a time, possibly less if that exceeds RESYNC_PAGES, 4710 * or we hit a bad block or something. 4711 * This might mean we pause for normal IO in the middle of 4712 * a chunk, but that is not a problem as mddev->reshape_position 4713 * can record any location. 4714 * 4715 * If we will want to write to a location that isn't 4716 * yet recorded as 'safe' (i.e. in metadata on disk) then 4717 * we need to flush all reshape requests and update the metadata. 4718 * 4719 * When reshaping forwards (e.g. to more devices), we interpret 4720 * 'safe' as the earliest block which might not have been copied 4721 * down yet. We divide this by previous stripe size and multiply 4722 * by previous stripe length to get lowest device offset that we 4723 * cannot write to yet. 4724 * We interpret 'sector_nr' as an address that we want to write to. 4725 * From this we use last_device_address() to find where we might 4726 * write to, and first_device_address on the 'safe' position. 
4727 * If this 'next' write position is after the 'safe' position, 4728 * we must update the metadata to increase the 'safe' position. 4729 * 4730 * When reshaping backwards, we round in the opposite direction 4731 * and perform the reverse test: next write position must not be 4732 * less than current safe position. 4733 * 4734 * In all this the minimum difference in data offsets 4735 * (conf->offset_diff - always positive) allows a bit of slack, 4736 * so next can be after 'safe', but not by more than offset_diff 4737 * 4738 * We need to prepare all the bios here before we start any IO 4739 * to ensure the size we choose is acceptable to all devices. 4740 * The means one for each copy for write-out and an extra one for 4741 * read-in. 4742 * We store the read-in bio in ->master_bio and the others in 4743 * ->devs[x].bio and ->devs[x].repl_bio. 4744 */ 4745 struct r10conf *conf = mddev->private; 4746 struct r10bio *r10_bio; 4747 sector_t next, safe, last; 4748 int max_sectors; 4749 int nr_sectors; 4750 int s; 4751 struct md_rdev *rdev; 4752 int need_flush = 0; 4753 struct bio *blist; 4754 struct bio *bio, *read_bio; 4755 int sectors_done = 0; 4756 struct page **pages; 4757 4758 if (sector_nr == 0) { 4759 /* If restarting in the middle, skip the initial sectors */ 4760 if (mddev->reshape_backwards && 4761 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4762 sector_nr = (raid10_size(mddev, 0, 0) 4763 - conf->reshape_progress); 4764 } else if (!mddev->reshape_backwards && 4765 conf->reshape_progress > 0) 4766 sector_nr = conf->reshape_progress; 4767 if (sector_nr) { 4768 mddev->curr_resync_completed = sector_nr; 4769 sysfs_notify_dirent_safe(mddev->sysfs_completed); 4770 *skipped = 1; 4771 return sector_nr; 4772 } 4773 } 4774 4775 /* We don't use sector_nr to track where we are up to 4776 * as that doesn't work well for ->reshape_backwards. 4777 * So just use ->reshape_progress. 
4778 */ 4779 if (mddev->reshape_backwards) { 4780 /* 'next' is the earliest device address that we might 4781 * write to for this chunk in the new layout 4782 */ 4783 next = first_dev_address(conf->reshape_progress - 1, 4784 &conf->geo); 4785 4786 /* 'safe' is the last device address that we might read from 4787 * in the old layout after a restart 4788 */ 4789 safe = last_dev_address(conf->reshape_safe - 1, 4790 &conf->prev); 4791 4792 if (next + conf->offset_diff < safe) 4793 need_flush = 1; 4794 4795 last = conf->reshape_progress - 1; 4796 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4797 & conf->prev.chunk_mask); 4798 if (sector_nr + RESYNC_SECTORS < last) 4799 sector_nr = last + 1 - RESYNC_SECTORS; 4800 } else { 4801 /* 'next' is after the last device address that we 4802 * might write to for this chunk in the new layout 4803 */ 4804 next = last_dev_address(conf->reshape_progress, &conf->geo); 4805 4806 /* 'safe' is the earliest device address that we might 4807 * read from in the old layout after a restart 4808 */ 4809 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4810 4811 /* Need to update metadata if 'next' might be beyond 'safe' 4812 * as that would possibly corrupt data 4813 */ 4814 if (next > safe + conf->offset_diff) 4815 need_flush = 1; 4816 4817 sector_nr = conf->reshape_progress; 4818 last = sector_nr | (conf->geo.chunk_mask 4819 & conf->prev.chunk_mask); 4820 4821 if (sector_nr + RESYNC_SECTORS <= last) 4822 last = sector_nr + RESYNC_SECTORS - 1; 4823 } 4824 4825 if (need_flush || 4826 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4827 /* Need to update reshape_position in metadata */ 4828 wait_barrier(conf, false); 4829 mddev->reshape_position = conf->reshape_progress; 4830 if (mddev->reshape_backwards) 4831 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4832 - conf->reshape_progress; 4833 else 4834 mddev->curr_resync_completed = conf->reshape_progress; 4835 conf->reshape_checkpoint = jiffies; 4836 
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4837 md_wakeup_thread(mddev->thread); 4838 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 4839 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4840 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4841 allow_barrier(conf); 4842 return sectors_done; 4843 } 4844 conf->reshape_safe = mddev->reshape_position; 4845 allow_barrier(conf); 4846 } 4847 4848 raise_barrier(conf, 0); 4849 read_more: 4850 /* Now schedule reads for blocks from sector_nr to last */ 4851 r10_bio = raid10_alloc_init_r10buf(conf); 4852 r10_bio->state = 0; 4853 raise_barrier(conf, 1); 4854 atomic_set(&r10_bio->remaining, 0); 4855 r10_bio->mddev = mddev; 4856 r10_bio->sector = sector_nr; 4857 set_bit(R10BIO_IsReshape, &r10_bio->state); 4858 r10_bio->sectors = last - sector_nr + 1; 4859 rdev = read_balance(conf, r10_bio, &max_sectors); 4860 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4861 4862 if (!rdev) { 4863 /* Cannot read from here, so need to record bad blocks 4864 * on all the target devices. 4865 */ 4866 // FIXME 4867 mempool_free(r10_bio, &conf->r10buf_pool); 4868 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4869 return sectors_done; 4870 } 4871 4872 read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, 4873 GFP_KERNEL, &mddev->bio_set); 4874 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4875 + rdev->data_offset); 4876 read_bio->bi_private = r10_bio; 4877 read_bio->bi_end_io = end_reshape_read; 4878 r10_bio->master_bio = read_bio; 4879 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4880 4881 /* 4882 * Broadcast RESYNC message to other nodes, so all nodes would not 4883 * write to the region to avoid conflict. 
4884 */ 4885 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { 4886 struct mdp_superblock_1 *sb = NULL; 4887 int sb_reshape_pos = 0; 4888 4889 conf->cluster_sync_low = sector_nr; 4890 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; 4891 sb = page_address(rdev->sb_page); 4892 if (sb) { 4893 sb_reshape_pos = le64_to_cpu(sb->reshape_position); 4894 /* 4895 * Set cluster_sync_low again if next address for array 4896 * reshape is less than cluster_sync_low. Since we can't 4897 * update cluster_sync_low until it has finished reshape. 4898 */ 4899 if (sb_reshape_pos < conf->cluster_sync_low) 4900 conf->cluster_sync_low = sb_reshape_pos; 4901 } 4902 4903 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, 4904 conf->cluster_sync_high); 4905 } 4906 4907 /* Now find the locations in the new layout */ 4908 __raid10_find_phys(&conf->geo, r10_bio); 4909 4910 blist = read_bio; 4911 read_bio->bi_next = NULL; 4912 4913 rcu_read_lock(); 4914 for (s = 0; s < conf->copies*2; s++) { 4915 struct bio *b; 4916 int d = r10_bio->devs[s/2].devnum; 4917 struct md_rdev *rdev2; 4918 if (s&1) { 4919 rdev2 = rcu_dereference(conf->mirrors[d].replacement); 4920 b = r10_bio->devs[s/2].repl_bio; 4921 } else { 4922 rdev2 = rcu_dereference(conf->mirrors[d].rdev); 4923 b = r10_bio->devs[s/2].bio; 4924 } 4925 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4926 continue; 4927 4928 bio_set_dev(b, rdev2->bdev); 4929 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + 4930 rdev2->new_data_offset; 4931 b->bi_end_io = end_reshape_write; 4932 bio_set_op_attrs(b, REQ_OP_WRITE, 0); 4933 b->bi_next = blist; 4934 blist = b; 4935 } 4936 4937 /* Now add as many pages as possible to all of these bios. 
*/ 4938 4939 nr_sectors = 0; 4940 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 4941 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4942 struct page *page = pages[s / (PAGE_SIZE >> 9)]; 4943 int len = (max_sectors - s) << 9; 4944 if (len > PAGE_SIZE) 4945 len = PAGE_SIZE; 4946 for (bio = blist; bio ; bio = bio->bi_next) { 4947 /* 4948 * won't fail because the vec table is big enough 4949 * to hold all these pages 4950 */ 4951 bio_add_page(bio, page, len, 0); 4952 } 4953 sector_nr += len >> 9; 4954 nr_sectors += len >> 9; 4955 } 4956 rcu_read_unlock(); 4957 r10_bio->sectors = nr_sectors; 4958 4959 /* Now submit the read */ 4960 md_sync_acct_bio(read_bio, r10_bio->sectors); 4961 atomic_inc(&r10_bio->remaining); 4962 read_bio->bi_next = NULL; 4963 submit_bio_noacct(read_bio); 4964 sectors_done += nr_sectors; 4965 if (sector_nr <= last) 4966 goto read_more; 4967 4968 lower_barrier(conf); 4969 4970 /* Now that we have done the whole section we can 4971 * update reshape_progress 4972 */ 4973 if (mddev->reshape_backwards) 4974 conf->reshape_progress -= sectors_done; 4975 else 4976 conf->reshape_progress += sectors_done; 4977 4978 return sectors_done; 4979 } 4980 4981 static void end_reshape_request(struct r10bio *r10_bio); 4982 static int handle_reshape_read_error(struct mddev *mddev, 4983 struct r10bio *r10_bio); 4984 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4985 { 4986 /* Reshape read completed. Hopefully we have a block 4987 * to write out. 4988 * If we got a read error then we do sync 1-page reads from 4989 * elsewhere until we find the data - or give up. 4990 */ 4991 struct r10conf *conf = mddev->private; 4992 int s; 4993 4994 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4995 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4996 /* Reshape has been aborted */ 4997 md_done_sync(mddev, r10_bio->sectors, 0); 4998 return; 4999 } 5000 5001 /* We definitely have the data in the pages, schedule the 5002 * writes. 
5003 */ 5004 atomic_set(&r10_bio->remaining, 1); 5005 for (s = 0; s < conf->copies*2; s++) { 5006 struct bio *b; 5007 int d = r10_bio->devs[s/2].devnum; 5008 struct md_rdev *rdev; 5009 rcu_read_lock(); 5010 if (s&1) { 5011 rdev = rcu_dereference(conf->mirrors[d].replacement); 5012 b = r10_bio->devs[s/2].repl_bio; 5013 } else { 5014 rdev = rcu_dereference(conf->mirrors[d].rdev); 5015 b = r10_bio->devs[s/2].bio; 5016 } 5017 if (!rdev || test_bit(Faulty, &rdev->flags)) { 5018 rcu_read_unlock(); 5019 continue; 5020 } 5021 atomic_inc(&rdev->nr_pending); 5022 rcu_read_unlock(); 5023 md_sync_acct_bio(b, r10_bio->sectors); 5024 atomic_inc(&r10_bio->remaining); 5025 b->bi_next = NULL; 5026 submit_bio_noacct(b); 5027 } 5028 end_reshape_request(r10_bio); 5029 } 5030 5031 static void end_reshape(struct r10conf *conf) 5032 { 5033 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 5034 return; 5035 5036 spin_lock_irq(&conf->device_lock); 5037 conf->prev = conf->geo; 5038 md_finish_reshape(conf->mddev); 5039 smp_wmb(); 5040 conf->reshape_progress = MaxSector; 5041 conf->reshape_safe = MaxSector; 5042 spin_unlock_irq(&conf->device_lock); 5043 5044 if (conf->mddev->queue) 5045 raid10_set_io_opt(conf); 5046 conf->fullsync = 0; 5047 } 5048 5049 static void raid10_update_reshape_pos(struct mddev *mddev) 5050 { 5051 struct r10conf *conf = mddev->private; 5052 sector_t lo, hi; 5053 5054 md_cluster_ops->resync_info_get(mddev, &lo, &hi); 5055 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) 5056 || mddev->reshape_position == MaxSector) 5057 conf->reshape_progress = mddev->reshape_position; 5058 else 5059 WARN_ON_ONCE(1); 5060 } 5061 5062 static int handle_reshape_read_error(struct mddev *mddev, 5063 struct r10bio *r10_bio) 5064 { 5065 /* Use sync reads to get the blocks from somewhere else */ 5066 int sectors = r10_bio->sectors; 5067 struct r10conf *conf = mddev->private; 5068 struct r10bio *r10b; 5069 int slot = 0; 5070 int idx = 0; 5071 struct page **pages; 
5072 5073 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); 5074 if (!r10b) { 5075 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5076 return -ENOMEM; 5077 } 5078 5079 /* reshape IOs share pages from .devs[0].bio */ 5080 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 5081 5082 r10b->sector = r10_bio->sector; 5083 __raid10_find_phys(&conf->prev, r10b); 5084 5085 while (sectors) { 5086 int s = sectors; 5087 int success = 0; 5088 int first_slot = slot; 5089 5090 if (s > (PAGE_SIZE >> 9)) 5091 s = PAGE_SIZE >> 9; 5092 5093 rcu_read_lock(); 5094 while (!success) { 5095 int d = r10b->devs[slot].devnum; 5096 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5097 sector_t addr; 5098 if (rdev == NULL || 5099 test_bit(Faulty, &rdev->flags) || 5100 !test_bit(In_sync, &rdev->flags)) 5101 goto failed; 5102 5103 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 5104 atomic_inc(&rdev->nr_pending); 5105 rcu_read_unlock(); 5106 success = sync_page_io(rdev, 5107 addr, 5108 s << 9, 5109 pages[idx], 5110 REQ_OP_READ, 0, false); 5111 rdev_dec_pending(rdev, mddev); 5112 rcu_read_lock(); 5113 if (success) 5114 break; 5115 failed: 5116 slot++; 5117 if (slot >= conf->copies) 5118 slot = 0; 5119 if (slot == first_slot) 5120 break; 5121 } 5122 rcu_read_unlock(); 5123 if (!success) { 5124 /* couldn't read this block, must give up */ 5125 set_bit(MD_RECOVERY_INTR, 5126 &mddev->recovery); 5127 kfree(r10b); 5128 return -EIO; 5129 } 5130 sectors -= s; 5131 idx++; 5132 } 5133 kfree(r10b); 5134 return 0; 5135 } 5136 5137 static void end_reshape_write(struct bio *bio) 5138 { 5139 struct r10bio *r10_bio = get_resync_r10bio(bio); 5140 struct mddev *mddev = r10_bio->mddev; 5141 struct r10conf *conf = mddev->private; 5142 int d; 5143 int slot; 5144 int repl; 5145 struct md_rdev *rdev = NULL; 5146 5147 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 5148 if (repl) 5149 rdev = conf->mirrors[d].replacement; 5150 if (!rdev) { 5151 smp_mb(); 5152 rdev = 
conf->mirrors[d].rdev; 5153 } 5154 5155 if (bio->bi_status) { 5156 /* FIXME should record badblock */ 5157 md_error(mddev, rdev); 5158 } 5159 5160 rdev_dec_pending(rdev, mddev); 5161 end_reshape_request(r10_bio); 5162 } 5163 5164 static void end_reshape_request(struct r10bio *r10_bio) 5165 { 5166 if (!atomic_dec_and_test(&r10_bio->remaining)) 5167 return; 5168 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 5169 bio_put(r10_bio->master_bio); 5170 put_buf(r10_bio); 5171 } 5172 5173 static void raid10_finish_reshape(struct mddev *mddev) 5174 { 5175 struct r10conf *conf = mddev->private; 5176 5177 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5178 return; 5179 5180 if (mddev->delta_disks > 0) { 5181 if (mddev->recovery_cp > mddev->resync_max_sectors) { 5182 mddev->recovery_cp = mddev->resync_max_sectors; 5183 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5184 } 5185 mddev->resync_max_sectors = mddev->array_sectors; 5186 } else { 5187 int d; 5188 rcu_read_lock(); 5189 for (d = conf->geo.raid_disks ; 5190 d < conf->geo.raid_disks - mddev->delta_disks; 5191 d++) { 5192 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5193 if (rdev) 5194 clear_bit(In_sync, &rdev->flags); 5195 rdev = rcu_dereference(conf->mirrors[d].replacement); 5196 if (rdev) 5197 clear_bit(In_sync, &rdev->flags); 5198 } 5199 rcu_read_unlock(); 5200 } 5201 mddev->layout = mddev->new_layout; 5202 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 5203 mddev->reshape_position = MaxSector; 5204 mddev->delta_disks = 0; 5205 mddev->reshape_backwards = 0; 5206 } 5207 5208 static struct md_personality raid10_personality = 5209 { 5210 .name = "raid10", 5211 .level = 10, 5212 .owner = THIS_MODULE, 5213 .make_request = raid10_make_request, 5214 .run = raid10_run, 5215 .free = raid10_free, 5216 .status = raid10_status, 5217 .error_handler = raid10_error, 5218 .hot_add_disk = raid10_add_disk, 5219 .hot_remove_disk= raid10_remove_disk, 5220 .spare_active = raid10_spare_active, 5221 .sync_request 
/*
 * The raid10 personality: the table of operations registered with the
 * md core for level 10 arrays.
 */
static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

/* Module load: register the raid10 personality with md. */
static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

/* Module unload: remove the raid10 personality again. */
static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");