1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * raid10.c : Multiple Devices driver for Linux 4 * 5 * Copyright (C) 2000-2004 Neil Brown 6 * 7 * RAID-10 support for md. 8 * 9 * Base on code in raid1.c. See raid1.c for further copyright information. 10 */ 11 12 #include <linux/slab.h> 13 #include <linux/delay.h> 14 #include <linux/blkdev.h> 15 #include <linux/module.h> 16 #include <linux/seq_file.h> 17 #include <linux/ratelimit.h> 18 #include <linux/kthread.h> 19 #include <linux/raid/md_p.h> 20 #include <trace/events/block.h> 21 #include "md.h" 22 #include "raid10.h" 23 #include "raid0.h" 24 #include "md-bitmap.h" 25 26 /* 27 * RAID10 provides a combination of RAID0 and RAID1 functionality. 28 * The layout of data is defined by 29 * chunk_size 30 * raid_disks 31 * near_copies (stored in low byte of layout) 32 * far_copies (stored in second byte of layout) 33 * far_offset (stored in bit 16 of layout ) 34 * use_far_sets (stored in bit 17 of layout ) 35 * use_far_sets_bugfixed (stored in bit 18 of layout ) 36 * 37 * The data to be stored is divided into chunks using chunksize. Each device 38 * is divided into far_copies sections. In each section, chunks are laid out 39 * in a style similar to raid0, but near_copies copies of each chunk is stored 40 * (each on a different drive). The starting device for each section is offset 41 * near_copies from the starting device of the previous section. Thus there 42 * are (near_copies * far_copies) of each chunk, and each is on a different 43 * drive. near_copies and far_copies must be at least one, and their product 44 * is at most raid_disks. 45 * 46 * If far_offset is true, then the far_copies are handled a bit differently. 47 * The copies are still in different stripes, but instead of being very far 48 * apart on disk, there are adjacent stripes. 49 * 50 * The far and offset algorithms are handled slightly differently if 51 * 'use_far_sets' is true. 
In this case, the array's devices are grouped into 52 * sets that are (near_copies * far_copies) in size. The far copied stripes 53 * are still shifted by 'near_copies' devices, but this shifting stays confined 54 * to the set rather than the entire array. This is done to improve the number 55 * of device combinations that can fail without causing the array to fail. 56 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk 57 * on a device): 58 * A B C D A B C D E 59 * ... ... 60 * D A B C E A B C D 61 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): 62 * [A B] [C D] [A B] [C D E] 63 * |...| |...| |...| | ... | 64 * [B A] [D C] [B A] [E C D] 65 */ 66 67 static void allow_barrier(struct r10conf *conf); 68 static void lower_barrier(struct r10conf *conf); 69 static int _enough(struct r10conf *conf, int previous, int ignore); 70 static int enough(struct r10conf *conf, int ignore); 71 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 72 int *skipped); 73 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 74 static void end_reshape_write(struct bio *bio); 75 static void end_reshape(struct r10conf *conf); 76 77 #define raid10_log(md, fmt, args...) \ 78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) 79 80 #include "raid1-10.c" 81 82 /* 83 * for resync bio, r10bio pointer can be retrieved from the per-bio 84 * 'struct resync_pages'. 
85 */ 86 static inline struct r10bio *get_resync_r10bio(struct bio *bio) 87 { 88 return get_resync_pages(bio)->raid_bio; 89 } 90 91 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 92 { 93 struct r10conf *conf = data; 94 int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); 95 96 /* allocate a r10bio with room for raid_disks entries in the 97 * bios array */ 98 return kzalloc(size, gfp_flags); 99 } 100 101 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 102 /* amount of memory to reserve for resync requests */ 103 #define RESYNC_WINDOW (1024*1024) 104 /* maximum number of concurrent requests, memory permitting */ 105 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) 106 #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW) 107 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 108 109 /* 110 * When performing a resync, we need to read and compare, so 111 * we need as many pages are there are copies. 112 * When performing a recovery, we need 2 bios, one for read, 113 * one for write (we recover only one drive per r10buf) 114 * 115 */ 116 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 117 { 118 struct r10conf *conf = data; 119 struct r10bio *r10_bio; 120 struct bio *bio; 121 int j; 122 int nalloc, nalloc_rp; 123 struct resync_pages *rps; 124 125 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 126 if (!r10_bio) 127 return NULL; 128 129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 131 nalloc = conf->copies; /* resync */ 132 else 133 nalloc = 2; /* recovery */ 134 135 /* allocate once for all bios */ 136 if (!conf->have_replacement) 137 nalloc_rp = nalloc; 138 else 139 nalloc_rp = nalloc * 2; 140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags); 141 if (!rps) 142 goto out_free_r10bio; 143 144 /* 145 * Allocate bios. 
146 */ 147 for (j = nalloc ; j-- ; ) { 148 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 149 if (!bio) 150 goto out_free_bio; 151 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 152 r10_bio->devs[j].bio = bio; 153 if (!conf->have_replacement) 154 continue; 155 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 156 if (!bio) 157 goto out_free_bio; 158 bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 159 r10_bio->devs[j].repl_bio = bio; 160 } 161 /* 162 * Allocate RESYNC_PAGES data pages and attach them 163 * where needed. 164 */ 165 for (j = 0; j < nalloc; j++) { 166 struct bio *rbio = r10_bio->devs[j].repl_bio; 167 struct resync_pages *rp, *rp_repl; 168 169 rp = &rps[j]; 170 if (rbio) 171 rp_repl = &rps[nalloc + j]; 172 173 bio = r10_bio->devs[j].bio; 174 175 if (!j || test_bit(MD_RECOVERY_SYNC, 176 &conf->mddev->recovery)) { 177 if (resync_alloc_pages(rp, gfp_flags)) 178 goto out_free_pages; 179 } else { 180 memcpy(rp, &rps[0], sizeof(*rp)); 181 resync_get_all_pages(rp); 182 } 183 184 rp->raid_bio = r10_bio; 185 bio->bi_private = rp; 186 if (rbio) { 187 memcpy(rp_repl, rp, sizeof(*rp)); 188 rbio->bi_private = rp_repl; 189 } 190 } 191 192 return r10_bio; 193 194 out_free_pages: 195 while (--j >= 0) 196 resync_free_pages(&rps[j]); 197 198 j = 0; 199 out_free_bio: 200 for ( ; j < nalloc; j++) { 201 if (r10_bio->devs[j].bio) 202 bio_uninit(r10_bio->devs[j].bio); 203 kfree(r10_bio->devs[j].bio); 204 if (r10_bio->devs[j].repl_bio) 205 bio_uninit(r10_bio->devs[j].repl_bio); 206 kfree(r10_bio->devs[j].repl_bio); 207 } 208 kfree(rps); 209 out_free_r10bio: 210 rbio_pool_free(r10_bio, conf); 211 return NULL; 212 } 213 214 static void r10buf_pool_free(void *__r10_bio, void *data) 215 { 216 struct r10conf *conf = data; 217 struct r10bio *r10bio = __r10_bio; 218 int j; 219 struct resync_pages *rp = NULL; 220 221 for (j = conf->copies; j--; ) { 222 struct bio *bio = r10bio->devs[j].bio; 223 224 if (bio) { 225 rp = get_resync_pages(bio); 226 resync_free_pages(rp); 227 
bio_uninit(bio); 228 kfree(bio); 229 } 230 231 bio = r10bio->devs[j].repl_bio; 232 if (bio) { 233 bio_uninit(bio); 234 kfree(bio); 235 } 236 } 237 238 /* resync pages array stored in the 1st bio's .bi_private */ 239 kfree(rp); 240 241 rbio_pool_free(r10bio, conf); 242 } 243 244 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 245 { 246 int i; 247 248 for (i = 0; i < conf->geo.raid_disks; i++) { 249 struct bio **bio = & r10_bio->devs[i].bio; 250 if (!BIO_SPECIAL(*bio)) 251 bio_put(*bio); 252 *bio = NULL; 253 bio = &r10_bio->devs[i].repl_bio; 254 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) 255 bio_put(*bio); 256 *bio = NULL; 257 } 258 } 259 260 static void free_r10bio(struct r10bio *r10_bio) 261 { 262 struct r10conf *conf = r10_bio->mddev->private; 263 264 put_all_bios(conf, r10_bio); 265 mempool_free(r10_bio, &conf->r10bio_pool); 266 } 267 268 static void put_buf(struct r10bio *r10_bio) 269 { 270 struct r10conf *conf = r10_bio->mddev->private; 271 272 mempool_free(r10_bio, &conf->r10buf_pool); 273 274 lower_barrier(conf); 275 } 276 277 static void reschedule_retry(struct r10bio *r10_bio) 278 { 279 unsigned long flags; 280 struct mddev *mddev = r10_bio->mddev; 281 struct r10conf *conf = mddev->private; 282 283 spin_lock_irqsave(&conf->device_lock, flags); 284 list_add(&r10_bio->retry_list, &conf->retry_list); 285 conf->nr_queued ++; 286 spin_unlock_irqrestore(&conf->device_lock, flags); 287 288 /* wake up frozen array... */ 289 wake_up(&conf->wait_barrier); 290 291 md_wakeup_thread(mddev->thread); 292 } 293 294 /* 295 * raid_end_bio_io() is called when we have finished servicing a mirrored 296 * operation and are ready to return a success/failure code to the buffer 297 * cache layer. 
298 */ 299 static void raid_end_bio_io(struct r10bio *r10_bio) 300 { 301 struct bio *bio = r10_bio->master_bio; 302 struct r10conf *conf = r10_bio->mddev->private; 303 304 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 305 bio->bi_status = BLK_STS_IOERR; 306 307 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 308 bio_end_io_acct(bio, r10_bio->start_time); 309 bio_endio(bio); 310 /* 311 * Wake up any possible resync thread that waits for the device 312 * to go idle. 313 */ 314 allow_barrier(conf); 315 316 free_r10bio(r10_bio); 317 } 318 319 /* 320 * Update disk head position estimator based on IRQ completion info. 321 */ 322 static inline void update_head_pos(int slot, struct r10bio *r10_bio) 323 { 324 struct r10conf *conf = r10_bio->mddev->private; 325 326 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 327 r10_bio->devs[slot].addr + (r10_bio->sectors); 328 } 329 330 /* 331 * Find the disk number which triggered given bio 332 */ 333 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 334 struct bio *bio, int *slotp, int *replp) 335 { 336 int slot; 337 int repl = 0; 338 339 for (slot = 0; slot < conf->geo.raid_disks; slot++) { 340 if (r10_bio->devs[slot].bio == bio) 341 break; 342 if (r10_bio->devs[slot].repl_bio == bio) { 343 repl = 1; 344 break; 345 } 346 } 347 348 update_head_pos(slot, r10_bio); 349 350 if (slotp) 351 *slotp = slot; 352 if (replp) 353 *replp = repl; 354 return r10_bio->devs[slot].devnum; 355 } 356 357 static void raid10_end_read_request(struct bio *bio) 358 { 359 int uptodate = !bio->bi_status; 360 struct r10bio *r10_bio = bio->bi_private; 361 int slot; 362 struct md_rdev *rdev; 363 struct r10conf *conf = r10_bio->mddev->private; 364 365 slot = r10_bio->read_slot; 366 rdev = r10_bio->devs[slot].rdev; 367 /* 368 * this branch is our 'one mirror IO has finished' event handler: 369 */ 370 update_head_pos(slot, r10_bio); 371 372 if (uptodate) { 373 /* 374 * Set R10BIO_Uptodate in our master bio, so that 375 * we 
will return a good error code to the higher 376 * levels even if IO on some other mirrored buffer fails. 377 * 378 * The 'master' represents the composite IO operation to 379 * user-side. So if something waits for IO, then it will 380 * wait for the 'master' bio. 381 */ 382 set_bit(R10BIO_Uptodate, &r10_bio->state); 383 } else { 384 /* If all other devices that store this block have 385 * failed, we want to return the error upwards rather 386 * than fail the last device. Here we redefine 387 * "uptodate" to mean "Don't want to retry" 388 */ 389 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), 390 rdev->raid_disk)) 391 uptodate = 1; 392 } 393 if (uptodate) { 394 raid_end_bio_io(r10_bio); 395 rdev_dec_pending(rdev, conf->mddev); 396 } else { 397 /* 398 * oops, read error - keep the refcount on the rdev 399 */ 400 pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n", 401 mdname(conf->mddev), 402 rdev->bdev, 403 (unsigned long long)r10_bio->sector); 404 set_bit(R10BIO_ReadError, &r10_bio->state); 405 reschedule_retry(r10_bio); 406 } 407 } 408 409 static void close_write(struct r10bio *r10_bio) 410 { 411 /* clear the bitmap if all writes complete successfully */ 412 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 413 r10_bio->sectors, 414 !test_bit(R10BIO_Degraded, &r10_bio->state), 415 0); 416 md_write_end(r10_bio->mddev); 417 } 418 419 static void one_write_done(struct r10bio *r10_bio) 420 { 421 if (atomic_dec_and_test(&r10_bio->remaining)) { 422 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 423 reschedule_retry(r10_bio); 424 else { 425 close_write(r10_bio); 426 if (test_bit(R10BIO_MadeGood, &r10_bio->state)) 427 reschedule_retry(r10_bio); 428 else 429 raid_end_bio_io(r10_bio); 430 } 431 } 432 } 433 434 static void raid10_end_write_request(struct bio *bio) 435 { 436 struct r10bio *r10_bio = bio->bi_private; 437 int dev; 438 int dec_rdev = 1; 439 struct r10conf *conf = r10_bio->mddev->private; 440 int slot, repl; 441 struct 
md_rdev *rdev = NULL; 442 struct bio *to_put = NULL; 443 bool discard_error; 444 445 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; 446 447 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 448 449 if (repl) 450 rdev = conf->mirrors[dev].replacement; 451 if (!rdev) { 452 smp_rmb(); 453 repl = 0; 454 rdev = conf->mirrors[dev].rdev; 455 } 456 /* 457 * this branch is our 'one mirror IO has finished' event handler: 458 */ 459 if (bio->bi_status && !discard_error) { 460 if (repl) 461 /* Never record new bad blocks to replacement, 462 * just fail it. 463 */ 464 md_error(rdev->mddev, rdev); 465 else { 466 set_bit(WriteErrorSeen, &rdev->flags); 467 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 468 set_bit(MD_RECOVERY_NEEDED, 469 &rdev->mddev->recovery); 470 471 dec_rdev = 0; 472 if (test_bit(FailFast, &rdev->flags) && 473 (bio->bi_opf & MD_FAILFAST)) { 474 md_error(rdev->mddev, rdev); 475 } 476 477 /* 478 * When the device is faulty, it is not necessary to 479 * handle write error. 480 */ 481 if (!test_bit(Faulty, &rdev->flags)) 482 set_bit(R10BIO_WriteError, &r10_bio->state); 483 else { 484 /* Fail the request */ 485 set_bit(R10BIO_Degraded, &r10_bio->state); 486 r10_bio->devs[slot].bio = NULL; 487 to_put = bio; 488 dec_rdev = 1; 489 } 490 } 491 } else { 492 /* 493 * Set R10BIO_Uptodate in our master bio, so that 494 * we will return a good error code for to the higher 495 * levels even if IO on some other mirrored buffer fails. 496 * 497 * The 'master' represents the composite IO operation to 498 * user-side. So if something waits for IO, then it will 499 * wait for the 'master' bio. 500 */ 501 sector_t first_bad; 502 int bad_sectors; 503 504 /* 505 * Do not set R10BIO_Uptodate if the current device is 506 * rebuilding or Faulty. 
This is because we cannot use 507 * such device for properly reading the data back (we could 508 * potentially use it, if the current write would have felt 509 * before rdev->recovery_offset, but for simplicity we don't 510 * check this here. 511 */ 512 if (test_bit(In_sync, &rdev->flags) && 513 !test_bit(Faulty, &rdev->flags)) 514 set_bit(R10BIO_Uptodate, &r10_bio->state); 515 516 /* Maybe we can clear some bad blocks. */ 517 if (is_badblock(rdev, 518 r10_bio->devs[slot].addr, 519 r10_bio->sectors, 520 &first_bad, &bad_sectors) && !discard_error) { 521 bio_put(bio); 522 if (repl) 523 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; 524 else 525 r10_bio->devs[slot].bio = IO_MADE_GOOD; 526 dec_rdev = 0; 527 set_bit(R10BIO_MadeGood, &r10_bio->state); 528 } 529 } 530 531 /* 532 * 533 * Let's see if all mirrored write operations have finished 534 * already. 535 */ 536 one_write_done(r10_bio); 537 if (dec_rdev) 538 rdev_dec_pending(rdev, conf->mddev); 539 if (to_put) 540 bio_put(to_put); 541 } 542 543 /* 544 * RAID10 layout manager 545 * As well as the chunksize and raid_disks count, there are two 546 * parameters: near_copies and far_copies. 547 * near_copies * far_copies must be <= raid_disks. 548 * Normally one of these will be 1. 549 * If both are 1, we get raid0. 550 * If near_copies == raid_disks, we get raid1. 551 * 552 * Chunks are laid out in raid0 style with near_copies copies of the 553 * first chunk, followed by near_copies copies of the next chunk and 554 * so on. 555 * If far_copies > 1, then after 1/far_copies of the array has been assigned 556 * as described above, we start again with a device offset of near_copies. 557 * So we effectively have another copy of the whole array further down all 558 * the drives, but with blocks on different drives. 559 * With this layout, and block is never stored twice on the one device. 560 * 561 * raid10_find_phys finds the sector offset of a given virtual sector 562 * on each device that it is on. 
563 * 564 * raid10_find_virt does the reverse mapping, from a device and a 565 * sector offset to a virtual address 566 */ 567 568 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 569 { 570 int n,f; 571 sector_t sector; 572 sector_t chunk; 573 sector_t stripe; 574 int dev; 575 int slot = 0; 576 int last_far_set_start, last_far_set_size; 577 578 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 579 last_far_set_start *= geo->far_set_size; 580 581 last_far_set_size = geo->far_set_size; 582 last_far_set_size += (geo->raid_disks % geo->far_set_size); 583 584 /* now calculate first sector/dev */ 585 chunk = r10bio->sector >> geo->chunk_shift; 586 sector = r10bio->sector & geo->chunk_mask; 587 588 chunk *= geo->near_copies; 589 stripe = chunk; 590 dev = sector_div(stripe, geo->raid_disks); 591 if (geo->far_offset) 592 stripe *= geo->far_copies; 593 594 sector += stripe << geo->chunk_shift; 595 596 /* and calculate all the others */ 597 for (n = 0; n < geo->near_copies; n++) { 598 int d = dev; 599 int set; 600 sector_t s = sector; 601 r10bio->devs[slot].devnum = d; 602 r10bio->devs[slot].addr = s; 603 slot++; 604 605 for (f = 1; f < geo->far_copies; f++) { 606 set = d / geo->far_set_size; 607 d += geo->near_copies; 608 609 if ((geo->raid_disks % geo->far_set_size) && 610 (d > last_far_set_start)) { 611 d -= last_far_set_start; 612 d %= last_far_set_size; 613 d += last_far_set_start; 614 } else { 615 d %= geo->far_set_size; 616 d += geo->far_set_size * set; 617 } 618 s += geo->stride; 619 r10bio->devs[slot].devnum = d; 620 r10bio->devs[slot].addr = s; 621 slot++; 622 } 623 dev++; 624 if (dev >= geo->raid_disks) { 625 dev = 0; 626 sector += (geo->chunk_mask + 1); 627 } 628 } 629 } 630 631 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 632 { 633 struct geom *geo = &conf->geo; 634 635 if (conf->reshape_progress != MaxSector && 636 ((r10bio->sector >= conf->reshape_progress) != 637 conf->mddev->reshape_backwards)) 
{ 638 set_bit(R10BIO_Previous, &r10bio->state); 639 geo = &conf->prev; 640 } else 641 clear_bit(R10BIO_Previous, &r10bio->state); 642 643 __raid10_find_phys(geo, r10bio); 644 } 645 646 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 647 { 648 sector_t offset, chunk, vchunk; 649 /* Never use conf->prev as this is only called during resync 650 * or recovery, so reshape isn't happening 651 */ 652 struct geom *geo = &conf->geo; 653 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; 654 int far_set_size = geo->far_set_size; 655 int last_far_set_start; 656 657 if (geo->raid_disks % geo->far_set_size) { 658 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; 659 last_far_set_start *= geo->far_set_size; 660 661 if (dev >= last_far_set_start) { 662 far_set_size = geo->far_set_size; 663 far_set_size += (geo->raid_disks % geo->far_set_size); 664 far_set_start = last_far_set_start; 665 } 666 } 667 668 offset = sector & geo->chunk_mask; 669 if (geo->far_offset) { 670 int fc; 671 chunk = sector >> geo->chunk_shift; 672 fc = sector_div(chunk, geo->far_copies); 673 dev -= fc * geo->near_copies; 674 if (dev < far_set_start) 675 dev += far_set_size; 676 } else { 677 while (sector >= geo->stride) { 678 sector -= geo->stride; 679 if (dev < (geo->near_copies + far_set_start)) 680 dev += far_set_size - geo->near_copies; 681 else 682 dev -= geo->near_copies; 683 } 684 chunk = sector >> geo->chunk_shift; 685 } 686 vchunk = chunk * geo->raid_disks + dev; 687 sector_div(vchunk, geo->near_copies); 688 return (vchunk << geo->chunk_shift) + offset; 689 } 690 691 /* 692 * This routine returns the disk from which the requested read should 693 * be done. There is a per-array 'next expected sequential IO' sector 694 * number - if this matches on the next IO then we use the last disk. 
695 * There is also a per-disk 'last know head position' sector that is 696 * maintained from IRQ contexts, both the normal and the resync IO 697 * completion handlers update this position correctly. If there is no 698 * perfect sequential match then we pick the disk whose head is closest. 699 * 700 * If there are 2 mirrors in the same 2 devices, performance degrades 701 * because position is mirror, not device based. 702 * 703 * The rdev for the device selected will have nr_pending incremented. 704 */ 705 706 /* 707 * FIXME: possibly should rethink readbalancing and do it differently 708 * depending on near_copies / far_copies geometry. 709 */ 710 static struct md_rdev *read_balance(struct r10conf *conf, 711 struct r10bio *r10_bio, 712 int *max_sectors) 713 { 714 const sector_t this_sector = r10_bio->sector; 715 int disk, slot; 716 int sectors = r10_bio->sectors; 717 int best_good_sectors; 718 sector_t new_distance, best_dist; 719 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; 720 int do_balance; 721 int best_dist_slot, best_pending_slot; 722 bool has_nonrot_disk = false; 723 unsigned int min_pending; 724 struct geom *geo = &conf->geo; 725 726 raid10_find_phys(conf, r10_bio); 727 rcu_read_lock(); 728 best_dist_slot = -1; 729 min_pending = UINT_MAX; 730 best_dist_rdev = NULL; 731 best_pending_rdev = NULL; 732 best_dist = MaxSector; 733 best_good_sectors = 0; 734 do_balance = 1; 735 clear_bit(R10BIO_FailFast, &r10_bio->state); 736 /* 737 * Check if we can balance. We can balance on the whole 738 * device if no resync is going on (recovery is ok), or below 739 * the resync window. We take the first readable disk when 740 * above the resync window. 
741 */ 742 if ((conf->mddev->recovery_cp < MaxSector 743 && (this_sector + sectors >= conf->next_resync)) || 744 (mddev_is_clustered(conf->mddev) && 745 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 746 this_sector + sectors))) 747 do_balance = 0; 748 749 for (slot = 0; slot < conf->copies ; slot++) { 750 sector_t first_bad; 751 int bad_sectors; 752 sector_t dev_sector; 753 unsigned int pending; 754 bool nonrot; 755 756 if (r10_bio->devs[slot].bio == IO_BLOCKED) 757 continue; 758 disk = r10_bio->devs[slot].devnum; 759 rdev = rcu_dereference(conf->mirrors[disk].replacement); 760 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 761 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 762 rdev = rcu_dereference(conf->mirrors[disk].rdev); 763 if (rdev == NULL || 764 test_bit(Faulty, &rdev->flags)) 765 continue; 766 if (!test_bit(In_sync, &rdev->flags) && 767 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 768 continue; 769 770 dev_sector = r10_bio->devs[slot].addr; 771 if (is_badblock(rdev, dev_sector, sectors, 772 &first_bad, &bad_sectors)) { 773 if (best_dist < MaxSector) 774 /* Already have a better slot */ 775 continue; 776 if (first_bad <= dev_sector) { 777 /* Cannot read here. If this is the 778 * 'primary' device, then we must not read 779 * beyond 'bad_sectors' from another device. 
780 */ 781 bad_sectors -= (dev_sector - first_bad); 782 if (!do_balance && sectors > bad_sectors) 783 sectors = bad_sectors; 784 if (best_good_sectors > sectors) 785 best_good_sectors = sectors; 786 } else { 787 sector_t good_sectors = 788 first_bad - dev_sector; 789 if (good_sectors > best_good_sectors) { 790 best_good_sectors = good_sectors; 791 best_dist_slot = slot; 792 best_dist_rdev = rdev; 793 } 794 if (!do_balance) 795 /* Must read from here */ 796 break; 797 } 798 continue; 799 } else 800 best_good_sectors = sectors; 801 802 if (!do_balance) 803 break; 804 805 nonrot = bdev_nonrot(rdev->bdev); 806 has_nonrot_disk |= nonrot; 807 pending = atomic_read(&rdev->nr_pending); 808 if (min_pending > pending && nonrot) { 809 min_pending = pending; 810 best_pending_slot = slot; 811 best_pending_rdev = rdev; 812 } 813 814 if (best_dist_slot >= 0) 815 /* At least 2 disks to choose from so failfast is OK */ 816 set_bit(R10BIO_FailFast, &r10_bio->state); 817 /* This optimisation is debatable, and completely destroys 818 * sequential read speed for 'far copies' arrays. So only 819 * keep it for 'near' arrays, and review those later. 
820 */ 821 if (geo->near_copies > 1 && !pending) 822 new_distance = 0; 823 824 /* for far > 1 always use the lowest address */ 825 else if (geo->far_copies > 1) 826 new_distance = r10_bio->devs[slot].addr; 827 else 828 new_distance = abs(r10_bio->devs[slot].addr - 829 conf->mirrors[disk].head_position); 830 831 if (new_distance < best_dist) { 832 best_dist = new_distance; 833 best_dist_slot = slot; 834 best_dist_rdev = rdev; 835 } 836 } 837 if (slot >= conf->copies) { 838 if (has_nonrot_disk) { 839 slot = best_pending_slot; 840 rdev = best_pending_rdev; 841 } else { 842 slot = best_dist_slot; 843 rdev = best_dist_rdev; 844 } 845 } 846 847 if (slot >= 0) { 848 atomic_inc(&rdev->nr_pending); 849 r10_bio->read_slot = slot; 850 } else 851 rdev = NULL; 852 rcu_read_unlock(); 853 *max_sectors = best_good_sectors; 854 855 return rdev; 856 } 857 858 static void flush_pending_writes(struct r10conf *conf) 859 { 860 /* Any writes that have been queued but are awaiting 861 * bitmap updates get flushed here. 862 */ 863 spin_lock_irq(&conf->device_lock); 864 865 if (conf->pending_bio_list.head) { 866 struct blk_plug plug; 867 struct bio *bio; 868 869 bio = bio_list_get(&conf->pending_bio_list); 870 spin_unlock_irq(&conf->device_lock); 871 872 /* 873 * As this is called in a wait_event() loop (see freeze_array), 874 * current->state might be TASK_UNINTERRUPTIBLE which will 875 * cause a warning when we prepare to wait again. As it is 876 * rare that this path is taken, it is perfectly safe to force 877 * us to go around the wait_event() loop again, so the warning 878 * is a false-positive. 
Silence the warning by resetting 879 * thread state 880 */ 881 __set_current_state(TASK_RUNNING); 882 883 blk_start_plug(&plug); 884 /* flush any pending bitmap writes to disk 885 * before proceeding w/ I/O */ 886 md_bitmap_unplug(conf->mddev->bitmap); 887 wake_up(&conf->wait_barrier); 888 889 while (bio) { /* submit pending writes */ 890 struct bio *next = bio->bi_next; 891 struct md_rdev *rdev = (void*)bio->bi_bdev; 892 bio->bi_next = NULL; 893 bio_set_dev(bio, rdev->bdev); 894 if (test_bit(Faulty, &rdev->flags)) { 895 bio_io_error(bio); 896 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 897 !bdev_max_discard_sectors(bio->bi_bdev))) 898 /* Just ignore it */ 899 bio_endio(bio); 900 else 901 submit_bio_noacct(bio); 902 bio = next; 903 } 904 blk_finish_plug(&plug); 905 } else 906 spin_unlock_irq(&conf->device_lock); 907 } 908 909 /* Barriers.... 910 * Sometimes we need to suspend IO while we do something else, 911 * either some resync/recovery, or reconfigure the array. 912 * To do this we raise a 'barrier'. 913 * The 'barrier' is a counter that can be raised multiple times 914 * to count how many activities are happening which preclude 915 * normal IO. 916 * We can only raise the barrier if there is no pending IO. 917 * i.e. if nr_pending == 0. 918 * We choose only to raise the barrier if no-one is waiting for the 919 * barrier to go down. This means that as soon as an IO request 920 * is ready, no other operations which require a barrier will start 921 * until the IO request has had a chance. 922 * 923 * So: regular IO calls 'wait_barrier'. When that returns there 924 * is no backgroup IO happening, It must arrange to call 925 * allow_barrier when it has finished its IO. 926 * backgroup IO calls must call raise_barrier. Once that returns 927 * there is no normal IO happeing. It must arrange to call 928 * lower_barrier when the particular background IO completes. 
929 */ 930 931 static void raise_barrier(struct r10conf *conf, int force) 932 { 933 BUG_ON(force && !conf->barrier); 934 spin_lock_irq(&conf->resync_lock); 935 936 /* Wait until no block IO is waiting (unless 'force') */ 937 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 938 conf->resync_lock); 939 940 /* block any new IO from starting */ 941 conf->barrier++; 942 943 /* Now wait for all pending IO to complete */ 944 wait_event_lock_irq(conf->wait_barrier, 945 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, 946 conf->resync_lock); 947 948 spin_unlock_irq(&conf->resync_lock); 949 } 950 951 static void lower_barrier(struct r10conf *conf) 952 { 953 unsigned long flags; 954 spin_lock_irqsave(&conf->resync_lock, flags); 955 conf->barrier--; 956 spin_unlock_irqrestore(&conf->resync_lock, flags); 957 wake_up(&conf->wait_barrier); 958 } 959 960 static bool wait_barrier(struct r10conf *conf, bool nowait) 961 { 962 bool ret = true; 963 964 spin_lock_irq(&conf->resync_lock); 965 if (conf->barrier) { 966 struct bio_list *bio_list = current->bio_list; 967 conf->nr_waiting++; 968 /* Wait for the barrier to drop. 969 * However if there are already pending 970 * requests (preventing the barrier from 971 * rising completely), and the 972 * pre-process bio queue isn't empty, 973 * then don't wait, as we need to empty 974 * that queue to get the nr_pending 975 * count down. 
976 */ 977 /* Return false when nowait flag is set */ 978 if (nowait) { 979 ret = false; 980 } else { 981 raid10_log(conf->mddev, "wait barrier"); 982 wait_event_lock_irq(conf->wait_barrier, 983 !conf->barrier || 984 (atomic_read(&conf->nr_pending) && 985 bio_list && 986 (!bio_list_empty(&bio_list[0]) || 987 !bio_list_empty(&bio_list[1]))) || 988 /* move on if recovery thread is 989 * blocked by us 990 */ 991 (conf->mddev->thread->tsk == current && 992 test_bit(MD_RECOVERY_RUNNING, 993 &conf->mddev->recovery) && 994 conf->nr_queued > 0), 995 conf->resync_lock); 996 } 997 conf->nr_waiting--; 998 if (!conf->nr_waiting) 999 wake_up(&conf->wait_barrier); 1000 } 1001 /* Only increment nr_pending when we wait */ 1002 if (ret) 1003 atomic_inc(&conf->nr_pending); 1004 spin_unlock_irq(&conf->resync_lock); 1005 return ret; 1006 } 1007 1008 static void allow_barrier(struct r10conf *conf) 1009 { 1010 if ((atomic_dec_and_test(&conf->nr_pending)) || 1011 (conf->array_freeze_pending)) 1012 wake_up(&conf->wait_barrier); 1013 } 1014 1015 static void freeze_array(struct r10conf *conf, int extra) 1016 { 1017 /* stop syncio and normal IO and wait for everything to 1018 * go quiet. 1019 * We increment barrier and nr_waiting, and then 1020 * wait until nr_pending match nr_queued+extra 1021 * This is called in the context of one normal IO request 1022 * that has failed. Thus any sync request that might be pending 1023 * will be blocked by nr_pending, and we need to wait for 1024 * pending IO requests to complete or be queued for re-try. 1025 * Thus the number queued (nr_queued) plus this request (extra) 1026 * must match the number of pending IOs (nr_pending) before 1027 * we continue. 
1028 */ 1029 spin_lock_irq(&conf->resync_lock); 1030 conf->array_freeze_pending++; 1031 conf->barrier++; 1032 conf->nr_waiting++; 1033 wait_event_lock_irq_cmd(conf->wait_barrier, 1034 atomic_read(&conf->nr_pending) == conf->nr_queued+extra, 1035 conf->resync_lock, 1036 flush_pending_writes(conf)); 1037 1038 conf->array_freeze_pending--; 1039 spin_unlock_irq(&conf->resync_lock); 1040 } 1041 1042 static void unfreeze_array(struct r10conf *conf) 1043 { 1044 /* reverse the effect of the freeze */ 1045 spin_lock_irq(&conf->resync_lock); 1046 conf->barrier--; 1047 conf->nr_waiting--; 1048 wake_up(&conf->wait_barrier); 1049 spin_unlock_irq(&conf->resync_lock); 1050 } 1051 1052 static sector_t choose_data_offset(struct r10bio *r10_bio, 1053 struct md_rdev *rdev) 1054 { 1055 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || 1056 test_bit(R10BIO_Previous, &r10_bio->state)) 1057 return rdev->data_offset; 1058 else 1059 return rdev->new_data_offset; 1060 } 1061 1062 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) 1063 { 1064 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb); 1065 struct mddev *mddev = plug->cb.data; 1066 struct r10conf *conf = mddev->private; 1067 struct bio *bio; 1068 1069 if (from_schedule || current->bio_list) { 1070 spin_lock_irq(&conf->device_lock); 1071 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1072 spin_unlock_irq(&conf->device_lock); 1073 wake_up(&conf->wait_barrier); 1074 md_wakeup_thread(mddev->thread); 1075 kfree(plug); 1076 return; 1077 } 1078 1079 /* we aren't scheduling, so we can do the write-out directly. 
*/ 1080 bio = bio_list_get(&plug->pending); 1081 md_bitmap_unplug(mddev->bitmap); 1082 wake_up(&conf->wait_barrier); 1083 1084 while (bio) { /* submit pending writes */ 1085 struct bio *next = bio->bi_next; 1086 struct md_rdev *rdev = (void*)bio->bi_bdev; 1087 bio->bi_next = NULL; 1088 bio_set_dev(bio, rdev->bdev); 1089 if (test_bit(Faulty, &rdev->flags)) { 1090 bio_io_error(bio); 1091 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 1092 !bdev_max_discard_sectors(bio->bi_bdev))) 1093 /* Just ignore it */ 1094 bio_endio(bio); 1095 else 1096 submit_bio_noacct(bio); 1097 bio = next; 1098 } 1099 kfree(plug); 1100 } 1101 1102 /* 1103 * 1. Register the new request and wait if the reconstruction thread has put 1104 * up a bar for new requests. Continue immediately if no resync is active 1105 * currently. 1106 * 2. If IO spans the reshape position. Need to wait for reshape to pass. 1107 */ 1108 static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, 1109 struct bio *bio, sector_t sectors) 1110 { 1111 /* Bail out if REQ_NOWAIT is set for the bio */ 1112 if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { 1113 bio_wouldblock_error(bio); 1114 return false; 1115 } 1116 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1117 bio->bi_iter.bi_sector < conf->reshape_progress && 1118 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { 1119 allow_barrier(conf); 1120 if (bio->bi_opf & REQ_NOWAIT) { 1121 bio_wouldblock_error(bio); 1122 return false; 1123 } 1124 raid10_log(conf->mddev, "wait reshape"); 1125 wait_event(conf->wait_barrier, 1126 conf->reshape_progress <= bio->bi_iter.bi_sector || 1127 conf->reshape_progress >= bio->bi_iter.bi_sector + 1128 sectors); 1129 wait_barrier(conf, false); 1130 } 1131 return true; 1132 } 1133 1134 static void raid10_read_request(struct mddev *mddev, struct bio *bio, 1135 struct r10bio *r10_bio) 1136 { 1137 struct r10conf *conf = mddev->private; 1138 struct bio *read_bio; 1139 const enum req_op op = 
bio_op(bio); 1140 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1141 int max_sectors; 1142 struct md_rdev *rdev; 1143 char b[BDEVNAME_SIZE]; 1144 int slot = r10_bio->read_slot; 1145 struct md_rdev *err_rdev = NULL; 1146 gfp_t gfp = GFP_NOIO; 1147 1148 if (slot >= 0 && r10_bio->devs[slot].rdev) { 1149 /* 1150 * This is an error retry, but we cannot 1151 * safely dereference the rdev in the r10_bio, 1152 * we must use the one in conf. 1153 * If it has already been disconnected (unlikely) 1154 * we lose the device name in error messages. 1155 */ 1156 int disk; 1157 /* 1158 * As we are blocking raid10, it is a little safer to 1159 * use __GFP_HIGH. 1160 */ 1161 gfp = GFP_NOIO | __GFP_HIGH; 1162 1163 rcu_read_lock(); 1164 disk = r10_bio->devs[slot].devnum; 1165 err_rdev = rcu_dereference(conf->mirrors[disk].rdev); 1166 if (err_rdev) 1167 snprintf(b, sizeof(b), "%pg", err_rdev->bdev); 1168 else { 1169 strcpy(b, "???"); 1170 /* This never gets dereferenced */ 1171 err_rdev = r10_bio->devs[slot].rdev; 1172 } 1173 rcu_read_unlock(); 1174 } 1175 1176 if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) 1177 return; 1178 rdev = read_balance(conf, r10_bio, &max_sectors); 1179 if (!rdev) { 1180 if (err_rdev) { 1181 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", 1182 mdname(mddev), b, 1183 (unsigned long long)r10_bio->sector); 1184 } 1185 raid_end_bio_io(r10_bio); 1186 return; 1187 } 1188 if (err_rdev) 1189 pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", 1190 mdname(mddev), 1191 rdev->bdev, 1192 (unsigned long long)r10_bio->sector); 1193 if (max_sectors < bio_sectors(bio)) { 1194 struct bio *split = bio_split(bio, max_sectors, 1195 gfp, &conf->bio_split); 1196 bio_chain(split, bio); 1197 allow_barrier(conf); 1198 submit_bio_noacct(bio); 1199 wait_barrier(conf, false); 1200 bio = split; 1201 r10_bio->master_bio = bio; 1202 r10_bio->sectors = max_sectors; 1203 } 1204 slot = 
r10_bio->read_slot; 1205 1206 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1207 r10_bio->start_time = bio_start_io_acct(bio); 1208 read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 1209 1210 r10_bio->devs[slot].bio = read_bio; 1211 r10_bio->devs[slot].rdev = rdev; 1212 1213 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + 1214 choose_data_offset(r10_bio, rdev); 1215 read_bio->bi_end_io = raid10_end_read_request; 1216 bio_set_op_attrs(read_bio, op, do_sync); 1217 if (test_bit(FailFast, &rdev->flags) && 1218 test_bit(R10BIO_FailFast, &r10_bio->state)) 1219 read_bio->bi_opf |= MD_FAILFAST; 1220 read_bio->bi_private = r10_bio; 1221 1222 if (mddev->gendisk) 1223 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1224 r10_bio->sector); 1225 submit_bio_noacct(read_bio); 1226 return; 1227 } 1228 1229 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, 1230 struct bio *bio, bool replacement, 1231 int n_copy) 1232 { 1233 const enum req_op op = bio_op(bio); 1234 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1235 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1236 unsigned long flags; 1237 struct blk_plug_cb *cb; 1238 struct raid1_plug_cb *plug = NULL; 1239 struct r10conf *conf = mddev->private; 1240 struct md_rdev *rdev; 1241 int devnum = r10_bio->devs[n_copy].devnum; 1242 struct bio *mbio; 1243 1244 if (replacement) { 1245 rdev = conf->mirrors[devnum].replacement; 1246 if (rdev == NULL) { 1247 /* Replacement just got moved to main 'rdev' */ 1248 smp_mb(); 1249 rdev = conf->mirrors[devnum].rdev; 1250 } 1251 } else 1252 rdev = conf->mirrors[devnum].rdev; 1253 1254 mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); 1255 if (replacement) 1256 r10_bio->devs[n_copy].repl_bio = mbio; 1257 else 1258 r10_bio->devs[n_copy].bio = mbio; 1259 1260 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1261 choose_data_offset(r10_bio, rdev)); 1262 mbio->bi_end_io = raid10_end_write_request; 1263 
bio_set_op_attrs(mbio, op, do_sync | do_fua); 1264 if (!replacement && test_bit(FailFast, 1265 &conf->mirrors[devnum].rdev->flags) 1266 && enough(conf, devnum)) 1267 mbio->bi_opf |= MD_FAILFAST; 1268 mbio->bi_private = r10_bio; 1269 1270 if (conf->mddev->gendisk) 1271 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 1272 r10_bio->sector); 1273 /* flush_pending_writes() needs access to the rdev so...*/ 1274 mbio->bi_bdev = (void *)rdev; 1275 1276 atomic_inc(&r10_bio->remaining); 1277 1278 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1279 if (cb) 1280 plug = container_of(cb, struct raid1_plug_cb, cb); 1281 else 1282 plug = NULL; 1283 if (plug) { 1284 bio_list_add(&plug->pending, mbio); 1285 } else { 1286 spin_lock_irqsave(&conf->device_lock, flags); 1287 bio_list_add(&conf->pending_bio_list, mbio); 1288 spin_unlock_irqrestore(&conf->device_lock, flags); 1289 md_wakeup_thread(mddev->thread); 1290 } 1291 } 1292 1293 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1294 { 1295 int i; 1296 struct r10conf *conf = mddev->private; 1297 struct md_rdev *blocked_rdev; 1298 1299 retry_wait: 1300 blocked_rdev = NULL; 1301 rcu_read_lock(); 1302 for (i = 0; i < conf->copies; i++) { 1303 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1304 struct md_rdev *rrdev = rcu_dereference( 1305 conf->mirrors[i].replacement); 1306 if (rdev == rrdev) 1307 rrdev = NULL; 1308 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1309 atomic_inc(&rdev->nr_pending); 1310 blocked_rdev = rdev; 1311 break; 1312 } 1313 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1314 atomic_inc(&rrdev->nr_pending); 1315 blocked_rdev = rrdev; 1316 break; 1317 } 1318 1319 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1320 sector_t first_bad; 1321 sector_t dev_sector = r10_bio->devs[i].addr; 1322 int bad_sectors; 1323 int is_bad; 1324 1325 /* 1326 * Discard request doesn't care the write result 1327 * so it doesn't need to wait 
blocked disk here. 1328 */ 1329 if (!r10_bio->sectors) 1330 continue; 1331 1332 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 1333 &first_bad, &bad_sectors); 1334 if (is_bad < 0) { 1335 /* 1336 * Mustn't write here until the bad block 1337 * is acknowledged 1338 */ 1339 atomic_inc(&rdev->nr_pending); 1340 set_bit(BlockedBadBlocks, &rdev->flags); 1341 blocked_rdev = rdev; 1342 break; 1343 } 1344 } 1345 } 1346 rcu_read_unlock(); 1347 1348 if (unlikely(blocked_rdev)) { 1349 /* Have to wait for this device to get unblocked, then retry */ 1350 allow_barrier(conf); 1351 raid10_log(conf->mddev, "%s wait rdev %d blocked", 1352 __func__, blocked_rdev->raid_disk); 1353 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1354 wait_barrier(conf, false); 1355 goto retry_wait; 1356 } 1357 } 1358 1359 static void raid10_write_request(struct mddev *mddev, struct bio *bio, 1360 struct r10bio *r10_bio) 1361 { 1362 struct r10conf *conf = mddev->private; 1363 int i; 1364 sector_t sectors; 1365 int max_sectors; 1366 1367 if ((mddev_is_clustered(mddev) && 1368 md_cluster_ops->area_resyncing(mddev, WRITE, 1369 bio->bi_iter.bi_sector, 1370 bio_end_sector(bio)))) { 1371 DEFINE_WAIT(w); 1372 /* Bail out if REQ_NOWAIT is set for the bio */ 1373 if (bio->bi_opf & REQ_NOWAIT) { 1374 bio_wouldblock_error(bio); 1375 return; 1376 } 1377 for (;;) { 1378 prepare_to_wait(&conf->wait_barrier, 1379 &w, TASK_IDLE); 1380 if (!md_cluster_ops->area_resyncing(mddev, WRITE, 1381 bio->bi_iter.bi_sector, bio_end_sector(bio))) 1382 break; 1383 schedule(); 1384 } 1385 finish_wait(&conf->wait_barrier, &w); 1386 } 1387 1388 sectors = r10_bio->sectors; 1389 if (!regular_request_wait(mddev, conf, bio, sectors)) 1390 return; 1391 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1392 (mddev->reshape_backwards 1393 ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && 1394 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) 1395 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && 1396 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1397 /* Need to update reshape_position in metadata */ 1398 mddev->reshape_position = conf->reshape_progress; 1399 set_mask_bits(&mddev->sb_flags, 0, 1400 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 1401 md_wakeup_thread(mddev->thread); 1402 if (bio->bi_opf & REQ_NOWAIT) { 1403 allow_barrier(conf); 1404 bio_wouldblock_error(bio); 1405 return; 1406 } 1407 raid10_log(conf->mddev, "wait reshape metadata"); 1408 wait_event(mddev->sb_wait, 1409 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 1410 1411 conf->reshape_safe = mddev->reshape_position; 1412 } 1413 1414 /* first select target devices under rcu_lock and 1415 * inc refcount on their rdev. Record them by setting 1416 * bios[x] to bio 1417 * If there are known/acknowledged bad blocks on any device 1418 * on which we have seen a write error, we want to avoid 1419 * writing to those blocks. This potentially requires several 1420 * writes to write around the bad blocks. Each set of writes 1421 * gets its own r10_bio with a set of bios attached. 
1422 */ 1423 1424 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1425 raid10_find_phys(conf, r10_bio); 1426 1427 wait_blocked_dev(mddev, r10_bio); 1428 1429 rcu_read_lock(); 1430 max_sectors = r10_bio->sectors; 1431 1432 for (i = 0; i < conf->copies; i++) { 1433 int d = r10_bio->devs[i].devnum; 1434 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1435 struct md_rdev *rrdev = rcu_dereference( 1436 conf->mirrors[d].replacement); 1437 if (rdev == rrdev) 1438 rrdev = NULL; 1439 if (rdev && (test_bit(Faulty, &rdev->flags))) 1440 rdev = NULL; 1441 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1442 rrdev = NULL; 1443 1444 r10_bio->devs[i].bio = NULL; 1445 r10_bio->devs[i].repl_bio = NULL; 1446 1447 if (!rdev && !rrdev) { 1448 set_bit(R10BIO_Degraded, &r10_bio->state); 1449 continue; 1450 } 1451 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1452 sector_t first_bad; 1453 sector_t dev_sector = r10_bio->devs[i].addr; 1454 int bad_sectors; 1455 int is_bad; 1456 1457 is_bad = is_badblock(rdev, dev_sector, max_sectors, 1458 &first_bad, &bad_sectors); 1459 if (is_bad && first_bad <= dev_sector) { 1460 /* Cannot write here at all */ 1461 bad_sectors -= (dev_sector - first_bad); 1462 if (bad_sectors < max_sectors) 1463 /* Mustn't write more than bad_sectors 1464 * to other devices yet 1465 */ 1466 max_sectors = bad_sectors; 1467 /* We don't set R10BIO_Degraded as that 1468 * only applies if the disk is missing, 1469 * so it might be re-added, and we want to 1470 * know to recover this chunk. 1471 * In this case the device is here, and the 1472 * fact that this chunk is not in-sync is 1473 * recorded in the bad block log. 
1474 */ 1475 continue; 1476 } 1477 if (is_bad) { 1478 int good_sectors = first_bad - dev_sector; 1479 if (good_sectors < max_sectors) 1480 max_sectors = good_sectors; 1481 } 1482 } 1483 if (rdev) { 1484 r10_bio->devs[i].bio = bio; 1485 atomic_inc(&rdev->nr_pending); 1486 } 1487 if (rrdev) { 1488 r10_bio->devs[i].repl_bio = bio; 1489 atomic_inc(&rrdev->nr_pending); 1490 } 1491 } 1492 rcu_read_unlock(); 1493 1494 if (max_sectors < r10_bio->sectors) 1495 r10_bio->sectors = max_sectors; 1496 1497 if (r10_bio->sectors < bio_sectors(bio)) { 1498 struct bio *split = bio_split(bio, r10_bio->sectors, 1499 GFP_NOIO, &conf->bio_split); 1500 bio_chain(split, bio); 1501 allow_barrier(conf); 1502 submit_bio_noacct(bio); 1503 wait_barrier(conf, false); 1504 bio = split; 1505 r10_bio->master_bio = bio; 1506 } 1507 1508 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1509 r10_bio->start_time = bio_start_io_acct(bio); 1510 atomic_set(&r10_bio->remaining, 1); 1511 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1512 1513 for (i = 0; i < conf->copies; i++) { 1514 if (r10_bio->devs[i].bio) 1515 raid10_write_one_disk(mddev, r10_bio, bio, false, i); 1516 if (r10_bio->devs[i].repl_bio) 1517 raid10_write_one_disk(mddev, r10_bio, bio, true, i); 1518 } 1519 one_write_done(r10_bio); 1520 } 1521 1522 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) 1523 { 1524 struct r10conf *conf = mddev->private; 1525 struct r10bio *r10_bio; 1526 1527 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1528 1529 r10_bio->master_bio = bio; 1530 r10_bio->sectors = sectors; 1531 1532 r10_bio->mddev = mddev; 1533 r10_bio->sector = bio->bi_iter.bi_sector; 1534 r10_bio->state = 0; 1535 r10_bio->read_slot = -1; 1536 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 1537 conf->geo.raid_disks); 1538 1539 if (bio_data_dir(bio) == READ) 1540 raid10_read_request(mddev, bio, r10_bio); 1541 else 1542 raid10_write_request(mddev, bio, r10_bio); 1543 } 1544 
1545 static void raid_end_discard_bio(struct r10bio *r10bio) 1546 { 1547 struct r10conf *conf = r10bio->mddev->private; 1548 struct r10bio *first_r10bio; 1549 1550 while (atomic_dec_and_test(&r10bio->remaining)) { 1551 1552 allow_barrier(conf); 1553 1554 if (!test_bit(R10BIO_Discard, &r10bio->state)) { 1555 first_r10bio = (struct r10bio *)r10bio->master_bio; 1556 free_r10bio(r10bio); 1557 r10bio = first_r10bio; 1558 } else { 1559 md_write_end(r10bio->mddev); 1560 bio_endio(r10bio->master_bio); 1561 free_r10bio(r10bio); 1562 break; 1563 } 1564 } 1565 } 1566 1567 static void raid10_end_discard_request(struct bio *bio) 1568 { 1569 struct r10bio *r10_bio = bio->bi_private; 1570 struct r10conf *conf = r10_bio->mddev->private; 1571 struct md_rdev *rdev = NULL; 1572 int dev; 1573 int slot, repl; 1574 1575 /* 1576 * We don't care the return value of discard bio 1577 */ 1578 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 1579 set_bit(R10BIO_Uptodate, &r10_bio->state); 1580 1581 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1582 if (repl) 1583 rdev = conf->mirrors[dev].replacement; 1584 if (!rdev) { 1585 /* 1586 * raid10_remove_disk uses smp_mb to make sure rdev is set to 1587 * replacement before setting replacement to NULL. It can read 1588 * rdev first without barrier protect even replacment is NULL 1589 */ 1590 smp_rmb(); 1591 rdev = conf->mirrors[dev].rdev; 1592 } 1593 1594 raid_end_discard_bio(r10_bio); 1595 rdev_dec_pending(rdev, conf->mddev); 1596 } 1597 1598 /* 1599 * There are some limitations to handle discard bio 1600 * 1st, the discard size is bigger than stripe_size*2. 
1601 * 2st, if the discard bio spans reshape progress, we use the old way to 1602 * handle discard bio 1603 */ 1604 static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) 1605 { 1606 struct r10conf *conf = mddev->private; 1607 struct geom *geo = &conf->geo; 1608 int far_copies = geo->far_copies; 1609 bool first_copy = true; 1610 struct r10bio *r10_bio, *first_r10bio; 1611 struct bio *split; 1612 int disk; 1613 sector_t chunk; 1614 unsigned int stripe_size; 1615 unsigned int stripe_data_disks; 1616 sector_t split_size; 1617 sector_t bio_start, bio_end; 1618 sector_t first_stripe_index, last_stripe_index; 1619 sector_t start_disk_offset; 1620 unsigned int start_disk_index; 1621 sector_t end_disk_offset; 1622 unsigned int end_disk_index; 1623 unsigned int remainder; 1624 1625 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1626 return -EAGAIN; 1627 1628 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { 1629 bio_wouldblock_error(bio); 1630 return 0; 1631 } 1632 wait_barrier(conf, false); 1633 1634 /* 1635 * Check reshape again to avoid reshape happens after checking 1636 * MD_RECOVERY_RESHAPE and before wait_barrier 1637 */ 1638 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1639 goto out; 1640 1641 if (geo->near_copies) 1642 stripe_data_disks = geo->raid_disks / geo->near_copies + 1643 geo->raid_disks % geo->near_copies; 1644 else 1645 stripe_data_disks = geo->raid_disks; 1646 1647 stripe_size = stripe_data_disks << geo->chunk_shift; 1648 1649 bio_start = bio->bi_iter.bi_sector; 1650 bio_end = bio_end_sector(bio); 1651 1652 /* 1653 * Maybe one discard bio is smaller than strip size or across one 1654 * stripe and discard region is larger than one stripe size. For far 1655 * offset layout, if the discard region is not aligned with stripe 1656 * size, there is hole when we submit discard bio to member disk. 
1657 * For simplicity, we only handle discard bio which discard region 1658 * is bigger than stripe_size * 2 1659 */ 1660 if (bio_sectors(bio) < stripe_size*2) 1661 goto out; 1662 1663 /* 1664 * Keep bio aligned with strip size. 1665 */ 1666 div_u64_rem(bio_start, stripe_size, &remainder); 1667 if (remainder) { 1668 split_size = stripe_size - remainder; 1669 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1670 bio_chain(split, bio); 1671 allow_barrier(conf); 1672 /* Resend the fist split part */ 1673 submit_bio_noacct(split); 1674 wait_barrier(conf, false); 1675 } 1676 div_u64_rem(bio_end, stripe_size, &remainder); 1677 if (remainder) { 1678 split_size = bio_sectors(bio) - remainder; 1679 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1680 bio_chain(split, bio); 1681 allow_barrier(conf); 1682 /* Resend the second split part */ 1683 submit_bio_noacct(bio); 1684 bio = split; 1685 wait_barrier(conf, false); 1686 } 1687 1688 bio_start = bio->bi_iter.bi_sector; 1689 bio_end = bio_end_sector(bio); 1690 1691 /* 1692 * Raid10 uses chunk as the unit to store data. It's similar like raid0. 1693 * One stripe contains the chunks from all member disk (one chunk from 1694 * one disk at the same HBA address). 
For layout detail, see 'man md 4' 1695 */ 1696 chunk = bio_start >> geo->chunk_shift; 1697 chunk *= geo->near_copies; 1698 first_stripe_index = chunk; 1699 start_disk_index = sector_div(first_stripe_index, geo->raid_disks); 1700 if (geo->far_offset) 1701 first_stripe_index *= geo->far_copies; 1702 start_disk_offset = (bio_start & geo->chunk_mask) + 1703 (first_stripe_index << geo->chunk_shift); 1704 1705 chunk = bio_end >> geo->chunk_shift; 1706 chunk *= geo->near_copies; 1707 last_stripe_index = chunk; 1708 end_disk_index = sector_div(last_stripe_index, geo->raid_disks); 1709 if (geo->far_offset) 1710 last_stripe_index *= geo->far_copies; 1711 end_disk_offset = (bio_end & geo->chunk_mask) + 1712 (last_stripe_index << geo->chunk_shift); 1713 1714 retry_discard: 1715 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1716 r10_bio->mddev = mddev; 1717 r10_bio->state = 0; 1718 r10_bio->sectors = 0; 1719 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); 1720 wait_blocked_dev(mddev, r10_bio); 1721 1722 /* 1723 * For far layout it needs more than one r10bio to cover all regions. 1724 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio 1725 * to record the discard bio. Other r10bio->master_bio record the first 1726 * r10bio. The first r10bio only release after all other r10bios finish. 1727 * The discard bio returns only first r10bio finishes 1728 */ 1729 if (first_copy) { 1730 r10_bio->master_bio = bio; 1731 set_bit(R10BIO_Discard, &r10_bio->state); 1732 first_copy = false; 1733 first_r10bio = r10_bio; 1734 } else 1735 r10_bio->master_bio = (struct bio *)first_r10bio; 1736 1737 /* 1738 * first select target devices under rcu_lock and 1739 * inc refcount on their rdev. 
Record them by setting 1740 * bios[x] to bio 1741 */ 1742 rcu_read_lock(); 1743 for (disk = 0; disk < geo->raid_disks; disk++) { 1744 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1745 struct md_rdev *rrdev = rcu_dereference( 1746 conf->mirrors[disk].replacement); 1747 1748 r10_bio->devs[disk].bio = NULL; 1749 r10_bio->devs[disk].repl_bio = NULL; 1750 1751 if (rdev && (test_bit(Faulty, &rdev->flags))) 1752 rdev = NULL; 1753 if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1754 rrdev = NULL; 1755 if (!rdev && !rrdev) 1756 continue; 1757 1758 if (rdev) { 1759 r10_bio->devs[disk].bio = bio; 1760 atomic_inc(&rdev->nr_pending); 1761 } 1762 if (rrdev) { 1763 r10_bio->devs[disk].repl_bio = bio; 1764 atomic_inc(&rrdev->nr_pending); 1765 } 1766 } 1767 rcu_read_unlock(); 1768 1769 atomic_set(&r10_bio->remaining, 1); 1770 for (disk = 0; disk < geo->raid_disks; disk++) { 1771 sector_t dev_start, dev_end; 1772 struct bio *mbio, *rbio = NULL; 1773 1774 /* 1775 * Now start to calculate the start and end address for each disk. 1776 * The space between dev_start and dev_end is the discard region. 1777 * 1778 * For dev_start, it needs to consider three conditions: 1779 * 1st, the disk is before start_disk, you can imagine the disk in 1780 * the next stripe. So the dev_start is the start address of next 1781 * stripe. 
1782 * 2st, the disk is after start_disk, it means the disk is at the 1783 * same stripe of first disk 1784 * 3st, the first disk itself, we can use start_disk_offset directly 1785 */ 1786 if (disk < start_disk_index) 1787 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; 1788 else if (disk > start_disk_index) 1789 dev_start = first_stripe_index * mddev->chunk_sectors; 1790 else 1791 dev_start = start_disk_offset; 1792 1793 if (disk < end_disk_index) 1794 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 1795 else if (disk > end_disk_index) 1796 dev_end = last_stripe_index * mddev->chunk_sectors; 1797 else 1798 dev_end = end_disk_offset; 1799 1800 /* 1801 * It only handles discard bio which size is >= stripe size, so 1802 * dev_end > dev_start all the time. 1803 * It doesn't need to use rcu lock to get rdev here. We already 1804 * add rdev->nr_pending in the first loop. 1805 */ 1806 if (r10_bio->devs[disk].bio) { 1807 struct md_rdev *rdev = conf->mirrors[disk].rdev; 1808 mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1809 &mddev->bio_set); 1810 mbio->bi_end_io = raid10_end_discard_request; 1811 mbio->bi_private = r10_bio; 1812 r10_bio->devs[disk].bio = mbio; 1813 r10_bio->devs[disk].devnum = disk; 1814 atomic_inc(&r10_bio->remaining); 1815 md_submit_discard_bio(mddev, rdev, mbio, 1816 dev_start + choose_data_offset(r10_bio, rdev), 1817 dev_end - dev_start); 1818 bio_endio(mbio); 1819 } 1820 if (r10_bio->devs[disk].repl_bio) { 1821 struct md_rdev *rrdev = conf->mirrors[disk].replacement; 1822 rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 1823 &mddev->bio_set); 1824 rbio->bi_end_io = raid10_end_discard_request; 1825 rbio->bi_private = r10_bio; 1826 r10_bio->devs[disk].repl_bio = rbio; 1827 r10_bio->devs[disk].devnum = disk; 1828 atomic_inc(&r10_bio->remaining); 1829 md_submit_discard_bio(mddev, rrdev, rbio, 1830 dev_start + choose_data_offset(r10_bio, rrdev), 1831 dev_end - dev_start); 1832 bio_endio(rbio); 1833 } 1834 } 1835 1836 if 
(!geo->far_offset && --far_copies) { 1837 first_stripe_index += geo->stride >> geo->chunk_shift; 1838 start_disk_offset += geo->stride; 1839 last_stripe_index += geo->stride >> geo->chunk_shift; 1840 end_disk_offset += geo->stride; 1841 atomic_inc(&first_r10bio->remaining); 1842 raid_end_discard_bio(r10_bio); 1843 wait_barrier(conf, false); 1844 goto retry_discard; 1845 } 1846 1847 raid_end_discard_bio(r10_bio); 1848 1849 return 0; 1850 out: 1851 allow_barrier(conf); 1852 return -EAGAIN; 1853 } 1854 1855 static bool raid10_make_request(struct mddev *mddev, struct bio *bio) 1856 { 1857 struct r10conf *conf = mddev->private; 1858 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1859 int chunk_sects = chunk_mask + 1; 1860 int sectors = bio_sectors(bio); 1861 1862 if (unlikely(bio->bi_opf & REQ_PREFLUSH) 1863 && md_flush_request(mddev, bio)) 1864 return true; 1865 1866 if (!md_write_start(mddev, bio)) 1867 return false; 1868 1869 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1870 if (!raid10_handle_discard(mddev, bio)) 1871 return true; 1872 1873 /* 1874 * If this request crosses a chunk boundary, we need to split 1875 * it. 
1876 */ 1877 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + 1878 sectors > chunk_sects 1879 && (conf->geo.near_copies < conf->geo.raid_disks 1880 || conf->prev.near_copies < 1881 conf->prev.raid_disks))) 1882 sectors = chunk_sects - 1883 (bio->bi_iter.bi_sector & 1884 (chunk_sects - 1)); 1885 __make_request(mddev, bio, sectors); 1886 1887 /* In case raid10d snuck in to freeze_array */ 1888 wake_up(&conf->wait_barrier); 1889 return true; 1890 } 1891 1892 static void raid10_status(struct seq_file *seq, struct mddev *mddev) 1893 { 1894 struct r10conf *conf = mddev->private; 1895 int i; 1896 1897 if (conf->geo.near_copies < conf->geo.raid_disks) 1898 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1899 if (conf->geo.near_copies > 1) 1900 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1901 if (conf->geo.far_copies > 1) { 1902 if (conf->geo.far_offset) 1903 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1904 else 1905 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1906 if (conf->geo.far_set_size != conf->geo.raid_disks) 1907 seq_printf(seq, " %d devices per set", conf->geo.far_set_size); 1908 } 1909 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1910 conf->geo.raid_disks - mddev->degraded); 1911 rcu_read_lock(); 1912 for (i = 0; i < conf->geo.raid_disks; i++) { 1913 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1914 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 1915 } 1916 rcu_read_unlock(); 1917 seq_printf(seq, "]"); 1918 } 1919 1920 /* check if there are enough drives for 1921 * every block to appear on atleast one. 1922 * Don't consider the device numbered 'ignore' 1923 * as we might be about to remove it. 
1924 */ 1925 static int _enough(struct r10conf *conf, int previous, int ignore) 1926 { 1927 int first = 0; 1928 int has_enough = 0; 1929 int disks, ncopies; 1930 if (previous) { 1931 disks = conf->prev.raid_disks; 1932 ncopies = conf->prev.near_copies; 1933 } else { 1934 disks = conf->geo.raid_disks; 1935 ncopies = conf->geo.near_copies; 1936 } 1937 1938 rcu_read_lock(); 1939 do { 1940 int n = conf->copies; 1941 int cnt = 0; 1942 int this = first; 1943 while (n--) { 1944 struct md_rdev *rdev; 1945 if (this != ignore && 1946 (rdev = rcu_dereference(conf->mirrors[this].rdev)) && 1947 test_bit(In_sync, &rdev->flags)) 1948 cnt++; 1949 this = (this+1) % disks; 1950 } 1951 if (cnt == 0) 1952 goto out; 1953 first = (first + ncopies) % disks; 1954 } while (first != 0); 1955 has_enough = 1; 1956 out: 1957 rcu_read_unlock(); 1958 return has_enough; 1959 } 1960 1961 static int enough(struct r10conf *conf, int ignore) 1962 { 1963 /* when calling 'enough', both 'prev' and 'geo' must 1964 * be stable. 1965 * This is ensured if ->reconfig_mutex or ->device_lock 1966 * is held. 1967 */ 1968 return _enough(conf, 0, ignore) && 1969 _enough(conf, 1, ignore); 1970 } 1971 1972 /** 1973 * raid10_error() - RAID10 error handler. 1974 * @mddev: affected md device. 1975 * @rdev: member device to fail. 1976 * 1977 * The routine acknowledges &rdev failure and determines new @mddev state. 1978 * If it failed, then: 1979 * - &MD_BROKEN flag is set in &mddev->flags. 1980 * Otherwise, it must be degraded: 1981 * - recovery is interrupted. 1982 * - &mddev->degraded is bumped. 1983 1984 * @rdev is marked as &Faulty excluding case when array is failed and 1985 * &mddev->fail_last_dev is off. 
 */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	unsigned long flags;

	/* All state changes below are made under conf->device_lock. */
	spin_lock_irqsave(&conf->device_lock, flags);

	/*
	 * rdev is still In_sync but enough() says no when asked to ignore
	 * its slot: flag the array MD_BROKEN and, unless fail_last_dev is
	 * set, return without marking the device Faulty.
	 */
	if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
		set_bit(MD_BROKEN, &mddev->flags);

		if (!mddev->fail_last_dev) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			return;
		}
	}
	/* Only a device that was In_sync adds to the degraded count. */
	if (test_and_clear_bit(In_sync, &rdev->flags))
		mddev->degraded++;

	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	spin_unlock_irqrestore(&conf->device_lock, flags);
	pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
		"md/raid10:%s: Operation continuing on %d devices.\n",
		mdname(mddev), rdev->bdev,
		mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

/*
 * Emit the array configuration via pr_debug(): the working/total device
 * counts followed by one line per populated slot in conf->mirrors[].
 * A NULL @conf is reported and otherwise ignored.
 */
static void print_conf(struct r10conf *conf)
{
	int i;
	struct md_rdev *rdev;

	pr_debug("RAID10 conf printout:\n");
	if (!conf) {
		pr_debug("(!conf)\n");
		return;
	}
	pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
		 conf->geo.raid_disks);

	/* This is only called with ->reconfig_mutex held, so
	 * rcu protection of rdev is not needed */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		rdev = conf->mirrors[i].rdev;
		if (rdev)
			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
				 i, !test_bit(In_sync, &rdev->flags),
				 !test_bit(Faulty, &rdev->flags),
				 rdev->bdev);
	}
}

/*
 * Finish a sync pass: go once through the barrier (wait_barrier() followed
 * immediately by allow_barrier()) and then release the resync buffer pool.
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf, false);
	allow_barrier(conf);

	mempool_exit(&conf->r10buf_pool);
}

/*
 * Promote fully recovered devices to In_sync.
 *
 * A device qualifies when its recovery_offset is MaxSector, it is not
 * Faulty and In_sync was not already set.  mddev->degraded is reduced by
 * the number of slots that became active, and that number is returned.
 */
static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active */
			/*
			 * The slot only counts as newly active when the
			 * original device was absent or was not In_sync;
			 * note that In_sync is cleared on the original here.
			 */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && tmp->rdev->recovery_offset == MaxSector
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	/* mddev->degraded is only modified under device_lock. */
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}

/*
 * Hot-add @rdev to the array, either into an empty slot or as the
 * replacement for a device flagged WantReplacement.
 *
 * Returns 0 on success, -EBUSY while the array itself still needs resync,
 * -EINVAL when _enough() fails for a device with no saved slot, -ENXIO
 * when md_integrity_add_rdev() rejects the device, and -EEXIST when no
 * usable slot was found.
 */
static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror;
	int first = 0;
	int last = conf->geo.raid_disks - 1;

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
		return -EINVAL;

	if (md_integrity_add_rdev(rdev, mddev))
		return -ENXIO;

	/* A caller-assigned slot restricts the search to that one slot. */
	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/* Start at the slot the device used to occupy when it is still free. */
	if (rdev->saved_raid_disk >= first &&
	    rdev->saved_raid_disk < conf->geo.raid_disks &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		struct raid10_info *p = &conf->mirrors[mirror];
		/* Skip slots whose recovery_disabled matches the array's. */
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			/*
			 * Occupied slot: usable only as a replacement, i.e.
			 * the occupant wants one and has none yet.
			 */
			if (!test_bit(WantReplacement, &p->rdev->flags) ||
			    p->replacement != NULL)
				continue;
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = mirror;
			err = 0;
			if (mddev->gendisk)
				disk_stack_limits(mddev->gendisk, rdev->bdev,
						  rdev->data_offset << 9);
			conf->fullsync = 1;
			/* Publish only after rdev is fully set up. */
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}

		if (mddev->gendisk)
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);

		p->head_position = 0;
		/* Make the slot differ from mddev->recovery_disabled. */
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		/* Landing in a different slot than before forces a full sync. */
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		/* Publish only after rdev is fully set up. */
		rcu_assign_pointer(p->rdev, rdev);
		break;
	}

	print_conf(conf);
	return err;
}

/*
 * Detach @rdev from its slot, whether it is the slot's main device or its
 * replacement.
 *
 * Returns 0 when @rdev is not attached to the slot it names (or the slot
 * number is out of range), -EBUSY when the device is In_sync, still has
 * pending I/O, or is a healthy device that recovery could still use, and
 * otherwise the result of md_integrity_register().
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p;

	print_conf(conf);
	if (unlikely(number >= mddev->raid_disks))
		return 0;
	p = conf->mirrors + number;
	/* Work out which of the slot's two pointers refers to rdev. */
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	/*
	 * Unless RemoveSynchronized is set, wait for an RCU grace period and
	 * re-check nr_pending; a reader that took a reference in the
	 * meantime means the pointer must be restored.
	 */
	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			*rdevp = rdev;
			goto abort;
		}
	}
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither -- if they are careful.
			   */
		p->replacement = NULL;
	}

	clear_bit(WantReplacement, &rdev->flags);
	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}

/*
 * Common completion handling for a resync/recovery/reshape read issued to
 * the main device of mirror slot @d.
 */
static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
	struct r10conf *conf = r10_bio->mddev->private;

	if (!bio->bi_status)
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}

/* bi_end_io for resync/recovery reads: look up the mirror slot, then finish. */
static void end_sync_read(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct r10conf *conf = r10_bio->mddev->private;
	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	__end_sync_read(r10_bio, bio, d);
}

/* bi_end_io for reshape reads; r10_bio->read_slot is passed as the slot. */
static void end_reshape_read(struct bio *bio)
{
	/* reshape read bio isn't allocated from r10buf_pool */
	struct r10bio *r10_bio = bio->bi_private;

	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
}

/*
 * Drop one reference on @r10_bio->remaining.  Each r10_bio that reaches
 * zero is either handed to raid10d (when R10BIO_MadeGood or
 * R10BIO_WriteError is set) or released with put_buf(), and the walk then
 * continues with the r10_bio linked through ->master_bio.  When the r10_bio
 * with a NULL master_bio completes, md_done_sync() is called.
 */
static void end_sync_request(struct r10bio *r10_bio)
{
	struct mddev *mddev = r10_bio->mddev;

	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			/* Saved first: r10_bio is handed off below. */
			sector_t s = r10_bio->sectors;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			md_done_sync(mddev, s, 1);
			break;
		} else {
			/* master_bio is borrowed to chain r10_bios together. */
			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
}

/*
 * bi_end_io for resync/recovery writes.  A failed write to a replacement
 * fails that device via md_error(); a failed write to the main device sets
 * WriteErrorSeen/WantReplacement and R10BIO_WriteError.  A successful write
 * over a range that is_badblock() still reports sets R10BIO_MadeGood.
 */
static void end_sync_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	sector_t first_bad;
	int bad_sectors;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	else
		rdev = conf->mirrors[d].rdev;

	if (bio->bi_status) {
		if (repl)
			md_error(mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else if (is_badblock(rdev,
			     r10_bio->devs[slot].addr,
			     r10_bio->sectors,
			     &first_bad, &bad_sectors))
		set_bit(R10BIO_MadeGood, &r10_bio->state);

	rdev_dec_pending(rdev, mddev);

	end_sync_request(r10_bio);
}

/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
2348 */ 2349 /* 2350 * We check if all blocks are in-sync and only write to blocks that 2351 * aren't in sync 2352 */ 2353 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2354 { 2355 struct r10conf *conf = mddev->private; 2356 int i, first; 2357 struct bio *tbio, *fbio; 2358 int vcnt; 2359 struct page **tpages, **fpages; 2360 2361 atomic_set(&r10_bio->remaining, 1); 2362 2363 /* find the first device with a block */ 2364 for (i=0; i<conf->copies; i++) 2365 if (!r10_bio->devs[i].bio->bi_status) 2366 break; 2367 2368 if (i == conf->copies) 2369 goto done; 2370 2371 first = i; 2372 fbio = r10_bio->devs[i].bio; 2373 fbio->bi_iter.bi_size = r10_bio->sectors << 9; 2374 fbio->bi_iter.bi_idx = 0; 2375 fpages = get_resync_pages(fbio)->pages; 2376 2377 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 2378 /* now find blocks with errors */ 2379 for (i=0 ; i < conf->copies ; i++) { 2380 int j, d; 2381 struct md_rdev *rdev; 2382 struct resync_pages *rp; 2383 2384 tbio = r10_bio->devs[i].bio; 2385 2386 if (tbio->bi_end_io != end_sync_read) 2387 continue; 2388 if (i == first) 2389 continue; 2390 2391 tpages = get_resync_pages(tbio)->pages; 2392 d = r10_bio->devs[i].devnum; 2393 rdev = conf->mirrors[d].rdev; 2394 if (!r10_bio->devs[i].bio->bi_status) { 2395 /* We know that the bi_io_vec layout is the same for 2396 * both 'first' and 'i', so we just compare them. 2397 * All vec entries are PAGE_SIZE; 2398 */ 2399 int sectors = r10_bio->sectors; 2400 for (j = 0; j < vcnt; j++) { 2401 int len = PAGE_SIZE; 2402 if (sectors < (len / 512)) 2403 len = sectors * 512; 2404 if (memcmp(page_address(fpages[j]), 2405 page_address(tpages[j]), 2406 len)) 2407 break; 2408 sectors -= len/512; 2409 } 2410 if (j == vcnt) 2411 continue; 2412 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2413 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2414 /* Don't fix anything. 
*/ 2415 continue; 2416 } else if (test_bit(FailFast, &rdev->flags)) { 2417 /* Just give up on this device */ 2418 md_error(rdev->mddev, rdev); 2419 continue; 2420 } 2421 /* Ok, we need to write this bio, either to correct an 2422 * inconsistency or to correct an unreadable block. 2423 * First we need to fixup bv_offset, bv_len and 2424 * bi_vecs, as the read request might have corrupted these 2425 */ 2426 rp = get_resync_pages(tbio); 2427 bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); 2428 2429 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); 2430 2431 rp->raid_bio = r10_bio; 2432 tbio->bi_private = rp; 2433 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; 2434 tbio->bi_end_io = end_sync_write; 2435 2436 bio_copy_data(tbio, fbio); 2437 2438 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2439 atomic_inc(&r10_bio->remaining); 2440 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2441 2442 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) 2443 tbio->bi_opf |= MD_FAILFAST; 2444 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; 2445 submit_bio_noacct(tbio); 2446 } 2447 2448 /* Now write out to any replacement devices 2449 * that are active 2450 */ 2451 for (i = 0; i < conf->copies; i++) { 2452 int d; 2453 2454 tbio = r10_bio->devs[i].repl_bio; 2455 if (!tbio || !tbio->bi_end_io) 2456 continue; 2457 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 2458 && r10_bio->devs[i].bio != fbio) 2459 bio_copy_data(tbio, fbio); 2460 d = r10_bio->devs[i].devnum; 2461 atomic_inc(&r10_bio->remaining); 2462 md_sync_acct(conf->mirrors[d].replacement->bdev, 2463 bio_sectors(tbio)); 2464 submit_bio_noacct(tbio); 2465 } 2466 2467 done: 2468 if (atomic_dec_and_test(&r10_bio->remaining)) { 2469 md_done_sync(mddev, r10_bio->sectors, 1); 2470 put_buf(r10_bio); 2471 } 2472 } 2473 2474 /* 2475 * Now for the recovery code. 2476 * Recovery happens across physical sectors. 
2477 * We recover all non-is_sync drives by finding the virtual address of 2478 * each, and then choose a working drive that also has that virt address. 2479 * There is a separate r10_bio for each non-in_sync drive. 2480 * Only the first two slots are in use. The first for reading, 2481 * The second for writing. 2482 * 2483 */ 2484 static void fix_recovery_read_error(struct r10bio *r10_bio) 2485 { 2486 /* We got a read error during recovery. 2487 * We repeat the read in smaller page-sized sections. 2488 * If a read succeeds, write it to the new device or record 2489 * a bad block if we cannot. 2490 * If a read fails, record a bad block on both old and 2491 * new devices. 2492 */ 2493 struct mddev *mddev = r10_bio->mddev; 2494 struct r10conf *conf = mddev->private; 2495 struct bio *bio = r10_bio->devs[0].bio; 2496 sector_t sect = 0; 2497 int sectors = r10_bio->sectors; 2498 int idx = 0; 2499 int dr = r10_bio->devs[0].devnum; 2500 int dw = r10_bio->devs[1].devnum; 2501 struct page **pages = get_resync_pages(bio)->pages; 2502 2503 while (sectors) { 2504 int s = sectors; 2505 struct md_rdev *rdev; 2506 sector_t addr; 2507 int ok; 2508 2509 if (s > (PAGE_SIZE>>9)) 2510 s = PAGE_SIZE >> 9; 2511 2512 rdev = conf->mirrors[dr].rdev; 2513 addr = r10_bio->devs[0].addr + sect, 2514 ok = sync_page_io(rdev, 2515 addr, 2516 s << 9, 2517 pages[idx], 2518 REQ_OP_READ, false); 2519 if (ok) { 2520 rdev = conf->mirrors[dw].rdev; 2521 addr = r10_bio->devs[1].addr + sect; 2522 ok = sync_page_io(rdev, 2523 addr, 2524 s << 9, 2525 pages[idx], 2526 REQ_OP_WRITE, false); 2527 if (!ok) { 2528 set_bit(WriteErrorSeen, &rdev->flags); 2529 if (!test_and_set_bit(WantReplacement, 2530 &rdev->flags)) 2531 set_bit(MD_RECOVERY_NEEDED, 2532 &rdev->mddev->recovery); 2533 } 2534 } 2535 if (!ok) { 2536 /* We don't worry if we cannot set a bad block - 2537 * it really is bad so there is no loss in not 2538 * recording it yet 2539 */ 2540 rdev_set_badblocks(rdev, addr, s, 0); 2541 2542 if (rdev != 
conf->mirrors[dw].rdev) { 2543 /* need bad block on destination too */ 2544 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2545 addr = r10_bio->devs[1].addr + sect; 2546 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2547 if (!ok) { 2548 /* just abort the recovery */ 2549 pr_notice("md/raid10:%s: recovery aborted due to read error\n", 2550 mdname(mddev)); 2551 2552 conf->mirrors[dw].recovery_disabled 2553 = mddev->recovery_disabled; 2554 set_bit(MD_RECOVERY_INTR, 2555 &mddev->recovery); 2556 break; 2557 } 2558 } 2559 } 2560 2561 sectors -= s; 2562 sect += s; 2563 idx++; 2564 } 2565 } 2566 2567 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2568 { 2569 struct r10conf *conf = mddev->private; 2570 int d; 2571 struct bio *wbio, *wbio2; 2572 2573 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2574 fix_recovery_read_error(r10_bio); 2575 end_sync_request(r10_bio); 2576 return; 2577 } 2578 2579 /* 2580 * share the pages with the first bio 2581 * and submit the write request 2582 */ 2583 d = r10_bio->devs[1].devnum; 2584 wbio = r10_bio->devs[1].bio; 2585 wbio2 = r10_bio->devs[1].repl_bio; 2586 /* Need to test wbio2->bi_end_io before we call 2587 * submit_bio_noacct as if the former is NULL, 2588 * the latter is free to free wbio2. 2589 */ 2590 if (wbio2 && !wbio2->bi_end_io) 2591 wbio2 = NULL; 2592 if (wbio->bi_end_io) { 2593 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2594 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 2595 submit_bio_noacct(wbio); 2596 } 2597 if (wbio2) { 2598 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2599 md_sync_acct(conf->mirrors[d].replacement->bdev, 2600 bio_sectors(wbio2)); 2601 submit_bio_noacct(wbio2); 2602 } 2603 } 2604 2605 /* 2606 * Used by fix_read_error() to decay the per rdev read_errors. 2607 * We halve the read error count for every hour that has elapsed 2608 * since the last recorded read error. 
2609 * 2610 */ 2611 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2612 { 2613 long cur_time_mon; 2614 unsigned long hours_since_last; 2615 unsigned int read_errors = atomic_read(&rdev->read_errors); 2616 2617 cur_time_mon = ktime_get_seconds(); 2618 2619 if (rdev->last_read_error == 0) { 2620 /* first time we've seen a read error */ 2621 rdev->last_read_error = cur_time_mon; 2622 return; 2623 } 2624 2625 hours_since_last = (long)(cur_time_mon - 2626 rdev->last_read_error) / 3600; 2627 2628 rdev->last_read_error = cur_time_mon; 2629 2630 /* 2631 * if hours_since_last is > the number of bits in read_errors 2632 * just set read errors to 0. We do this to avoid 2633 * overflowing the shift of read_errors by hours_since_last. 2634 */ 2635 if (hours_since_last >= 8 * sizeof(read_errors)) 2636 atomic_set(&rdev->read_errors, 0); 2637 else 2638 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2639 } 2640 2641 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2642 int sectors, struct page *page, int rw) 2643 { 2644 sector_t first_bad; 2645 int bad_sectors; 2646 2647 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2648 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) 2649 return -1; 2650 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 2651 /* success */ 2652 return 1; 2653 if (rw == WRITE) { 2654 set_bit(WriteErrorSeen, &rdev->flags); 2655 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2656 set_bit(MD_RECOVERY_NEEDED, 2657 &rdev->mddev->recovery); 2658 } 2659 /* need to record an error - either for the block or the device */ 2660 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2661 md_error(rdev->mddev, rdev); 2662 return 0; 2663 } 2664 2665 /* 2666 * This is a kernel thread which: 2667 * 2668 * 1. Retries failed read operations on working mirrors. 2669 * 2. Updates the raid superblock when problems encounter. 2670 * 3. 
Performs writes following reads for array synchronising. 2671 */ 2672 2673 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2674 { 2675 int sect = 0; /* Offset from r10_bio->sector */ 2676 int sectors = r10_bio->sectors; 2677 struct md_rdev *rdev; 2678 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2679 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2680 2681 /* still own a reference to this rdev, so it cannot 2682 * have been cleared recently. 2683 */ 2684 rdev = conf->mirrors[d].rdev; 2685 2686 if (test_bit(Faulty, &rdev->flags)) 2687 /* drive has already been failed, just ignore any 2688 more fix_read_error() attempts */ 2689 return; 2690 2691 check_decay_read_errors(mddev, rdev); 2692 atomic_inc(&rdev->read_errors); 2693 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2694 pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", 2695 mdname(mddev), rdev->bdev, 2696 atomic_read(&rdev->read_errors), max_read_errors); 2697 pr_notice("md/raid10:%s: %pg: Failing raid device\n", 2698 mdname(mddev), rdev->bdev); 2699 md_error(mddev, rdev); 2700 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2701 return; 2702 } 2703 2704 while(sectors) { 2705 int s = sectors; 2706 int sl = r10_bio->read_slot; 2707 int success = 0; 2708 int start; 2709 2710 if (s > (PAGE_SIZE>>9)) 2711 s = PAGE_SIZE >> 9; 2712 2713 rcu_read_lock(); 2714 do { 2715 sector_t first_bad; 2716 int bad_sectors; 2717 2718 d = r10_bio->devs[sl].devnum; 2719 rdev = rcu_dereference(conf->mirrors[d].rdev); 2720 if (rdev && 2721 test_bit(In_sync, &rdev->flags) && 2722 !test_bit(Faulty, &rdev->flags) && 2723 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2724 &first_bad, &bad_sectors) == 0) { 2725 atomic_inc(&rdev->nr_pending); 2726 rcu_read_unlock(); 2727 success = sync_page_io(rdev, 2728 r10_bio->devs[sl].addr + 2729 sect, 2730 s<<9, 2731 conf->tmppage, 2732 REQ_OP_READ, false); 2733 
rdev_dec_pending(rdev, mddev); 2734 rcu_read_lock(); 2735 if (success) 2736 break; 2737 } 2738 sl++; 2739 if (sl == conf->copies) 2740 sl = 0; 2741 } while (!success && sl != r10_bio->read_slot); 2742 rcu_read_unlock(); 2743 2744 if (!success) { 2745 /* Cannot read from anywhere, just mark the block 2746 * as bad on the first device to discourage future 2747 * reads. 2748 */ 2749 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2750 rdev = conf->mirrors[dn].rdev; 2751 2752 if (!rdev_set_badblocks( 2753 rdev, 2754 r10_bio->devs[r10_bio->read_slot].addr 2755 + sect, 2756 s, 0)) { 2757 md_error(mddev, rdev); 2758 r10_bio->devs[r10_bio->read_slot].bio 2759 = IO_BLOCKED; 2760 } 2761 break; 2762 } 2763 2764 start = sl; 2765 /* write it back and re-read */ 2766 rcu_read_lock(); 2767 while (sl != r10_bio->read_slot) { 2768 if (sl==0) 2769 sl = conf->copies; 2770 sl--; 2771 d = r10_bio->devs[sl].devnum; 2772 rdev = rcu_dereference(conf->mirrors[d].rdev); 2773 if (!rdev || 2774 test_bit(Faulty, &rdev->flags) || 2775 !test_bit(In_sync, &rdev->flags)) 2776 continue; 2777 2778 atomic_inc(&rdev->nr_pending); 2779 rcu_read_unlock(); 2780 if (r10_sync_page_io(rdev, 2781 r10_bio->devs[sl].addr + 2782 sect, 2783 s, conf->tmppage, WRITE) 2784 == 0) { 2785 /* Well, this device is dead */ 2786 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", 2787 mdname(mddev), s, 2788 (unsigned long long)( 2789 sect + 2790 choose_data_offset(r10_bio, 2791 rdev)), 2792 rdev->bdev); 2793 pr_notice("md/raid10:%s: %pg: failing drive\n", 2794 mdname(mddev), 2795 rdev->bdev); 2796 } 2797 rdev_dec_pending(rdev, mddev); 2798 rcu_read_lock(); 2799 } 2800 sl = start; 2801 while (sl != r10_bio->read_slot) { 2802 if (sl==0) 2803 sl = conf->copies; 2804 sl--; 2805 d = r10_bio->devs[sl].devnum; 2806 rdev = rcu_dereference(conf->mirrors[d].rdev); 2807 if (!rdev || 2808 test_bit(Faulty, &rdev->flags) || 2809 !test_bit(In_sync, &rdev->flags)) 2810 continue; 2811 2812 
atomic_inc(&rdev->nr_pending); 2813 rcu_read_unlock(); 2814 switch (r10_sync_page_io(rdev, 2815 r10_bio->devs[sl].addr + 2816 sect, 2817 s, conf->tmppage, 2818 READ)) { 2819 case 0: 2820 /* Well, this device is dead */ 2821 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", 2822 mdname(mddev), s, 2823 (unsigned long long)( 2824 sect + 2825 choose_data_offset(r10_bio, rdev)), 2826 rdev->bdev); 2827 pr_notice("md/raid10:%s: %pg: failing drive\n", 2828 mdname(mddev), 2829 rdev->bdev); 2830 break; 2831 case 1: 2832 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", 2833 mdname(mddev), s, 2834 (unsigned long long)( 2835 sect + 2836 choose_data_offset(r10_bio, rdev)), 2837 rdev->bdev); 2838 atomic_add(s, &rdev->corrected_errors); 2839 } 2840 2841 rdev_dec_pending(rdev, mddev); 2842 rcu_read_lock(); 2843 } 2844 rcu_read_unlock(); 2845 2846 sectors -= s; 2847 sect += s; 2848 } 2849 } 2850 2851 static int narrow_write_error(struct r10bio *r10_bio, int i) 2852 { 2853 struct bio *bio = r10_bio->master_bio; 2854 struct mddev *mddev = r10_bio->mddev; 2855 struct r10conf *conf = mddev->private; 2856 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2857 /* bio has the data to be written to slot 'i' where 2858 * we just recently had a write error. 2859 * We repeatedly clone the bio and trim down to one block, 2860 * then try the write. Where the write fails we record 2861 * a bad block. 2862 * It is conceivable that the bio doesn't exactly align with 2863 * blocks. We must handle this. 2864 * 2865 * We currently own a reference to the rdev. 
2866 */ 2867 2868 int block_sectors; 2869 sector_t sector; 2870 int sectors; 2871 int sect_to_write = r10_bio->sectors; 2872 int ok = 1; 2873 2874 if (rdev->badblocks.shift < 0) 2875 return 0; 2876 2877 block_sectors = roundup(1 << rdev->badblocks.shift, 2878 bdev_logical_block_size(rdev->bdev) >> 9); 2879 sector = r10_bio->sector; 2880 sectors = ((r10_bio->sector + block_sectors) 2881 & ~(sector_t)(block_sectors - 1)) 2882 - sector; 2883 2884 while (sect_to_write) { 2885 struct bio *wbio; 2886 sector_t wsector; 2887 if (sectors > sect_to_write) 2888 sectors = sect_to_write; 2889 /* Write at 'sector' for 'sectors' */ 2890 wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 2891 &mddev->bio_set); 2892 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); 2893 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); 2894 wbio->bi_iter.bi_sector = wsector + 2895 choose_data_offset(r10_bio, rdev); 2896 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 2897 2898 if (submit_bio_wait(wbio) < 0) 2899 /* Failure! */ 2900 ok = rdev_set_badblocks(rdev, wsector, 2901 sectors, 0) 2902 && ok; 2903 2904 bio_put(wbio); 2905 sect_to_write -= sectors; 2906 sector += sectors; 2907 sectors = block_sectors; 2908 } 2909 return ok; 2910 } 2911 2912 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2913 { 2914 int slot = r10_bio->read_slot; 2915 struct bio *bio; 2916 struct r10conf *conf = mddev->private; 2917 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2918 2919 /* we got a read error. Maybe the drive is bad. Maybe just 2920 * the block and we can fix it. 2921 * We freeze all other IO, and try reading the block from 2922 * other devices. When we find one, we re-write 2923 * and check it that fixes the read error. 2924 * This is all done synchronously while the array is 2925 * frozen. 
2926 */ 2927 bio = r10_bio->devs[slot].bio; 2928 bio_put(bio); 2929 r10_bio->devs[slot].bio = NULL; 2930 2931 if (mddev->ro) 2932 r10_bio->devs[slot].bio = IO_BLOCKED; 2933 else if (!test_bit(FailFast, &rdev->flags)) { 2934 freeze_array(conf, 1); 2935 fix_read_error(conf, mddev, r10_bio); 2936 unfreeze_array(conf); 2937 } else 2938 md_error(mddev, rdev); 2939 2940 rdev_dec_pending(rdev, mddev); 2941 allow_barrier(conf); 2942 r10_bio->state = 0; 2943 raid10_read_request(mddev, r10_bio->master_bio, r10_bio); 2944 } 2945 2946 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2947 { 2948 /* Some sort of write request has finished and it 2949 * succeeded in writing where we thought there was a 2950 * bad block. So forget the bad block. 2951 * Or possibly if failed and we need to record 2952 * a bad block. 2953 */ 2954 int m; 2955 struct md_rdev *rdev; 2956 2957 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2958 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2959 for (m = 0; m < conf->copies; m++) { 2960 int dev = r10_bio->devs[m].devnum; 2961 rdev = conf->mirrors[dev].rdev; 2962 if (r10_bio->devs[m].bio == NULL || 2963 r10_bio->devs[m].bio->bi_end_io == NULL) 2964 continue; 2965 if (!r10_bio->devs[m].bio->bi_status) { 2966 rdev_clear_badblocks( 2967 rdev, 2968 r10_bio->devs[m].addr, 2969 r10_bio->sectors, 0); 2970 } else { 2971 if (!rdev_set_badblocks( 2972 rdev, 2973 r10_bio->devs[m].addr, 2974 r10_bio->sectors, 0)) 2975 md_error(conf->mddev, rdev); 2976 } 2977 rdev = conf->mirrors[dev].replacement; 2978 if (r10_bio->devs[m].repl_bio == NULL || 2979 r10_bio->devs[m].repl_bio->bi_end_io == NULL) 2980 continue; 2981 2982 if (!r10_bio->devs[m].repl_bio->bi_status) { 2983 rdev_clear_badblocks( 2984 rdev, 2985 r10_bio->devs[m].addr, 2986 r10_bio->sectors, 0); 2987 } else { 2988 if (!rdev_set_badblocks( 2989 rdev, 2990 r10_bio->devs[m].addr, 2991 r10_bio->sectors, 0)) 2992 md_error(conf->mddev, rdev); 2993 } 2994 } 2995 put_buf(r10_bio); 2996 
} else { 2997 bool fail = false; 2998 for (m = 0; m < conf->copies; m++) { 2999 int dev = r10_bio->devs[m].devnum; 3000 struct bio *bio = r10_bio->devs[m].bio; 3001 rdev = conf->mirrors[dev].rdev; 3002 if (bio == IO_MADE_GOOD) { 3003 rdev_clear_badblocks( 3004 rdev, 3005 r10_bio->devs[m].addr, 3006 r10_bio->sectors, 0); 3007 rdev_dec_pending(rdev, conf->mddev); 3008 } else if (bio != NULL && bio->bi_status) { 3009 fail = true; 3010 if (!narrow_write_error(r10_bio, m)) { 3011 md_error(conf->mddev, rdev); 3012 set_bit(R10BIO_Degraded, 3013 &r10_bio->state); 3014 } 3015 rdev_dec_pending(rdev, conf->mddev); 3016 } 3017 bio = r10_bio->devs[m].repl_bio; 3018 rdev = conf->mirrors[dev].replacement; 3019 if (rdev && bio == IO_MADE_GOOD) { 3020 rdev_clear_badblocks( 3021 rdev, 3022 r10_bio->devs[m].addr, 3023 r10_bio->sectors, 0); 3024 rdev_dec_pending(rdev, conf->mddev); 3025 } 3026 } 3027 if (fail) { 3028 spin_lock_irq(&conf->device_lock); 3029 list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 3030 conf->nr_queued++; 3031 spin_unlock_irq(&conf->device_lock); 3032 /* 3033 * In case freeze_array() is waiting for condition 3034 * nr_pending == nr_queued + extra to be true. 
3035 */ 3036 wake_up(&conf->wait_barrier); 3037 md_wakeup_thread(conf->mddev->thread); 3038 } else { 3039 if (test_bit(R10BIO_WriteError, 3040 &r10_bio->state)) 3041 close_write(r10_bio); 3042 raid_end_bio_io(r10_bio); 3043 } 3044 } 3045 } 3046 3047 static void raid10d(struct md_thread *thread) 3048 { 3049 struct mddev *mddev = thread->mddev; 3050 struct r10bio *r10_bio; 3051 unsigned long flags; 3052 struct r10conf *conf = mddev->private; 3053 struct list_head *head = &conf->retry_list; 3054 struct blk_plug plug; 3055 3056 md_check_recovery(mddev); 3057 3058 if (!list_empty_careful(&conf->bio_end_io_list) && 3059 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3060 LIST_HEAD(tmp); 3061 spin_lock_irqsave(&conf->device_lock, flags); 3062 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 3063 while (!list_empty(&conf->bio_end_io_list)) { 3064 list_move(conf->bio_end_io_list.prev, &tmp); 3065 conf->nr_queued--; 3066 } 3067 } 3068 spin_unlock_irqrestore(&conf->device_lock, flags); 3069 while (!list_empty(&tmp)) { 3070 r10_bio = list_first_entry(&tmp, struct r10bio, 3071 retry_list); 3072 list_del(&r10_bio->retry_list); 3073 if (mddev->degraded) 3074 set_bit(R10BIO_Degraded, &r10_bio->state); 3075 3076 if (test_bit(R10BIO_WriteError, 3077 &r10_bio->state)) 3078 close_write(r10_bio); 3079 raid_end_bio_io(r10_bio); 3080 } 3081 } 3082 3083 blk_start_plug(&plug); 3084 for (;;) { 3085 3086 flush_pending_writes(conf); 3087 3088 spin_lock_irqsave(&conf->device_lock, flags); 3089 if (list_empty(head)) { 3090 spin_unlock_irqrestore(&conf->device_lock, flags); 3091 break; 3092 } 3093 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 3094 list_del(head->prev); 3095 conf->nr_queued--; 3096 spin_unlock_irqrestore(&conf->device_lock, flags); 3097 3098 mddev = r10_bio->mddev; 3099 conf = mddev->private; 3100 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 3101 test_bit(R10BIO_WriteError, &r10_bio->state)) 3102 handle_write_completed(conf, r10_bio); 3103 else 
if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 3104 reshape_request_write(mddev, r10_bio); 3105 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 3106 sync_request_write(mddev, r10_bio); 3107 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 3108 recovery_request_write(mddev, r10_bio); 3109 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 3110 handle_read_error(mddev, r10_bio); 3111 else 3112 WARN_ON_ONCE(1); 3113 3114 cond_resched(); 3115 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 3116 md_check_recovery(mddev); 3117 } 3118 blk_finish_plug(&plug); 3119 } 3120 3121 static int init_resync(struct r10conf *conf) 3122 { 3123 int ret, buffs, i; 3124 3125 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 3126 BUG_ON(mempool_initialized(&conf->r10buf_pool)); 3127 conf->have_replacement = 0; 3128 for (i = 0; i < conf->geo.raid_disks; i++) 3129 if (conf->mirrors[i].replacement) 3130 conf->have_replacement = 1; 3131 ret = mempool_init(&conf->r10buf_pool, buffs, 3132 r10buf_pool_alloc, r10buf_pool_free, conf); 3133 if (ret) 3134 return ret; 3135 conf->next_resync = 0; 3136 return 0; 3137 } 3138 3139 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) 3140 { 3141 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); 3142 struct rsync_pages *rp; 3143 struct bio *bio; 3144 int nalloc; 3145 int i; 3146 3147 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 3148 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 3149 nalloc = conf->copies; /* resync */ 3150 else 3151 nalloc = 2; /* recovery */ 3152 3153 for (i = 0; i < nalloc; i++) { 3154 bio = r10bio->devs[i].bio; 3155 rp = bio->bi_private; 3156 bio_reset(bio, NULL, 0); 3157 bio->bi_private = rp; 3158 bio = r10bio->devs[i].repl_bio; 3159 if (bio) { 3160 rp = bio->bi_private; 3161 bio_reset(bio, NULL, 0); 3162 bio->bi_private = rp; 3163 } 3164 } 3165 return r10bio; 3166 } 3167 3168 /* 3169 * Set cluster_sync_high since we need other nodes to add the 3170 * range 
[cluster_sync_low, cluster_sync_high] to suspend list.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
	int stripe_chunks;
	sector_t window;

	/*
	 * A "stripe" here is one pass across all member devices, which is
	 * raid_disks / near_copies chunks.  Sizing the window this way keeps
	 * it from growing linearly with raid_disks when near_copies is close
	 * to raid_disks, which would suspend a needlessly large I/O range.
	 * When raid_disks is not a multiple of near_copies, one more chunk
	 * is added so the whole stripe is covered.
	 */
	stripe_chunks = conf->geo.raid_disks / conf->geo.near_copies;
	if (conf->geo.raid_disks % conf->geo.near_copies)
		stripe_chunks++;

	window = stripe_chunks * conf->mddev->chunk_sectors;

	/* Use at least a 32M window, in line with raid1's resync window. */
	if (window < CLUSTER_RESYNC_WINDOW_SECTORS)
		window = CLUSTER_RESYNC_WINDOW_SECTORS;

	conf->cluster_sync_high = conf->cluster_sync_low + window;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
3220 * For recovery, we iterate over physical addresses, read a good 3221 * value for each non-in_sync drive, and over-write. 3222 * 3223 * So, for recovery we may have several outstanding complex requests for a 3224 * given address, one for each out-of-sync device. We model this by allocating 3225 * a number of r10_bio structures, one for each out-of-sync device. 3226 * As we setup these structures, we collect all bio's together into a list 3227 * which we then process collectively to add pages, and then process again 3228 * to pass to submit_bio_noacct. 3229 * 3230 * The r10_bio structures are linked using a borrowed master_bio pointer. 3231 * This link is counted in ->remaining. When the r10_bio that points to NULL 3232 * has its remaining count decremented to 0, the whole complex operation 3233 * is complete. 3234 * 3235 */ 3236 3237 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, 3238 int *skipped) 3239 { 3240 struct r10conf *conf = mddev->private; 3241 struct r10bio *r10_bio; 3242 struct bio *biolist = NULL, *bio; 3243 sector_t max_sector, nr_sectors; 3244 int i; 3245 int max_sync; 3246 sector_t sync_blocks; 3247 sector_t sectors_skipped = 0; 3248 int chunks_skipped = 0; 3249 sector_t chunk_mask = conf->geo.chunk_mask; 3250 int page_idx = 0; 3251 3252 if (!mempool_initialized(&conf->r10buf_pool)) 3253 if (init_resync(conf)) 3254 return 0; 3255 3256 /* 3257 * Allow skipping a full rebuild for incremental assembly 3258 * of a clean array, like RAID1 does. 
3259 */ 3260 if (mddev->bitmap == NULL && 3261 mddev->recovery_cp == MaxSector && 3262 mddev->reshape_position == MaxSector && 3263 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 3264 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 3265 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 3266 conf->fullsync == 0) { 3267 *skipped = 1; 3268 return mddev->dev_sectors - sector_nr; 3269 } 3270 3271 skipped: 3272 max_sector = mddev->dev_sectors; 3273 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 3274 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3275 max_sector = mddev->resync_max_sectors; 3276 if (sector_nr >= max_sector) { 3277 conf->cluster_sync_low = 0; 3278 conf->cluster_sync_high = 0; 3279 3280 /* If we aborted, we need to abort the 3281 * sync on the 'current' bitmap chucks (there can 3282 * be several when recovering multiple devices). 3283 * as we may have started syncing it but not finished. 3284 * We can find the current address in 3285 * mddev->curr_resync, but for recovery, 3286 * we need to convert that to several 3287 * virtual addresses. 3288 */ 3289 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3290 end_reshape(conf); 3291 close_sync(conf); 3292 return 0; 3293 } 3294 3295 if (mddev->curr_resync < max_sector) { /* aborted */ 3296 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3297 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3298 &sync_blocks, 1); 3299 else for (i = 0; i < conf->geo.raid_disks; i++) { 3300 sector_t sect = 3301 raid10_find_virt(conf, mddev->curr_resync, i); 3302 md_bitmap_end_sync(mddev->bitmap, sect, 3303 &sync_blocks, 1); 3304 } 3305 } else { 3306 /* completed sync */ 3307 if ((!mddev->bitmap || conf->fullsync) 3308 && conf->have_replacement 3309 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3310 /* Completed a full sync so the replacements 3311 * are now fully recovered. 
3312 */ 3313 rcu_read_lock(); 3314 for (i = 0; i < conf->geo.raid_disks; i++) { 3315 struct md_rdev *rdev = 3316 rcu_dereference(conf->mirrors[i].replacement); 3317 if (rdev) 3318 rdev->recovery_offset = MaxSector; 3319 } 3320 rcu_read_unlock(); 3321 } 3322 conf->fullsync = 0; 3323 } 3324 md_bitmap_close_sync(mddev->bitmap); 3325 close_sync(conf); 3326 *skipped = 1; 3327 return sectors_skipped; 3328 } 3329 3330 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3331 return reshape_request(mddev, sector_nr, skipped); 3332 3333 if (chunks_skipped >= conf->geo.raid_disks) { 3334 /* if there has been nothing to do on any drive, 3335 * then there is nothing to do at all.. 3336 */ 3337 *skipped = 1; 3338 return (max_sector - sector_nr) + sectors_skipped; 3339 } 3340 3341 if (max_sector > mddev->resync_max) 3342 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 3343 3344 /* make sure whole request will fit in a chunk - if chunks 3345 * are meaningful 3346 */ 3347 if (conf->geo.near_copies < conf->geo.raid_disks && 3348 max_sector > (sector_nr | chunk_mask)) 3349 max_sector = (sector_nr | chunk_mask) + 1; 3350 3351 /* 3352 * If there is non-resync activity waiting for a turn, then let it 3353 * though before starting on this new sync request. 3354 */ 3355 if (conf->nr_waiting) 3356 schedule_timeout_uninterruptible(1); 3357 3358 /* Again, very different code for resync and recovery. 3359 * Both must result in an r10bio with a list of bios that 3360 * have bi_end_io, bi_sector, bi_bdev set, 3361 * and bi_private set to the r10bio. 3362 * For recovery, we may actually create several r10bios 3363 * with 2 bios in each, that correspond to the bios in the main one. 3364 * In this case, the subordinate r10bios link back through a 3365 * borrowed master_bio pointer, and the counter in the master 3366 * includes a ref from each subordinate. 
3367 */ 3368 /* First, we decide what to do and set ->bi_end_io 3369 * To end_sync_read if we want to read, and 3370 * end_sync_write if we will want to write. 3371 */ 3372 3373 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 3374 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3375 /* recovery... the complicated one */ 3376 int j; 3377 r10_bio = NULL; 3378 3379 for (i = 0 ; i < conf->geo.raid_disks; i++) { 3380 int still_degraded; 3381 struct r10bio *rb2; 3382 sector_t sect; 3383 int must_sync; 3384 int any_working; 3385 int need_recover = 0; 3386 int need_replace = 0; 3387 struct raid10_info *mirror = &conf->mirrors[i]; 3388 struct md_rdev *mrdev, *mreplace; 3389 3390 rcu_read_lock(); 3391 mrdev = rcu_dereference(mirror->rdev); 3392 mreplace = rcu_dereference(mirror->replacement); 3393 3394 if (mrdev != NULL && 3395 !test_bit(Faulty, &mrdev->flags) && 3396 !test_bit(In_sync, &mrdev->flags)) 3397 need_recover = 1; 3398 if (mreplace != NULL && 3399 !test_bit(Faulty, &mreplace->flags)) 3400 need_replace = 1; 3401 3402 if (!need_recover && !need_replace) { 3403 rcu_read_unlock(); 3404 continue; 3405 } 3406 3407 still_degraded = 0; 3408 /* want to reconstruct this device */ 3409 rb2 = r10_bio; 3410 sect = raid10_find_virt(conf, sector_nr, i); 3411 if (sect >= mddev->resync_max_sectors) { 3412 /* last stripe is not complete - don't 3413 * try to recover this sector. 
3414 */ 3415 rcu_read_unlock(); 3416 continue; 3417 } 3418 if (mreplace && test_bit(Faulty, &mreplace->flags)) 3419 mreplace = NULL; 3420 /* Unless we are doing a full sync, or a replacement 3421 * we only need to recover the block if it is set in 3422 * the bitmap 3423 */ 3424 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3425 &sync_blocks, 1); 3426 if (sync_blocks < max_sync) 3427 max_sync = sync_blocks; 3428 if (!must_sync && 3429 mreplace == NULL && 3430 !conf->fullsync) { 3431 /* yep, skip the sync_blocks here, but don't assume 3432 * that there will never be anything to do here 3433 */ 3434 chunks_skipped = -1; 3435 rcu_read_unlock(); 3436 continue; 3437 } 3438 atomic_inc(&mrdev->nr_pending); 3439 if (mreplace) 3440 atomic_inc(&mreplace->nr_pending); 3441 rcu_read_unlock(); 3442 3443 r10_bio = raid10_alloc_init_r10buf(conf); 3444 r10_bio->state = 0; 3445 raise_barrier(conf, rb2 != NULL); 3446 atomic_set(&r10_bio->remaining, 0); 3447 3448 r10_bio->master_bio = (struct bio*)rb2; 3449 if (rb2) 3450 atomic_inc(&rb2->remaining); 3451 r10_bio->mddev = mddev; 3452 set_bit(R10BIO_IsRecover, &r10_bio->state); 3453 r10_bio->sector = sect; 3454 3455 raid10_find_phys(conf, r10_bio); 3456 3457 /* Need to check if the array will still be 3458 * degraded 3459 */ 3460 rcu_read_lock(); 3461 for (j = 0; j < conf->geo.raid_disks; j++) { 3462 struct md_rdev *rdev = rcu_dereference( 3463 conf->mirrors[j].rdev); 3464 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3465 still_degraded = 1; 3466 break; 3467 } 3468 } 3469 3470 must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3471 &sync_blocks, still_degraded); 3472 3473 any_working = 0; 3474 for (j=0; j<conf->copies;j++) { 3475 int k; 3476 int d = r10_bio->devs[j].devnum; 3477 sector_t from_addr, to_addr; 3478 struct md_rdev *rdev = 3479 rcu_dereference(conf->mirrors[d].rdev); 3480 sector_t sector, first_bad; 3481 int bad_sectors; 3482 if (!rdev || 3483 !test_bit(In_sync, &rdev->flags)) 3484 continue; 3485 /* This 
is where we read from */ 3486 any_working = 1; 3487 sector = r10_bio->devs[j].addr; 3488 3489 if (is_badblock(rdev, sector, max_sync, 3490 &first_bad, &bad_sectors)) { 3491 if (first_bad > sector) 3492 max_sync = first_bad - sector; 3493 else { 3494 bad_sectors -= (sector 3495 - first_bad); 3496 if (max_sync > bad_sectors) 3497 max_sync = bad_sectors; 3498 continue; 3499 } 3500 } 3501 bio = r10_bio->devs[0].bio; 3502 bio->bi_next = biolist; 3503 biolist = bio; 3504 bio->bi_end_io = end_sync_read; 3505 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3506 if (test_bit(FailFast, &rdev->flags)) 3507 bio->bi_opf |= MD_FAILFAST; 3508 from_addr = r10_bio->devs[j].addr; 3509 bio->bi_iter.bi_sector = from_addr + 3510 rdev->data_offset; 3511 bio_set_dev(bio, rdev->bdev); 3512 atomic_inc(&rdev->nr_pending); 3513 /* and we write to 'i' (if not in_sync) */ 3514 3515 for (k=0; k<conf->copies; k++) 3516 if (r10_bio->devs[k].devnum == i) 3517 break; 3518 BUG_ON(k == conf->copies); 3519 to_addr = r10_bio->devs[k].addr; 3520 r10_bio->devs[0].devnum = d; 3521 r10_bio->devs[0].addr = from_addr; 3522 r10_bio->devs[1].devnum = i; 3523 r10_bio->devs[1].addr = to_addr; 3524 3525 if (need_recover) { 3526 bio = r10_bio->devs[1].bio; 3527 bio->bi_next = biolist; 3528 biolist = bio; 3529 bio->bi_end_io = end_sync_write; 3530 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3531 bio->bi_iter.bi_sector = to_addr 3532 + mrdev->data_offset; 3533 bio_set_dev(bio, mrdev->bdev); 3534 atomic_inc(&r10_bio->remaining); 3535 } else 3536 r10_bio->devs[1].bio->bi_end_io = NULL; 3537 3538 /* and maybe write to replacement */ 3539 bio = r10_bio->devs[1].repl_bio; 3540 if (bio) 3541 bio->bi_end_io = NULL; 3542 /* Note: if need_replace, then bio 3543 * cannot be NULL as r10buf_pool_alloc will 3544 * have allocated it. 
3545 */ 3546 if (!need_replace) 3547 break; 3548 bio->bi_next = biolist; 3549 biolist = bio; 3550 bio->bi_end_io = end_sync_write; 3551 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3552 bio->bi_iter.bi_sector = to_addr + 3553 mreplace->data_offset; 3554 bio_set_dev(bio, mreplace->bdev); 3555 atomic_inc(&r10_bio->remaining); 3556 break; 3557 } 3558 rcu_read_unlock(); 3559 if (j == conf->copies) { 3560 /* Cannot recover, so abort the recovery or 3561 * record a bad block */ 3562 if (any_working) { 3563 /* problem is that there are bad blocks 3564 * on other device(s) 3565 */ 3566 int k; 3567 for (k = 0; k < conf->copies; k++) 3568 if (r10_bio->devs[k].devnum == i) 3569 break; 3570 if (!test_bit(In_sync, 3571 &mrdev->flags) 3572 && !rdev_set_badblocks( 3573 mrdev, 3574 r10_bio->devs[k].addr, 3575 max_sync, 0)) 3576 any_working = 0; 3577 if (mreplace && 3578 !rdev_set_badblocks( 3579 mreplace, 3580 r10_bio->devs[k].addr, 3581 max_sync, 0)) 3582 any_working = 0; 3583 } 3584 if (!any_working) { 3585 if (!test_and_set_bit(MD_RECOVERY_INTR, 3586 &mddev->recovery)) 3587 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", 3588 mdname(mddev)); 3589 mirror->recovery_disabled 3590 = mddev->recovery_disabled; 3591 } 3592 put_buf(r10_bio); 3593 if (rb2) 3594 atomic_dec(&rb2->remaining); 3595 r10_bio = rb2; 3596 rdev_dec_pending(mrdev, mddev); 3597 if (mreplace) 3598 rdev_dec_pending(mreplace, mddev); 3599 break; 3600 } 3601 rdev_dec_pending(mrdev, mddev); 3602 if (mreplace) 3603 rdev_dec_pending(mreplace, mddev); 3604 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { 3605 /* Only want this if there is elsewhere to 3606 * read from. 'j' is currently the first 3607 * readable copy. 
3608 */ 3609 int targets = 1; 3610 for (; j < conf->copies; j++) { 3611 int d = r10_bio->devs[j].devnum; 3612 if (conf->mirrors[d].rdev && 3613 test_bit(In_sync, 3614 &conf->mirrors[d].rdev->flags)) 3615 targets++; 3616 } 3617 if (targets == 1) 3618 r10_bio->devs[0].bio->bi_opf 3619 &= ~MD_FAILFAST; 3620 } 3621 } 3622 if (biolist == NULL) { 3623 while (r10_bio) { 3624 struct r10bio *rb2 = r10_bio; 3625 r10_bio = (struct r10bio*) rb2->master_bio; 3626 rb2->master_bio = NULL; 3627 put_buf(rb2); 3628 } 3629 goto giveup; 3630 } 3631 } else { 3632 /* resync. Schedule a read for every block at this virt offset */ 3633 int count = 0; 3634 3635 /* 3636 * Since curr_resync_completed could probably not update in 3637 * time, and we will set cluster_sync_low based on it. 3638 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for 3639 * safety reason, which ensures curr_resync_completed is 3640 * updated in bitmap_cond_end_sync. 3641 */ 3642 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 3643 mddev_is_clustered(mddev) && 3644 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 3645 3646 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 3647 &sync_blocks, mddev->degraded) && 3648 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3649 &mddev->recovery)) { 3650 /* We can skip this block */ 3651 *skipped = 1; 3652 return sync_blocks + sectors_skipped; 3653 } 3654 if (sync_blocks < max_sync) 3655 max_sync = sync_blocks; 3656 r10_bio = raid10_alloc_init_r10buf(conf); 3657 r10_bio->state = 0; 3658 3659 r10_bio->mddev = mddev; 3660 atomic_set(&r10_bio->remaining, 0); 3661 raise_barrier(conf, 0); 3662 conf->next_resync = sector_nr; 3663 3664 r10_bio->master_bio = NULL; 3665 r10_bio->sector = sector_nr; 3666 set_bit(R10BIO_IsSync, &r10_bio->state); 3667 raid10_find_phys(conf, r10_bio); 3668 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3669 3670 for (i = 0; i < conf->copies; i++) { 3671 int d = r10_bio->devs[i].devnum; 3672 sector_t first_bad, sector; 
3673 int bad_sectors; 3674 struct md_rdev *rdev; 3675 3676 if (r10_bio->devs[i].repl_bio) 3677 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3678 3679 bio = r10_bio->devs[i].bio; 3680 bio->bi_status = BLK_STS_IOERR; 3681 rcu_read_lock(); 3682 rdev = rcu_dereference(conf->mirrors[d].rdev); 3683 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3684 rcu_read_unlock(); 3685 continue; 3686 } 3687 sector = r10_bio->devs[i].addr; 3688 if (is_badblock(rdev, sector, max_sync, 3689 &first_bad, &bad_sectors)) { 3690 if (first_bad > sector) 3691 max_sync = first_bad - sector; 3692 else { 3693 bad_sectors -= (sector - first_bad); 3694 if (max_sync > bad_sectors) 3695 max_sync = bad_sectors; 3696 rcu_read_unlock(); 3697 continue; 3698 } 3699 } 3700 atomic_inc(&rdev->nr_pending); 3701 atomic_inc(&r10_bio->remaining); 3702 bio->bi_next = biolist; 3703 biolist = bio; 3704 bio->bi_end_io = end_sync_read; 3705 bio_set_op_attrs(bio, REQ_OP_READ, 0); 3706 if (test_bit(FailFast, &rdev->flags)) 3707 bio->bi_opf |= MD_FAILFAST; 3708 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3709 bio_set_dev(bio, rdev->bdev); 3710 count++; 3711 3712 rdev = rcu_dereference(conf->mirrors[d].replacement); 3713 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3714 rcu_read_unlock(); 3715 continue; 3716 } 3717 atomic_inc(&rdev->nr_pending); 3718 3719 /* Need to set up for writing to the replacement */ 3720 bio = r10_bio->devs[i].repl_bio; 3721 bio->bi_status = BLK_STS_IOERR; 3722 3723 sector = r10_bio->devs[i].addr; 3724 bio->bi_next = biolist; 3725 biolist = bio; 3726 bio->bi_end_io = end_sync_write; 3727 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 3728 if (test_bit(FailFast, &rdev->flags)) 3729 bio->bi_opf |= MD_FAILFAST; 3730 bio->bi_iter.bi_sector = sector + rdev->data_offset; 3731 bio_set_dev(bio, rdev->bdev); 3732 count++; 3733 rcu_read_unlock(); 3734 } 3735 3736 if (count < 2) { 3737 for (i=0; i<conf->copies; i++) { 3738 int d = r10_bio->devs[i].devnum; 3739 if 
(r10_bio->devs[i].bio->bi_end_io) 3740 rdev_dec_pending(conf->mirrors[d].rdev, 3741 mddev); 3742 if (r10_bio->devs[i].repl_bio && 3743 r10_bio->devs[i].repl_bio->bi_end_io) 3744 rdev_dec_pending( 3745 conf->mirrors[d].replacement, 3746 mddev); 3747 } 3748 put_buf(r10_bio); 3749 biolist = NULL; 3750 goto giveup; 3751 } 3752 } 3753 3754 nr_sectors = 0; 3755 if (sector_nr + max_sync < max_sector) 3756 max_sector = sector_nr + max_sync; 3757 do { 3758 struct page *page; 3759 int len = PAGE_SIZE; 3760 if (sector_nr + (len>>9) > max_sector) 3761 len = (max_sector - sector_nr) << 9; 3762 if (len == 0) 3763 break; 3764 for (bio= biolist ; bio ; bio=bio->bi_next) { 3765 struct resync_pages *rp = get_resync_pages(bio); 3766 page = resync_fetch_page(rp, page_idx); 3767 /* 3768 * won't fail because the vec table is big enough 3769 * to hold all these pages 3770 */ 3771 bio_add_page(bio, page, len, 0); 3772 } 3773 nr_sectors += len>>9; 3774 sector_nr += len>>9; 3775 } while (++page_idx < RESYNC_PAGES); 3776 r10_bio->sectors = nr_sectors; 3777 3778 if (mddev_is_clustered(mddev) && 3779 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3780 /* It is resync not recovery */ 3781 if (conf->cluster_sync_high < sector_nr + nr_sectors) { 3782 conf->cluster_sync_low = mddev->curr_resync_completed; 3783 raid10_set_cluster_sync_high(conf); 3784 /* Send resync message */ 3785 md_cluster_ops->resync_info_update(mddev, 3786 conf->cluster_sync_low, 3787 conf->cluster_sync_high); 3788 } 3789 } else if (mddev_is_clustered(mddev)) { 3790 /* This is recovery not resync */ 3791 sector_t sect_va1, sect_va2; 3792 bool broadcast_msg = false; 3793 3794 for (i = 0; i < conf->geo.raid_disks; i++) { 3795 /* 3796 * sector_nr is a device address for recovery, so we 3797 * need translate it to array address before compare 3798 * with cluster_sync_high. 
3799 */ 3800 sect_va1 = raid10_find_virt(conf, sector_nr, i); 3801 3802 if (conf->cluster_sync_high < sect_va1 + nr_sectors) { 3803 broadcast_msg = true; 3804 /* 3805 * curr_resync_completed is similar as 3806 * sector_nr, so make the translation too. 3807 */ 3808 sect_va2 = raid10_find_virt(conf, 3809 mddev->curr_resync_completed, i); 3810 3811 if (conf->cluster_sync_low == 0 || 3812 conf->cluster_sync_low > sect_va2) 3813 conf->cluster_sync_low = sect_va2; 3814 } 3815 } 3816 if (broadcast_msg) { 3817 raid10_set_cluster_sync_high(conf); 3818 md_cluster_ops->resync_info_update(mddev, 3819 conf->cluster_sync_low, 3820 conf->cluster_sync_high); 3821 } 3822 } 3823 3824 while (biolist) { 3825 bio = biolist; 3826 biolist = biolist->bi_next; 3827 3828 bio->bi_next = NULL; 3829 r10_bio = get_resync_r10bio(bio); 3830 r10_bio->sectors = nr_sectors; 3831 3832 if (bio->bi_end_io == end_sync_read) { 3833 md_sync_acct_bio(bio, nr_sectors); 3834 bio->bi_status = 0; 3835 submit_bio_noacct(bio); 3836 } 3837 } 3838 3839 if (sectors_skipped) 3840 /* pretend they weren't skipped, it makes 3841 * no important difference in this case 3842 */ 3843 md_done_sync(mddev, sectors_skipped, 1); 3844 3845 return sectors_skipped + nr_sectors; 3846 giveup: 3847 /* There is nowhere to write, so all non-sync 3848 * drives must be failed or in resync, all drives 3849 * have a bad block, so try the next chunk... 
3850 */ 3851 if (sector_nr + max_sync < max_sector) 3852 max_sector = sector_nr + max_sync; 3853 3854 sectors_skipped += (max_sector - sector_nr); 3855 chunks_skipped ++; 3856 sector_nr = max_sector; 3857 goto skipped; 3858 } 3859 3860 static sector_t 3861 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 3862 { 3863 sector_t size; 3864 struct r10conf *conf = mddev->private; 3865 3866 if (!raid_disks) 3867 raid_disks = min(conf->geo.raid_disks, 3868 conf->prev.raid_disks); 3869 if (!sectors) 3870 sectors = conf->dev_sectors; 3871 3872 size = sectors >> conf->geo.chunk_shift; 3873 sector_div(size, conf->geo.far_copies); 3874 size = size * raid_disks; 3875 sector_div(size, conf->geo.near_copies); 3876 3877 return size << conf->geo.chunk_shift; 3878 } 3879 3880 static void calc_sectors(struct r10conf *conf, sector_t size) 3881 { 3882 /* Calculate the number of sectors-per-device that will 3883 * actually be used, and set conf->dev_sectors and 3884 * conf->stride 3885 */ 3886 3887 size = size >> conf->geo.chunk_shift; 3888 sector_div(size, conf->geo.far_copies); 3889 size = size * conf->geo.raid_disks; 3890 sector_div(size, conf->geo.near_copies); 3891 /* 'size' is now the number of chunks in the array */ 3892 /* calculate "used chunks per device" */ 3893 size = size * conf->copies; 3894 3895 /* We need to round up when dividing by raid_disks to 3896 * get the stride size. 
3897 */ 3898 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3899 3900 conf->dev_sectors = size << conf->geo.chunk_shift; 3901 3902 if (conf->geo.far_offset) 3903 conf->geo.stride = 1 << conf->geo.chunk_shift; 3904 else { 3905 sector_div(size, conf->geo.far_copies); 3906 conf->geo.stride = size << conf->geo.chunk_shift; 3907 } 3908 } 3909 3910 enum geo_type {geo_new, geo_old, geo_start}; 3911 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3912 { 3913 int nc, fc, fo; 3914 int layout, chunk, disks; 3915 switch (new) { 3916 case geo_old: 3917 layout = mddev->layout; 3918 chunk = mddev->chunk_sectors; 3919 disks = mddev->raid_disks - mddev->delta_disks; 3920 break; 3921 case geo_new: 3922 layout = mddev->new_layout; 3923 chunk = mddev->new_chunk_sectors; 3924 disks = mddev->raid_disks; 3925 break; 3926 default: /* avoid 'may be unused' warnings */ 3927 case geo_start: /* new when starting reshape - raid_disks not 3928 * updated yet. */ 3929 layout = mddev->new_layout; 3930 chunk = mddev->new_chunk_sectors; 3931 disks = mddev->raid_disks + mddev->delta_disks; 3932 break; 3933 } 3934 if (layout >> 19) 3935 return -1; 3936 if (chunk < (PAGE_SIZE >> 9) || 3937 !is_power_of_2(chunk)) 3938 return -2; 3939 nc = layout & 255; 3940 fc = (layout >> 8) & 255; 3941 fo = layout & (1<<16); 3942 geo->raid_disks = disks; 3943 geo->near_copies = nc; 3944 geo->far_copies = fc; 3945 geo->far_offset = fo; 3946 switch (layout >> 17) { 3947 case 0: /* original layout. simple but not always optimal */ 3948 geo->far_set_size = disks; 3949 break; 3950 case 1: /* "improved" layout which was buggy. 
Hopefully no-one is 3951 * actually using this, but leave code here just in case.*/ 3952 geo->far_set_size = disks/fc; 3953 WARN(geo->far_set_size < fc, 3954 "This RAID10 layout does not provide data safety - please backup and create new array\n"); 3955 break; 3956 case 2: /* "improved" layout fixed to match documentation */ 3957 geo->far_set_size = fc * nc; 3958 break; 3959 default: /* Not a valid layout */ 3960 return -1; 3961 } 3962 geo->chunk_mask = chunk - 1; 3963 geo->chunk_shift = ffz(~chunk); 3964 return nc*fc; 3965 } 3966 3967 static struct r10conf *setup_conf(struct mddev *mddev) 3968 { 3969 struct r10conf *conf = NULL; 3970 int err = -EINVAL; 3971 struct geom geo; 3972 int copies; 3973 3974 copies = setup_geo(&geo, mddev, geo_new); 3975 3976 if (copies == -2) { 3977 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", 3978 mdname(mddev), PAGE_SIZE); 3979 goto out; 3980 } 3981 3982 if (copies < 2 || copies > mddev->raid_disks) { 3983 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3984 mdname(mddev), mddev->new_layout); 3985 goto out; 3986 } 3987 3988 err = -ENOMEM; 3989 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 3990 if (!conf) 3991 goto out; 3992 3993 /* FIXME calc properly */ 3994 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), 3995 sizeof(struct raid10_info), 3996 GFP_KERNEL); 3997 if (!conf->mirrors) 3998 goto out; 3999 4000 conf->tmppage = alloc_page(GFP_KERNEL); 4001 if (!conf->tmppage) 4002 goto out; 4003 4004 conf->geo = geo; 4005 conf->copies = copies; 4006 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, 4007 rbio_pool_free, conf); 4008 if (err) 4009 goto out; 4010 4011 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 4012 if (err) 4013 goto out; 4014 4015 calc_sectors(conf, mddev->dev_sectors); 4016 if (mddev->reshape_position == MaxSector) { 4017 conf->prev = conf->geo; 4018 conf->reshape_progress = MaxSector; 4019 } else { 4020 
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 4021 err = -EINVAL; 4022 goto out; 4023 } 4024 conf->reshape_progress = mddev->reshape_position; 4025 if (conf->prev.far_offset) 4026 conf->prev.stride = 1 << conf->prev.chunk_shift; 4027 else 4028 /* far_copies must be 1 */ 4029 conf->prev.stride = conf->dev_sectors; 4030 } 4031 conf->reshape_safe = conf->reshape_progress; 4032 spin_lock_init(&conf->device_lock); 4033 INIT_LIST_HEAD(&conf->retry_list); 4034 INIT_LIST_HEAD(&conf->bio_end_io_list); 4035 4036 spin_lock_init(&conf->resync_lock); 4037 init_waitqueue_head(&conf->wait_barrier); 4038 atomic_set(&conf->nr_pending, 0); 4039 4040 err = -ENOMEM; 4041 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 4042 if (!conf->thread) 4043 goto out; 4044 4045 conf->mddev = mddev; 4046 return conf; 4047 4048 out: 4049 if (conf) { 4050 mempool_exit(&conf->r10bio_pool); 4051 kfree(conf->mirrors); 4052 safe_put_page(conf->tmppage); 4053 bioset_exit(&conf->bio_split); 4054 kfree(conf); 4055 } 4056 return ERR_PTR(err); 4057 } 4058 4059 static void raid10_set_io_opt(struct r10conf *conf) 4060 { 4061 int raid_disks = conf->geo.raid_disks; 4062 4063 if (!(conf->geo.raid_disks % conf->geo.near_copies)) 4064 raid_disks /= conf->geo.near_copies; 4065 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 4066 raid_disks); 4067 } 4068 4069 static int raid10_run(struct mddev *mddev) 4070 { 4071 struct r10conf *conf; 4072 int i, disk_idx; 4073 struct raid10_info *disk; 4074 struct md_rdev *rdev; 4075 sector_t size; 4076 sector_t min_offset_diff = 0; 4077 int first = 1; 4078 4079 if (mddev_init_writes_pending(mddev) < 0) 4080 return -ENOMEM; 4081 4082 if (mddev->private == NULL) { 4083 conf = setup_conf(mddev); 4084 if (IS_ERR(conf)) 4085 return PTR_ERR(conf); 4086 mddev->private = conf; 4087 } 4088 conf = mddev->private; 4089 if (!conf) 4090 goto out; 4091 4092 if (mddev_is_clustered(conf->mddev)) { 4093 int fc, fo; 4094 4095 fc = (mddev->layout 
>> 8) & 255; 4096 fo = mddev->layout & (1<<16); 4097 if (fc > 1 || fo > 0) { 4098 pr_err("only near layout is supported by clustered" 4099 " raid10\n"); 4100 goto out_free_conf; 4101 } 4102 } 4103 4104 mddev->thread = conf->thread; 4105 conf->thread = NULL; 4106 4107 if (mddev->queue) { 4108 blk_queue_max_discard_sectors(mddev->queue, 4109 UINT_MAX); 4110 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 4111 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 4112 raid10_set_io_opt(conf); 4113 } 4114 4115 rdev_for_each(rdev, mddev) { 4116 long long diff; 4117 4118 disk_idx = rdev->raid_disk; 4119 if (disk_idx < 0) 4120 continue; 4121 if (disk_idx >= conf->geo.raid_disks && 4122 disk_idx >= conf->prev.raid_disks) 4123 continue; 4124 disk = conf->mirrors + disk_idx; 4125 4126 if (test_bit(Replacement, &rdev->flags)) { 4127 if (disk->replacement) 4128 goto out_free_conf; 4129 disk->replacement = rdev; 4130 } else { 4131 if (disk->rdev) 4132 goto out_free_conf; 4133 disk->rdev = rdev; 4134 } 4135 diff = (rdev->new_data_offset - rdev->data_offset); 4136 if (!mddev->reshape_backwards) 4137 diff = -diff; 4138 if (diff < 0) 4139 diff = 0; 4140 if (first || diff < min_offset_diff) 4141 min_offset_diff = diff; 4142 4143 if (mddev->gendisk) 4144 disk_stack_limits(mddev->gendisk, rdev->bdev, 4145 rdev->data_offset << 9); 4146 4147 disk->head_position = 0; 4148 first = 0; 4149 } 4150 4151 /* need to check that every block has at least one working mirror */ 4152 if (!enough(conf, -1)) { 4153 pr_err("md/raid10:%s: not enough operational mirrors.\n", 4154 mdname(mddev)); 4155 goto out_free_conf; 4156 } 4157 4158 if (conf->reshape_progress != MaxSector) { 4159 /* must ensure that shape change is supported */ 4160 if (conf->geo.far_copies != 1 && 4161 conf->geo.far_offset == 0) 4162 goto out_free_conf; 4163 if (conf->prev.far_copies != 1 && 4164 conf->prev.far_offset == 0) 4165 goto out_free_conf; 4166 } 4167 4168 mddev->degraded = 0; 4169 for (i = 0; 4170 i < 
conf->geo.raid_disks 4171 || i < conf->prev.raid_disks; 4172 i++) { 4173 4174 disk = conf->mirrors + i; 4175 4176 if (!disk->rdev && disk->replacement) { 4177 /* The replacement is all we have - use it */ 4178 disk->rdev = disk->replacement; 4179 disk->replacement = NULL; 4180 clear_bit(Replacement, &disk->rdev->flags); 4181 } 4182 4183 if (!disk->rdev || 4184 !test_bit(In_sync, &disk->rdev->flags)) { 4185 disk->head_position = 0; 4186 mddev->degraded++; 4187 if (disk->rdev && 4188 disk->rdev->saved_raid_disk < 0) 4189 conf->fullsync = 1; 4190 } 4191 4192 if (disk->replacement && 4193 !test_bit(In_sync, &disk->replacement->flags) && 4194 disk->replacement->saved_raid_disk < 0) { 4195 conf->fullsync = 1; 4196 } 4197 4198 disk->recovery_disabled = mddev->recovery_disabled - 1; 4199 } 4200 4201 if (mddev->recovery_cp != MaxSector) 4202 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", 4203 mdname(mddev)); 4204 pr_info("md/raid10:%s: active with %d out of %d devices\n", 4205 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 4206 conf->geo.raid_disks); 4207 /* 4208 * Ok, everything is just fine now 4209 */ 4210 mddev->dev_sectors = conf->dev_sectors; 4211 size = raid10_size(mddev, 0, 0); 4212 md_set_array_sectors(mddev, size); 4213 mddev->resync_max_sectors = size; 4214 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 4215 4216 if (md_integrity_register(mddev)) 4217 goto out_free_conf; 4218 4219 if (conf->reshape_progress != MaxSector) { 4220 unsigned long before_length, after_length; 4221 4222 before_length = ((1 << conf->prev.chunk_shift) * 4223 conf->prev.far_copies); 4224 after_length = ((1 << conf->geo.chunk_shift) * 4225 conf->geo.far_copies); 4226 4227 if (max(before_length, after_length) > min_offset_diff) { 4228 /* This cannot work */ 4229 pr_warn("md/raid10: offset difference not enough to continue reshape\n"); 4230 goto out_free_conf; 4231 } 4232 conf->offset_diff = min_offset_diff; 4233 4234 clear_bit(MD_RECOVERY_SYNC, 
&mddev->recovery); 4235 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4236 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4237 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4238 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4239 "reshape"); 4240 if (!mddev->sync_thread) 4241 goto out_free_conf; 4242 } 4243 4244 return 0; 4245 4246 out_free_conf: 4247 md_unregister_thread(&mddev->thread); 4248 mempool_exit(&conf->r10bio_pool); 4249 safe_put_page(conf->tmppage); 4250 kfree(conf->mirrors); 4251 kfree(conf); 4252 mddev->private = NULL; 4253 out: 4254 return -EIO; 4255 } 4256 4257 static void raid10_free(struct mddev *mddev, void *priv) 4258 { 4259 struct r10conf *conf = priv; 4260 4261 mempool_exit(&conf->r10bio_pool); 4262 safe_put_page(conf->tmppage); 4263 kfree(conf->mirrors); 4264 kfree(conf->mirrors_old); 4265 kfree(conf->mirrors_new); 4266 bioset_exit(&conf->bio_split); 4267 kfree(conf); 4268 } 4269 4270 static void raid10_quiesce(struct mddev *mddev, int quiesce) 4271 { 4272 struct r10conf *conf = mddev->private; 4273 4274 if (quiesce) 4275 raise_barrier(conf, 0); 4276 else 4277 lower_barrier(conf); 4278 } 4279 4280 static int raid10_resize(struct mddev *mddev, sector_t sectors) 4281 { 4282 /* Resize of 'far' arrays is not supported. 4283 * For 'near' and 'offset' arrays we can set the 4284 * number of sectors used to be an appropriate multiple 4285 * of the chunk size. 4286 * For 'offset', this is far_copies*chunksize. 4287 * For 'near' the multiplier is the LCM of 4288 * near_copies and raid_disks. 4289 * So if far_copies > 1 && !far_offset, fail. 4290 * Else find LCM(raid_disks, near_copy)*far_copies and 4291 * multiply by chunk_size. Then round to this number. 
4292 * This is mostly done by raid10_size() 4293 */ 4294 struct r10conf *conf = mddev->private; 4295 sector_t oldsize, size; 4296 4297 if (mddev->reshape_position != MaxSector) 4298 return -EBUSY; 4299 4300 if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 4301 return -EINVAL; 4302 4303 oldsize = raid10_size(mddev, 0, 0); 4304 size = raid10_size(mddev, sectors, 0); 4305 if (mddev->external_size && 4306 mddev->array_sectors > size) 4307 return -EINVAL; 4308 if (mddev->bitmap) { 4309 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); 4310 if (ret) 4311 return ret; 4312 } 4313 md_set_array_sectors(mddev, size); 4314 if (sectors > mddev->dev_sectors && 4315 mddev->recovery_cp > oldsize) { 4316 mddev->recovery_cp = oldsize; 4317 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4318 } 4319 calc_sectors(conf, sectors); 4320 mddev->dev_sectors = conf->dev_sectors; 4321 mddev->resync_max_sectors = size; 4322 return 0; 4323 } 4324 4325 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) 4326 { 4327 struct md_rdev *rdev; 4328 struct r10conf *conf; 4329 4330 if (mddev->degraded > 0) { 4331 pr_warn("md/raid10:%s: Error: degraded raid0!\n", 4332 mdname(mddev)); 4333 return ERR_PTR(-EINVAL); 4334 } 4335 sector_div(size, devs); 4336 4337 /* Set new parameters */ 4338 mddev->new_level = 10; 4339 /* new layout: far_copies = 1, near_copies = 2 */ 4340 mddev->new_layout = (1<<8) + 2; 4341 mddev->new_chunk_sectors = mddev->chunk_sectors; 4342 mddev->delta_disks = mddev->raid_disks; 4343 mddev->raid_disks *= 2; 4344 /* make sure it will be not marked as dirty */ 4345 mddev->recovery_cp = MaxSector; 4346 mddev->dev_sectors = size; 4347 4348 conf = setup_conf(mddev); 4349 if (!IS_ERR(conf)) { 4350 rdev_for_each(rdev, mddev) 4351 if (rdev->raid_disk >= 0) { 4352 rdev->new_raid_disk = rdev->raid_disk * 2; 4353 rdev->sectors = size; 4354 } 4355 conf->barrier = 1; 4356 } 4357 4358 return conf; 4359 } 4360 4361 static void *raid10_takeover(struct mddev 
*mddev) 4362 { 4363 struct r0conf *raid0_conf; 4364 4365 /* raid10 can take over: 4366 * raid0 - providing it has only two drives 4367 */ 4368 if (mddev->level == 0) { 4369 /* for raid0 takeover only one zone is supported */ 4370 raid0_conf = mddev->private; 4371 if (raid0_conf->nr_strip_zones > 1) { 4372 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", 4373 mdname(mddev)); 4374 return ERR_PTR(-EINVAL); 4375 } 4376 return raid10_takeover_raid0(mddev, 4377 raid0_conf->strip_zone->zone_end, 4378 raid0_conf->strip_zone->nb_dev); 4379 } 4380 return ERR_PTR(-EINVAL); 4381 } 4382 4383 static int raid10_check_reshape(struct mddev *mddev) 4384 { 4385 /* Called when there is a request to change 4386 * - layout (to ->new_layout) 4387 * - chunk size (to ->new_chunk_sectors) 4388 * - raid_disks (by delta_disks) 4389 * or when trying to restart a reshape that was ongoing. 4390 * 4391 * We need to validate the request and possibly allocate 4392 * space if that might be an issue later. 4393 * 4394 * Currently we reject any reshape of a 'far' mode array, 4395 * allow chunk size to change if new is generally acceptable, 4396 * allow raid_disks to increase, and allow 4397 * a switch between 'near' mode and 'offset' mode. 
4398 */ 4399 struct r10conf *conf = mddev->private; 4400 struct geom geo; 4401 4402 if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 4403 return -EINVAL; 4404 4405 if (setup_geo(&geo, mddev, geo_start) != conf->copies) 4406 /* mustn't change number of copies */ 4407 return -EINVAL; 4408 if (geo.far_copies > 1 && !geo.far_offset) 4409 /* Cannot switch to 'far' mode */ 4410 return -EINVAL; 4411 4412 if (mddev->array_sectors & geo.chunk_mask) 4413 /* not factor of array size */ 4414 return -EINVAL; 4415 4416 if (!enough(conf, -1)) 4417 return -EINVAL; 4418 4419 kfree(conf->mirrors_new); 4420 conf->mirrors_new = NULL; 4421 if (mddev->delta_disks > 0) { 4422 /* allocate new 'mirrors' list */ 4423 conf->mirrors_new = 4424 kcalloc(mddev->raid_disks + mddev->delta_disks, 4425 sizeof(struct raid10_info), 4426 GFP_KERNEL); 4427 if (!conf->mirrors_new) 4428 return -ENOMEM; 4429 } 4430 return 0; 4431 } 4432 4433 /* 4434 * Need to check if array has failed when deciding whether to: 4435 * - start an array 4436 * - remove non-faulty devices 4437 * - add a spare 4438 * - allow a reshape 4439 * This determination is simple when no reshape is happening. 4440 * However if there is a reshape, we need to carefully check 4441 * both the before and after sections. 4442 * This is because some failed devices may only affect one 4443 * of the two sections, and some non-in_sync devices may 4444 * be insync in the section most affected by failed devices. 4445 */ 4446 static int calc_degraded(struct r10conf *conf) 4447 { 4448 int degraded, degraded2; 4449 int i; 4450 4451 rcu_read_lock(); 4452 degraded = 0; 4453 /* 'prev' section first */ 4454 for (i = 0; i < conf->prev.raid_disks; i++) { 4455 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4456 if (!rdev || test_bit(Faulty, &rdev->flags)) 4457 degraded++; 4458 else if (!test_bit(In_sync, &rdev->flags)) 4459 /* When we can reduce the number of devices in 4460 * an array, this might not contribute to 4461 * 'degraded'. 
It does now. 4462 */ 4463 degraded++; 4464 } 4465 rcu_read_unlock(); 4466 if (conf->geo.raid_disks == conf->prev.raid_disks) 4467 return degraded; 4468 rcu_read_lock(); 4469 degraded2 = 0; 4470 for (i = 0; i < conf->geo.raid_disks; i++) { 4471 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 4472 if (!rdev || test_bit(Faulty, &rdev->flags)) 4473 degraded2++; 4474 else if (!test_bit(In_sync, &rdev->flags)) { 4475 /* If reshape is increasing the number of devices, 4476 * this section has already been recovered, so 4477 * it doesn't contribute to degraded. 4478 * else it does. 4479 */ 4480 if (conf->geo.raid_disks <= conf->prev.raid_disks) 4481 degraded2++; 4482 } 4483 } 4484 rcu_read_unlock(); 4485 if (degraded2 > degraded) 4486 return degraded2; 4487 return degraded; 4488 } 4489 4490 static int raid10_start_reshape(struct mddev *mddev) 4491 { 4492 /* A 'reshape' has been requested. This commits 4493 * the various 'new' fields and sets MD_RECOVER_RESHAPE 4494 * This also checks if there are enough spares and adds them 4495 * to the array. 4496 * We currently require enough spares to make the final 4497 * array non-degraded. We also require that the difference 4498 * between old and new data_offset - on each device - is 4499 * enough that we never risk over-writing. 
4500 */ 4501 4502 unsigned long before_length, after_length; 4503 sector_t min_offset_diff = 0; 4504 int first = 1; 4505 struct geom new; 4506 struct r10conf *conf = mddev->private; 4507 struct md_rdev *rdev; 4508 int spares = 0; 4509 int ret; 4510 4511 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4512 return -EBUSY; 4513 4514 if (setup_geo(&new, mddev, geo_start) != conf->copies) 4515 return -EINVAL; 4516 4517 before_length = ((1 << conf->prev.chunk_shift) * 4518 conf->prev.far_copies); 4519 after_length = ((1 << conf->geo.chunk_shift) * 4520 conf->geo.far_copies); 4521 4522 rdev_for_each(rdev, mddev) { 4523 if (!test_bit(In_sync, &rdev->flags) 4524 && !test_bit(Faulty, &rdev->flags)) 4525 spares++; 4526 if (rdev->raid_disk >= 0) { 4527 long long diff = (rdev->new_data_offset 4528 - rdev->data_offset); 4529 if (!mddev->reshape_backwards) 4530 diff = -diff; 4531 if (diff < 0) 4532 diff = 0; 4533 if (first || diff < min_offset_diff) 4534 min_offset_diff = diff; 4535 first = 0; 4536 } 4537 } 4538 4539 if (max(before_length, after_length) > min_offset_diff) 4540 return -EINVAL; 4541 4542 if (spares < mddev->delta_disks) 4543 return -EINVAL; 4544 4545 conf->offset_diff = min_offset_diff; 4546 spin_lock_irq(&conf->device_lock); 4547 if (conf->mirrors_new) { 4548 memcpy(conf->mirrors_new, conf->mirrors, 4549 sizeof(struct raid10_info)*conf->prev.raid_disks); 4550 smp_mb(); 4551 kfree(conf->mirrors_old); 4552 conf->mirrors_old = conf->mirrors; 4553 conf->mirrors = conf->mirrors_new; 4554 conf->mirrors_new = NULL; 4555 } 4556 setup_geo(&conf->geo, mddev, geo_start); 4557 smp_mb(); 4558 if (mddev->reshape_backwards) { 4559 sector_t size = raid10_size(mddev, 0, 0); 4560 if (size < mddev->array_sectors) { 4561 spin_unlock_irq(&conf->device_lock); 4562 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", 4563 mdname(mddev)); 4564 return -EINVAL; 4565 } 4566 mddev->resync_max_sectors = size; 4567 conf->reshape_progress = size; 4568 } else 4569 
conf->reshape_progress = 0; 4570 conf->reshape_safe = conf->reshape_progress; 4571 spin_unlock_irq(&conf->device_lock); 4572 4573 if (mddev->delta_disks && mddev->bitmap) { 4574 struct mdp_superblock_1 *sb = NULL; 4575 sector_t oldsize, newsize; 4576 4577 oldsize = raid10_size(mddev, 0, 0); 4578 newsize = raid10_size(mddev, 0, conf->geo.raid_disks); 4579 4580 if (!mddev_is_clustered(mddev)) { 4581 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4582 if (ret) 4583 goto abort; 4584 else 4585 goto out; 4586 } 4587 4588 rdev_for_each(rdev, mddev) { 4589 if (rdev->raid_disk > -1 && 4590 !test_bit(Faulty, &rdev->flags)) 4591 sb = page_address(rdev->sb_page); 4592 } 4593 4594 /* 4595 * some node is already performing reshape, and no need to 4596 * call md_bitmap_resize again since it should be called when 4597 * receiving BITMAP_RESIZE msg 4598 */ 4599 if ((sb && (le32_to_cpu(sb->feature_map) & 4600 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) 4601 goto out; 4602 4603 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4604 if (ret) 4605 goto abort; 4606 4607 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); 4608 if (ret) { 4609 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); 4610 goto abort; 4611 } 4612 } 4613 out: 4614 if (mddev->delta_disks > 0) { 4615 rdev_for_each(rdev, mddev) 4616 if (rdev->raid_disk < 0 && 4617 !test_bit(Faulty, &rdev->flags)) { 4618 if (raid10_add_disk(mddev, rdev) == 0) { 4619 if (rdev->raid_disk >= 4620 conf->prev.raid_disks) 4621 set_bit(In_sync, &rdev->flags); 4622 else 4623 rdev->recovery_offset = 0; 4624 4625 /* Failure here is OK */ 4626 sysfs_link_rdev(mddev, rdev); 4627 } 4628 } else if (rdev->raid_disk >= conf->prev.raid_disks 4629 && !test_bit(Faulty, &rdev->flags)) { 4630 /* This is a spare that was manually added */ 4631 set_bit(In_sync, &rdev->flags); 4632 } 4633 } 4634 /* When a reshape changes the number of devices, 4635 * ->degraded is measured against the larger of the 4636 * pre and post numbers. 
4637 */ 4638 spin_lock_irq(&conf->device_lock); 4639 mddev->degraded = calc_degraded(conf); 4640 spin_unlock_irq(&conf->device_lock); 4641 mddev->raid_disks = conf->geo.raid_disks; 4642 mddev->reshape_position = conf->reshape_progress; 4643 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4644 4645 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4646 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4647 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 4648 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4649 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4650 4651 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4652 "reshape"); 4653 if (!mddev->sync_thread) { 4654 ret = -EAGAIN; 4655 goto abort; 4656 } 4657 conf->reshape_checkpoint = jiffies; 4658 md_wakeup_thread(mddev->sync_thread); 4659 md_new_event(); 4660 return 0; 4661 4662 abort: 4663 mddev->recovery = 0; 4664 spin_lock_irq(&conf->device_lock); 4665 conf->geo = conf->prev; 4666 mddev->raid_disks = conf->geo.raid_disks; 4667 rdev_for_each(rdev, mddev) 4668 rdev->new_data_offset = rdev->data_offset; 4669 smp_wmb(); 4670 conf->reshape_progress = MaxSector; 4671 conf->reshape_safe = MaxSector; 4672 mddev->reshape_position = MaxSector; 4673 spin_unlock_irq(&conf->device_lock); 4674 return ret; 4675 } 4676 4677 /* Calculate the last device-address that could contain 4678 * any block from the chunk that includes the array-address 's' 4679 * and report the next address. 4680 * i.e. the address returned will be chunk-aligned and after 4681 * any data that is in the chunk containing 's'. 
4682 */ 4683 static sector_t last_dev_address(sector_t s, struct geom *geo) 4684 { 4685 s = (s | geo->chunk_mask) + 1; 4686 s >>= geo->chunk_shift; 4687 s *= geo->near_copies; 4688 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4689 s *= geo->far_copies; 4690 s <<= geo->chunk_shift; 4691 return s; 4692 } 4693 4694 /* Calculate the first device-address that could contain 4695 * any block from the chunk that includes the array-address 's'. 4696 * This too will be the start of a chunk 4697 */ 4698 static sector_t first_dev_address(sector_t s, struct geom *geo) 4699 { 4700 s >>= geo->chunk_shift; 4701 s *= geo->near_copies; 4702 sector_div(s, geo->raid_disks); 4703 s *= geo->far_copies; 4704 s <<= geo->chunk_shift; 4705 return s; 4706 } 4707 4708 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4709 int *skipped) 4710 { 4711 /* We simply copy at most one chunk (smallest of old and new) 4712 * at a time, possibly less if that exceeds RESYNC_PAGES, 4713 * or we hit a bad block or something. 4714 * This might mean we pause for normal IO in the middle of 4715 * a chunk, but that is not a problem as mddev->reshape_position 4716 * can record any location. 4717 * 4718 * If we will want to write to a location that isn't 4719 * yet recorded as 'safe' (i.e. in metadata on disk) then 4720 * we need to flush all reshape requests and update the metadata. 4721 * 4722 * When reshaping forwards (e.g. to more devices), we interpret 4723 * 'safe' as the earliest block which might not have been copied 4724 * down yet. We divide this by previous stripe size and multiply 4725 * by previous stripe length to get lowest device offset that we 4726 * cannot write to yet. 4727 * We interpret 'sector_nr' as an address that we want to write to. 4728 * From this we use last_device_address() to find where we might 4729 * write to, and first_device_address on the 'safe' position. 
4730 * If this 'next' write position is after the 'safe' position, 4731 * we must update the metadata to increase the 'safe' position. 4732 * 4733 * When reshaping backwards, we round in the opposite direction 4734 * and perform the reverse test: next write position must not be 4735 * less than current safe position. 4736 * 4737 * In all this the minimum difference in data offsets 4738 * (conf->offset_diff - always positive) allows a bit of slack, 4739 * so next can be after 'safe', but not by more than offset_diff 4740 * 4741 * We need to prepare all the bios here before we start any IO 4742 * to ensure the size we choose is acceptable to all devices. 4743 * The means one for each copy for write-out and an extra one for 4744 * read-in. 4745 * We store the read-in bio in ->master_bio and the others in 4746 * ->devs[x].bio and ->devs[x].repl_bio. 4747 */ 4748 struct r10conf *conf = mddev->private; 4749 struct r10bio *r10_bio; 4750 sector_t next, safe, last; 4751 int max_sectors; 4752 int nr_sectors; 4753 int s; 4754 struct md_rdev *rdev; 4755 int need_flush = 0; 4756 struct bio *blist; 4757 struct bio *bio, *read_bio; 4758 int sectors_done = 0; 4759 struct page **pages; 4760 4761 if (sector_nr == 0) { 4762 /* If restarting in the middle, skip the initial sectors */ 4763 if (mddev->reshape_backwards && 4764 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4765 sector_nr = (raid10_size(mddev, 0, 0) 4766 - conf->reshape_progress); 4767 } else if (!mddev->reshape_backwards && 4768 conf->reshape_progress > 0) 4769 sector_nr = conf->reshape_progress; 4770 if (sector_nr) { 4771 mddev->curr_resync_completed = sector_nr; 4772 sysfs_notify_dirent_safe(mddev->sysfs_completed); 4773 *skipped = 1; 4774 return sector_nr; 4775 } 4776 } 4777 4778 /* We don't use sector_nr to track where we are up to 4779 * as that doesn't work well for ->reshape_backwards. 4780 * So just use ->reshape_progress. 
4781 */ 4782 if (mddev->reshape_backwards) { 4783 /* 'next' is the earliest device address that we might 4784 * write to for this chunk in the new layout 4785 */ 4786 next = first_dev_address(conf->reshape_progress - 1, 4787 &conf->geo); 4788 4789 /* 'safe' is the last device address that we might read from 4790 * in the old layout after a restart 4791 */ 4792 safe = last_dev_address(conf->reshape_safe - 1, 4793 &conf->prev); 4794 4795 if (next + conf->offset_diff < safe) 4796 need_flush = 1; 4797 4798 last = conf->reshape_progress - 1; 4799 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4800 & conf->prev.chunk_mask); 4801 if (sector_nr + RESYNC_SECTORS < last) 4802 sector_nr = last + 1 - RESYNC_SECTORS; 4803 } else { 4804 /* 'next' is after the last device address that we 4805 * might write to for this chunk in the new layout 4806 */ 4807 next = last_dev_address(conf->reshape_progress, &conf->geo); 4808 4809 /* 'safe' is the earliest device address that we might 4810 * read from in the old layout after a restart 4811 */ 4812 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4813 4814 /* Need to update metadata if 'next' might be beyond 'safe' 4815 * as that would possibly corrupt data 4816 */ 4817 if (next > safe + conf->offset_diff) 4818 need_flush = 1; 4819 4820 sector_nr = conf->reshape_progress; 4821 last = sector_nr | (conf->geo.chunk_mask 4822 & conf->prev.chunk_mask); 4823 4824 if (sector_nr + RESYNC_SECTORS <= last) 4825 last = sector_nr + RESYNC_SECTORS - 1; 4826 } 4827 4828 if (need_flush || 4829 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4830 /* Need to update reshape_position in metadata */ 4831 wait_barrier(conf, false); 4832 mddev->reshape_position = conf->reshape_progress; 4833 if (mddev->reshape_backwards) 4834 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4835 - conf->reshape_progress; 4836 else 4837 mddev->curr_resync_completed = conf->reshape_progress; 4838 conf->reshape_checkpoint = jiffies; 4839 
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4840 md_wakeup_thread(mddev->thread); 4841 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 4842 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4843 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4844 allow_barrier(conf); 4845 return sectors_done; 4846 } 4847 conf->reshape_safe = mddev->reshape_position; 4848 allow_barrier(conf); 4849 } 4850 4851 raise_barrier(conf, 0); 4852 read_more: 4853 /* Now schedule reads for blocks from sector_nr to last */ 4854 r10_bio = raid10_alloc_init_r10buf(conf); 4855 r10_bio->state = 0; 4856 raise_barrier(conf, 1); 4857 atomic_set(&r10_bio->remaining, 0); 4858 r10_bio->mddev = mddev; 4859 r10_bio->sector = sector_nr; 4860 set_bit(R10BIO_IsReshape, &r10_bio->state); 4861 r10_bio->sectors = last - sector_nr + 1; 4862 rdev = read_balance(conf, r10_bio, &max_sectors); 4863 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4864 4865 if (!rdev) { 4866 /* Cannot read from here, so need to record bad blocks 4867 * on all the target devices. 4868 */ 4869 // FIXME 4870 mempool_free(r10_bio, &conf->r10buf_pool); 4871 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4872 return sectors_done; 4873 } 4874 4875 read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ, 4876 GFP_KERNEL, &mddev->bio_set); 4877 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4878 + rdev->data_offset); 4879 read_bio->bi_private = r10_bio; 4880 read_bio->bi_end_io = end_reshape_read; 4881 r10_bio->master_bio = read_bio; 4882 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4883 4884 /* 4885 * Broadcast RESYNC message to other nodes, so all nodes would not 4886 * write to the region to avoid conflict. 
4887 */ 4888 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { 4889 struct mdp_superblock_1 *sb = NULL; 4890 int sb_reshape_pos = 0; 4891 4892 conf->cluster_sync_low = sector_nr; 4893 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; 4894 sb = page_address(rdev->sb_page); 4895 if (sb) { 4896 sb_reshape_pos = le64_to_cpu(sb->reshape_position); 4897 /* 4898 * Set cluster_sync_low again if next address for array 4899 * reshape is less than cluster_sync_low. Since we can't 4900 * update cluster_sync_low until it has finished reshape. 4901 */ 4902 if (sb_reshape_pos < conf->cluster_sync_low) 4903 conf->cluster_sync_low = sb_reshape_pos; 4904 } 4905 4906 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, 4907 conf->cluster_sync_high); 4908 } 4909 4910 /* Now find the locations in the new layout */ 4911 __raid10_find_phys(&conf->geo, r10_bio); 4912 4913 blist = read_bio; 4914 read_bio->bi_next = NULL; 4915 4916 rcu_read_lock(); 4917 for (s = 0; s < conf->copies*2; s++) { 4918 struct bio *b; 4919 int d = r10_bio->devs[s/2].devnum; 4920 struct md_rdev *rdev2; 4921 if (s&1) { 4922 rdev2 = rcu_dereference(conf->mirrors[d].replacement); 4923 b = r10_bio->devs[s/2].repl_bio; 4924 } else { 4925 rdev2 = rcu_dereference(conf->mirrors[d].rdev); 4926 b = r10_bio->devs[s/2].bio; 4927 } 4928 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4929 continue; 4930 4931 bio_set_dev(b, rdev2->bdev); 4932 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + 4933 rdev2->new_data_offset; 4934 b->bi_end_io = end_reshape_write; 4935 bio_set_op_attrs(b, REQ_OP_WRITE, 0); 4936 b->bi_next = blist; 4937 blist = b; 4938 } 4939 4940 /* Now add as many pages as possible to all of these bios. 
*/ 4941 4942 nr_sectors = 0; 4943 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 4944 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4945 struct page *page = pages[s / (PAGE_SIZE >> 9)]; 4946 int len = (max_sectors - s) << 9; 4947 if (len > PAGE_SIZE) 4948 len = PAGE_SIZE; 4949 for (bio = blist; bio ; bio = bio->bi_next) { 4950 /* 4951 * won't fail because the vec table is big enough 4952 * to hold all these pages 4953 */ 4954 bio_add_page(bio, page, len, 0); 4955 } 4956 sector_nr += len >> 9; 4957 nr_sectors += len >> 9; 4958 } 4959 rcu_read_unlock(); 4960 r10_bio->sectors = nr_sectors; 4961 4962 /* Now submit the read */ 4963 md_sync_acct_bio(read_bio, r10_bio->sectors); 4964 atomic_inc(&r10_bio->remaining); 4965 read_bio->bi_next = NULL; 4966 submit_bio_noacct(read_bio); 4967 sectors_done += nr_sectors; 4968 if (sector_nr <= last) 4969 goto read_more; 4970 4971 lower_barrier(conf); 4972 4973 /* Now that we have done the whole section we can 4974 * update reshape_progress 4975 */ 4976 if (mddev->reshape_backwards) 4977 conf->reshape_progress -= sectors_done; 4978 else 4979 conf->reshape_progress += sectors_done; 4980 4981 return sectors_done; 4982 } 4983 4984 static void end_reshape_request(struct r10bio *r10_bio); 4985 static int handle_reshape_read_error(struct mddev *mddev, 4986 struct r10bio *r10_bio); 4987 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4988 { 4989 /* Reshape read completed. Hopefully we have a block 4990 * to write out. 4991 * If we got a read error then we do sync 1-page reads from 4992 * elsewhere until we find the data - or give up. 4993 */ 4994 struct r10conf *conf = mddev->private; 4995 int s; 4996 4997 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4998 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4999 /* Reshape has been aborted */ 5000 md_done_sync(mddev, r10_bio->sectors, 0); 5001 return; 5002 } 5003 5004 /* We definitely have the data in the pages, schedule the 5005 * writes. 
5006 */ 5007 atomic_set(&r10_bio->remaining, 1); 5008 for (s = 0; s < conf->copies*2; s++) { 5009 struct bio *b; 5010 int d = r10_bio->devs[s/2].devnum; 5011 struct md_rdev *rdev; 5012 rcu_read_lock(); 5013 if (s&1) { 5014 rdev = rcu_dereference(conf->mirrors[d].replacement); 5015 b = r10_bio->devs[s/2].repl_bio; 5016 } else { 5017 rdev = rcu_dereference(conf->mirrors[d].rdev); 5018 b = r10_bio->devs[s/2].bio; 5019 } 5020 if (!rdev || test_bit(Faulty, &rdev->flags)) { 5021 rcu_read_unlock(); 5022 continue; 5023 } 5024 atomic_inc(&rdev->nr_pending); 5025 rcu_read_unlock(); 5026 md_sync_acct_bio(b, r10_bio->sectors); 5027 atomic_inc(&r10_bio->remaining); 5028 b->bi_next = NULL; 5029 submit_bio_noacct(b); 5030 } 5031 end_reshape_request(r10_bio); 5032 } 5033 5034 static void end_reshape(struct r10conf *conf) 5035 { 5036 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 5037 return; 5038 5039 spin_lock_irq(&conf->device_lock); 5040 conf->prev = conf->geo; 5041 md_finish_reshape(conf->mddev); 5042 smp_wmb(); 5043 conf->reshape_progress = MaxSector; 5044 conf->reshape_safe = MaxSector; 5045 spin_unlock_irq(&conf->device_lock); 5046 5047 if (conf->mddev->queue) 5048 raid10_set_io_opt(conf); 5049 conf->fullsync = 0; 5050 } 5051 5052 static void raid10_update_reshape_pos(struct mddev *mddev) 5053 { 5054 struct r10conf *conf = mddev->private; 5055 sector_t lo, hi; 5056 5057 md_cluster_ops->resync_info_get(mddev, &lo, &hi); 5058 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) 5059 || mddev->reshape_position == MaxSector) 5060 conf->reshape_progress = mddev->reshape_position; 5061 else 5062 WARN_ON_ONCE(1); 5063 } 5064 5065 static int handle_reshape_read_error(struct mddev *mddev, 5066 struct r10bio *r10_bio) 5067 { 5068 /* Use sync reads to get the blocks from somewhere else */ 5069 int sectors = r10_bio->sectors; 5070 struct r10conf *conf = mddev->private; 5071 struct r10bio *r10b; 5072 int slot = 0; 5073 int idx = 0; 5074 struct page **pages; 
5075 5076 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); 5077 if (!r10b) { 5078 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5079 return -ENOMEM; 5080 } 5081 5082 /* reshape IOs share pages from .devs[0].bio */ 5083 pages = get_resync_pages(r10_bio->devs[0].bio)->pages; 5084 5085 r10b->sector = r10_bio->sector; 5086 __raid10_find_phys(&conf->prev, r10b); 5087 5088 while (sectors) { 5089 int s = sectors; 5090 int success = 0; 5091 int first_slot = slot; 5092 5093 if (s > (PAGE_SIZE >> 9)) 5094 s = PAGE_SIZE >> 9; 5095 5096 rcu_read_lock(); 5097 while (!success) { 5098 int d = r10b->devs[slot].devnum; 5099 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5100 sector_t addr; 5101 if (rdev == NULL || 5102 test_bit(Faulty, &rdev->flags) || 5103 !test_bit(In_sync, &rdev->flags)) 5104 goto failed; 5105 5106 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 5107 atomic_inc(&rdev->nr_pending); 5108 rcu_read_unlock(); 5109 success = sync_page_io(rdev, 5110 addr, 5111 s << 9, 5112 pages[idx], 5113 REQ_OP_READ, false); 5114 rdev_dec_pending(rdev, mddev); 5115 rcu_read_lock(); 5116 if (success) 5117 break; 5118 failed: 5119 slot++; 5120 if (slot >= conf->copies) 5121 slot = 0; 5122 if (slot == first_slot) 5123 break; 5124 } 5125 rcu_read_unlock(); 5126 if (!success) { 5127 /* couldn't read this block, must give up */ 5128 set_bit(MD_RECOVERY_INTR, 5129 &mddev->recovery); 5130 kfree(r10b); 5131 return -EIO; 5132 } 5133 sectors -= s; 5134 idx++; 5135 } 5136 kfree(r10b); 5137 return 0; 5138 } 5139 5140 static void end_reshape_write(struct bio *bio) 5141 { 5142 struct r10bio *r10_bio = get_resync_r10bio(bio); 5143 struct mddev *mddev = r10_bio->mddev; 5144 struct r10conf *conf = mddev->private; 5145 int d; 5146 int slot; 5147 int repl; 5148 struct md_rdev *rdev = NULL; 5149 5150 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 5151 if (repl) 5152 rdev = conf->mirrors[d].replacement; 5153 if (!rdev) { 5154 smp_mb(); 5155 rdev = 
conf->mirrors[d].rdev; 5156 } 5157 5158 if (bio->bi_status) { 5159 /* FIXME should record badblock */ 5160 md_error(mddev, rdev); 5161 } 5162 5163 rdev_dec_pending(rdev, mddev); 5164 end_reshape_request(r10_bio); 5165 } 5166 5167 static void end_reshape_request(struct r10bio *r10_bio) 5168 { 5169 if (!atomic_dec_and_test(&r10_bio->remaining)) 5170 return; 5171 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 5172 bio_put(r10_bio->master_bio); 5173 put_buf(r10_bio); 5174 } 5175 5176 static void raid10_finish_reshape(struct mddev *mddev) 5177 { 5178 struct r10conf *conf = mddev->private; 5179 5180 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5181 return; 5182 5183 if (mddev->delta_disks > 0) { 5184 if (mddev->recovery_cp > mddev->resync_max_sectors) { 5185 mddev->recovery_cp = mddev->resync_max_sectors; 5186 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5187 } 5188 mddev->resync_max_sectors = mddev->array_sectors; 5189 } else { 5190 int d; 5191 rcu_read_lock(); 5192 for (d = conf->geo.raid_disks ; 5193 d < conf->geo.raid_disks - mddev->delta_disks; 5194 d++) { 5195 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 5196 if (rdev) 5197 clear_bit(In_sync, &rdev->flags); 5198 rdev = rcu_dereference(conf->mirrors[d].replacement); 5199 if (rdev) 5200 clear_bit(In_sync, &rdev->flags); 5201 } 5202 rcu_read_unlock(); 5203 } 5204 mddev->layout = mddev->new_layout; 5205 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 5206 mddev->reshape_position = MaxSector; 5207 mddev->delta_disks = 0; 5208 mddev->reshape_backwards = 0; 5209 } 5210 5211 static struct md_personality raid10_personality = 5212 { 5213 .name = "raid10", 5214 .level = 10, 5215 .owner = THIS_MODULE, 5216 .make_request = raid10_make_request, 5217 .run = raid10_run, 5218 .free = raid10_free, 5219 .status = raid10_status, 5220 .error_handler = raid10_error, 5221 .hot_add_disk = raid10_add_disk, 5222 .hot_remove_disk= raid10_remove_disk, 5223 .spare_active = raid10_spare_active, 5224 .sync_request 
= raid10_sync_request, 5225 .quiesce = raid10_quiesce, 5226 .size = raid10_size, 5227 .resize = raid10_resize, 5228 .takeover = raid10_takeover, 5229 .check_reshape = raid10_check_reshape, 5230 .start_reshape = raid10_start_reshape, 5231 .finish_reshape = raid10_finish_reshape, 5232 .update_reshape_pos = raid10_update_reshape_pos, 5233 }; 5234 5235 static int __init raid_init(void) 5236 { 5237 return register_md_personality(&raid10_personality); 5238 } 5239 5240 static void raid_exit(void) 5241 { 5242 unregister_md_personality(&raid10_personality); 5243 } 5244 5245 module_init(raid_init); 5246 module_exit(raid_exit); 5247 MODULE_LICENSE("GPL"); 5248 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); 5249 MODULE_ALIAS("md-personality-9"); /* RAID10 */ 5250 MODULE_ALIAS("md-raid10"); 5251 MODULE_ALIAS("md-level-10"); 5252