/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <trace/events/block.h> 57 58 #include "md.h" 59 #include "raid5.h" 60 #include "raid0.h" 61 #include "bitmap.h" 62 63 /* 64 * Stripe cache 65 */ 66 67 #define NR_STRIPES 256 68 #define STRIPE_SIZE PAGE_SIZE 69 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 70 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 71 #define IO_THRESHOLD 1 72 #define BYPASS_THRESHOLD 1 73 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 74 #define HASH_MASK (NR_HASH - 1) 75 76 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 77 { 78 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 79 return &conf->stripe_hashtbl[hash]; 80 } 81 82 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 83 * order without overlap. There may be several bio's per stripe+device, and 84 * a bio could span several devices. 85 * When walking this list for a particular stripe+device, we must never proceed 86 * beyond a bio that extends past this device, as the next bio might no longer 87 * be valid. 
88 * This function is used to determine the 'next' bio in the list, given the sector 89 * of the current stripe+device 90 */ 91 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 92 { 93 int sectors = bio->bi_size >> 9; 94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 95 return bio->bi_next; 96 else 97 return NULL; 98 } 99 100 /* 101 * We maintain a biased count of active stripes in the bottom 16 bits of 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 103 */ 104 static inline int raid5_bi_processed_stripes(struct bio *bio) 105 { 106 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 107 return (atomic_read(segments) >> 16) & 0xffff; 108 } 109 110 static inline int raid5_dec_bi_active_stripes(struct bio *bio) 111 { 112 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 113 return atomic_sub_return(1, segments) & 0xffff; 114 } 115 116 static inline void raid5_inc_bi_active_stripes(struct bio *bio) 117 { 118 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 119 atomic_inc(segments); 120 } 121 122 static inline void raid5_set_bi_processed_stripes(struct bio *bio, 123 unsigned int cnt) 124 { 125 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 126 int old, new; 127 128 do { 129 old = atomic_read(segments); 130 new = (old & 0xffff) | (cnt << 16); 131 } while (atomic_cmpxchg(segments, old, new) != old); 132 } 133 134 static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 135 { 136 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 137 atomic_set(segments, cnt); 138 } 139 140 /* Find first data disk in a raid6 stripe */ 141 static inline int raid6_d0(struct stripe_head *sh) 142 { 143 if (sh->ddf_layout) 144 /* ddf always start from first device */ 145 return 0; 146 /* md starts just after Q block */ 147 if (sh->qd_idx == sh->disks - 1) 148 return 0; 149 else 150 return sh->qd_idx + 1; 151 } 152 static inline int raid6_next_disk(int disk, int raid_disks) 153 { 
154 disk++; 155 return (disk < raid_disks) ? disk : 0; 156 } 157 158 /* When walking through the disks in a raid5, starting at raid6_d0, 159 * We need to map each disk to a 'slot', where the data disks are slot 160 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 161 * is raid_disks-1. This help does that mapping. 162 */ 163 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 164 int *count, int syndrome_disks) 165 { 166 int slot = *count; 167 168 if (sh->ddf_layout) 169 (*count)++; 170 if (idx == sh->pd_idx) 171 return syndrome_disks; 172 if (idx == sh->qd_idx) 173 return syndrome_disks + 1; 174 if (!sh->ddf_layout) 175 (*count)++; 176 return slot; 177 } 178 179 static void return_io(struct bio *return_bi) 180 { 181 struct bio *bi = return_bi; 182 while (bi) { 183 184 return_bi = bi->bi_next; 185 bi->bi_next = NULL; 186 bi->bi_size = 0; 187 bio_endio(bi, 0); 188 bi = return_bi; 189 } 190 } 191 192 static void print_raid5_conf (struct r5conf *conf); 193 194 static int stripe_operations_active(struct stripe_head *sh) 195 { 196 return sh->check_state || sh->reconstruct_state || 197 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 198 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 199 } 200 201 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 202 { 203 BUG_ON(!list_empty(&sh->lru)); 204 BUG_ON(atomic_read(&conf->active_stripes)==0); 205 if (test_bit(STRIPE_HANDLE, &sh->state)) { 206 if (test_bit(STRIPE_DELAYED, &sh->state) && 207 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 208 list_add_tail(&sh->lru, &conf->delayed_list); 209 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 210 sh->bm_seq - conf->seq_write > 0) 211 list_add_tail(&sh->lru, &conf->bitmap_list); 212 else { 213 clear_bit(STRIPE_DELAYED, &sh->state); 214 clear_bit(STRIPE_BIT_DELAY, &sh->state); 215 list_add_tail(&sh->lru, &conf->handle_list); 216 } 217 md_wakeup_thread(conf->mddev->thread); 218 } else { 219 BUG_ON(stripe_operations_active(sh)); 220 if 
(test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 221 if (atomic_dec_return(&conf->preread_active_stripes) 222 < IO_THRESHOLD) 223 md_wakeup_thread(conf->mddev->thread); 224 atomic_dec(&conf->active_stripes); 225 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 226 list_add_tail(&sh->lru, &conf->inactive_list); 227 wake_up(&conf->wait_for_stripe); 228 if (conf->retry_read_aligned) 229 md_wakeup_thread(conf->mddev->thread); 230 } 231 } 232 } 233 234 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 235 { 236 if (atomic_dec_and_test(&sh->count)) 237 do_release_stripe(conf, sh); 238 } 239 240 static void release_stripe(struct stripe_head *sh) 241 { 242 struct r5conf *conf = sh->raid_conf; 243 unsigned long flags; 244 245 local_irq_save(flags); 246 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 247 do_release_stripe(conf, sh); 248 spin_unlock(&conf->device_lock); 249 } 250 local_irq_restore(flags); 251 } 252 253 static inline void remove_hash(struct stripe_head *sh) 254 { 255 pr_debug("remove_hash(), stripe %llu\n", 256 (unsigned long long)sh->sector); 257 258 hlist_del_init(&sh->hash); 259 } 260 261 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 262 { 263 struct hlist_head *hp = stripe_hash(conf, sh->sector); 264 265 pr_debug("insert_hash(), stripe %llu\n", 266 (unsigned long long)sh->sector); 267 268 hlist_add_head(&sh->hash, hp); 269 } 270 271 272 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 273 static struct stripe_head *get_free_stripe(struct r5conf *conf) 274 { 275 struct stripe_head *sh = NULL; 276 struct list_head *first; 277 278 if (list_empty(&conf->inactive_list)) 279 goto out; 280 first = conf->inactive_list.next; 281 sh = list_entry(first, struct stripe_head, lru); 282 list_del_init(first); 283 remove_hash(sh); 284 atomic_inc(&conf->active_stripes); 285 out: 286 return sh; 287 } 288 289 static void shrink_buffers(struct stripe_head *sh) 290 { 291 struct page *p; 292 int i; 293 int num = sh->raid_conf->pool_size; 294 295 for (i = 0; i < num ; i++) { 296 p = sh->dev[i].page; 297 if (!p) 298 continue; 299 sh->dev[i].page = NULL; 300 put_page(p); 301 } 302 } 303 304 static int grow_buffers(struct stripe_head *sh) 305 { 306 int i; 307 int num = sh->raid_conf->pool_size; 308 309 for (i = 0; i < num; i++) { 310 struct page *page; 311 312 if (!(page = alloc_page(GFP_KERNEL))) { 313 return 1; 314 } 315 sh->dev[i].page = page; 316 } 317 return 0; 318 } 319 320 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 321 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 322 struct stripe_head *sh); 323 324 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 325 { 326 struct r5conf *conf = sh->raid_conf; 327 int i; 328 329 BUG_ON(atomic_read(&sh->count) != 0); 330 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 331 BUG_ON(stripe_operations_active(sh)); 332 333 pr_debug("init_stripe called, stripe %llu\n", 334 (unsigned long long)sh->sector); 335 336 remove_hash(sh); 337 338 sh->generation = conf->generation - previous; 339 sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 340 sh->sector = sector; 341 stripe_set_idx(sector, conf, previous, sh); 342 sh->state = 0; 343 344 345 for (i = sh->disks; i--; ) { 346 struct r5dev *dev = &sh->dev[i]; 347 348 if (dev->toread || dev->read || dev->towrite || dev->written || 349 test_bit(R5_LOCKED, &dev->flags)) { 350 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 351 (unsigned long long)sh->sector, i, dev->toread, 352 dev->read, dev->towrite, dev->written, 353 test_bit(R5_LOCKED, &dev->flags)); 354 WARN_ON(1); 355 } 356 dev->flags = 0; 357 raid5_build_block(sh, i, previous); 358 } 359 insert_hash(conf, sh); 360 } 361 362 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 363 short generation) 364 { 365 struct stripe_head *sh; 366 367 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 368 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 369 if (sh->sector == sector && sh->generation == generation) 370 return sh; 371 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 372 return NULL; 373 } 374 375 /* 376 * Need to check if array has failed when deciding whether to: 377 * - start an array 378 * - remove non-faulty devices 379 * - add a spare 380 * - allow a reshape 381 * This determination is simple when no reshape is happening. 382 * However if there is a reshape, we need to carefully check 383 * both the before and after sections. 384 * This is because some failed devices may only affect one 385 * of the two sections, and some non-in_sync devices may 386 * be insync in the section most affected by failed devices. 
*/
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	/* Pass 1: count degraded devices over the pre-reshape geometry. */
	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		/* A faulty primary may still be covered by its replacement. */
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	/* Same disk count before and after: one pass is enough. */
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	/* Pass 2: count degraded devices over the post-reshape geometry. */
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	/* Report the worse (larger) of the two counts. */
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

/*
 * Report whether the array has more degraded devices than it tolerates
 * (conf->max_degraded).  Returns non-zero when it has failed.
 * With no reshape in progress (reshape_position == MaxSector) the count
 * cached in mddev->degraded is used; otherwise it is recomputed with
 * calc_degraded().
 */
static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

/*
 * Find, or set up from the inactive list, the stripe_head for @sector and
 * return it with sh->count incremented.
 *
 * @previous:  non-zero selects the previous generation / disk count
 *             (passed to __find_stripe() and init_stripe()).
 * @noblock:   non-zero makes the function return NULL instead of waiting
 *             when no free stripe can be obtained.
 * @noquiesce: non-zero skips the wait for conf->quiesce to become 0.
 *
 * Runs under conf->device_lock taken with spin_lock_irq(); the waits use
 * wait_event_lock_irq() on that same lock.
 * NOTE(review): presumably the lock is dropped while sleeping in
 * wait_event_lock_irq() - confirm against its definition.
 */
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			/* Not cached: try to claim a free stripe unless
			 * allocation is currently blocked.
			 */
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				/* Wait for an inactive stripe, and for either
				 * the active count to fall below 3/4 of
				 * max_nr_stripes or the blocked flag to be
				 * cleared; then retry the lookup.
				 */
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				/* Already referenced: it may only be on a list
				 * if it is expanding or on an unplug list.
				 */
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state)
				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
			} else {
				/* Unreferenced but cached: take it off whatever
				 * list it is parked on.
				 */
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	/* sh is NULL here only via the noblock break above. */
	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

/*
Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 * Returns 1 for new_data_offset, 0 for data_offset.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	/* No reshape in progress. */
	if (progress == MaxSector)
		return 0;
	/* Previous-generation stripe. */
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

/*
 * Submit the I/O requested on each device of the stripe.
 *
 * For every device, one of the request flags R5_Wantwrite, R5_Wantread or
 * R5_WantReplace is consumed (test_and_clear) and the matching bio is set
 * up and passed to generic_make_request(): dev->req for the main device
 * and, for writes, dev->rreq for the replacement device.  Devices with no
 * request flag set are skipped.  When neither target device is usable the
 * block is unlocked and the stripe is flagged for handling again.
 *
 * @s: only its syncing/expanding/expanded/replacing fields are read here,
 *     to decide whether to account the I/O with md_sync_acct().
 */
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		/* Work out the operation for this device, if any. */
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		bi->bi_rw = rw;
		rbi->bi_rw = rw;
		if (rw & WRITE) {
			bi->bi_end_io = raid5_end_write_request;
			rbi->bi_end_io = raid5_end_write_request;
		} else
			bi->bi_end_io = raid5_end_read_request;

		/* Choose the target device(s) under RCU and pin them with
		 * nr_pending before leaving the read-side section.
		 */
		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			/* Reads go to exactly one device. */
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			/* NOTE(review): bad < 0 appears to mean a bad block
			 * that has not been acknowledged yet - confirm against
			 * is_badblock().
			 */
			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			/* Stripe reference held for the in-flight bio. */
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector
						 + rdev->data_offset);
			/* NOTE(review): REQ_FLUSH is set for R5_ReadNoMerge,
			 * presumably to stop the block layer merging this
			 * read - confirm.
			 */
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_FLUSH;

			/* Reset the embedded bio to a single STRIPE_SIZE segment. */
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_idx = 0;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
			trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
					      bi, disk_devt(conf->mddev->gendisk),
					      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			/* Same submission sequence, aimed at the replacement. */
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			rbi->bi_bdev = rrdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_flags = 1 << BIO_UPTODATE;
			rbi->bi_idx = 0;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			rbi->bi_next = NULL;
			trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
					      rbi, disk_devt(conf->mddev->gendisk),
					      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			/* Nothing was submitted: unlock the block and ask for
			 * the stripe to be handled again.
			 */
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

/*
 * Copy the part of @bio that overlaps the stripe page between the bio's
 * segments and @page, one async_memcpy() per segment, chained on @tx.
 *
 * @frombio: non-zero copies bio -> page, zero copies page -> bio.
 * @sector:  sector that offset 0 of @page corresponds to.
 * Returns the descriptor of the last operation submitted.
 */
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	/* Byte offset of the bio start within the page; negative when the
	 * bio begins before this page's first sector.
	 */
	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		/* Skip the leading part of the segment that lies before
		 * the page.
		 */
		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		/* Clip the copy at the end of the page. */
		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

/*
 * Completion callback for ops_run_biofill().
 * For every device flagged R5_Wantfill, walks dev->read and drops one
 * active-stripe reference per bio; bios whose count reaches zero are
 * collected and completed through return_io().  Finally clears
 * STRIPE_BIOFILL_RUN, sets STRIPE_HANDLE and releases the stripe reference
 * taken when the operation was started.
 */
static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				/* Fetch the successor first: rbi is relinked
				 * onto return_bi below.
				 */
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/*
 * Satisfy pending reads from the stripe cache: for each device flagged
 * R5_Wantfill, move dev->toread to dev->read (under sh->stripe_lock) and
 * copy the stripe page into every bio on that list.  A stripe reference is
 * taken and ops_complete_biofill() is scheduled as the completion callback.
 */
static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
					dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	/* Reference dropped by release_stripe() in ops_complete_biofill(). */
	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

/*
 * Mark device @target of the stripe R5_UPTODATE and clear its
 * R5_Wantcompute flag.  A negative @target means "no target" and is ignored.
 */
static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

/*
 * Completion callback shared by the compute operations: marks the computed
 * target(s) up to date, clears STRIPE_COMPUTE_RUN, advances check_state if
 * a check-driven compute was running, then sets STRIPE_HANDLE and releases
 * the stripe reference.
 */
static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
/* The region starts after (sh->disks + 2) page pointers in percpu->scribble. */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

/*
 * Recompute the single block sh->ops.target by XOR-ing every other block
 * of the stripe into it (a plain copy when there is only one source).
 * Takes a stripe reference; ops_complete_compute() runs on completion.
 * Returns the async descriptor of the submitted operation.
 */
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	/* Every block except the target is a source. */
	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	/* Walk all devices once, starting at the first data disk. */
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

/*
 * Recompute one missing block of a raid6 stripe.
 * Exactly one of sh->ops.target / sh->ops.target2 must be valid (>= 0).
 * When the target is the Q block it is regenerated with
 * async_gen_syndrome(); any other block is rebuilt by XOR over all blocks
 * except the target and Q.  Takes a stripe reference;
 * ops_complete_compute() runs on completion.
 */
static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

/*
 * Recompute two missing blocks of a raid6 stripe (sh->ops.target and
 * sh->ops.target2, both required to be valid).
 * The targets are translated to syndrome slots (faila < failb) and the
 * recovery method is chosen by which slots are missing: P+Q, D+Q, D+P or
 * D+D.  Takes a stripe reference; ops_complete_compute() runs when the
 * final operation completes.
 */
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	/* Order the failed slots so that faila < failb. */
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			/* No callback on the XOR step: completion is
			 * signalled by the syndrome step chained after it.
			 */
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


/* Completion callback for ops_run_prexor(); it only emits a debug message. */
static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

/*
 * XOR the blocks flagged R5_Wantdrain into the parity block
 * (sh->dev[pd_idx].page), which is used both as first source and as
 * destination.  The operation is chained after @tx; returns the new
 * descriptor.
 */
static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);

1179 while (wbi && wbi->bi_sector < 1180 dev->sector + STRIPE_SECTORS) { 1181 if (wbi->bi_rw & REQ_FUA) 1182 set_bit(R5_WantFUA, &dev->flags); 1183 if (wbi->bi_rw & REQ_SYNC) 1184 set_bit(R5_SyncIO, &dev->flags); 1185 if (wbi->bi_rw & REQ_DISCARD) 1186 set_bit(R5_Discard, &dev->flags); 1187 else 1188 tx = async_copy_data(1, wbi, dev->page, 1189 dev->sector, tx); 1190 wbi = r5_next_bio(wbi, dev->sector); 1191 } 1192 } 1193 } 1194 1195 return tx; 1196 } 1197 1198 static void ops_complete_reconstruct(void *stripe_head_ref) 1199 { 1200 struct stripe_head *sh = stripe_head_ref; 1201 int disks = sh->disks; 1202 int pd_idx = sh->pd_idx; 1203 int qd_idx = sh->qd_idx; 1204 int i; 1205 bool fua = false, sync = false, discard = false; 1206 1207 pr_debug("%s: stripe %llu\n", __func__, 1208 (unsigned long long)sh->sector); 1209 1210 for (i = disks; i--; ) { 1211 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1212 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1213 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1214 } 1215 1216 for (i = disks; i--; ) { 1217 struct r5dev *dev = &sh->dev[i]; 1218 1219 if (dev->written || i == pd_idx || i == qd_idx) { 1220 if (!discard) 1221 set_bit(R5_UPTODATE, &dev->flags); 1222 if (fua) 1223 set_bit(R5_WantFUA, &dev->flags); 1224 if (sync) 1225 set_bit(R5_SyncIO, &dev->flags); 1226 } 1227 } 1228 1229 if (sh->reconstruct_state == reconstruct_state_drain_run) 1230 sh->reconstruct_state = reconstruct_state_drain_result; 1231 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1232 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1233 else { 1234 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1235 sh->reconstruct_state = reconstruct_state_result; 1236 } 1237 1238 set_bit(STRIPE_HANDLE, &sh->state); 1239 release_stripe(sh); 1240 } 1241 1242 static void 1243 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1244 struct dma_async_tx_descriptor *tx) 1245 { 1246 int disks = 
sh->disks; 1247 struct page **xor_srcs = percpu->scribble; 1248 struct async_submit_ctl submit; 1249 int count = 0, pd_idx = sh->pd_idx, i; 1250 struct page *xor_dest; 1251 int prexor = 0; 1252 unsigned long flags; 1253 1254 pr_debug("%s: stripe %llu\n", __func__, 1255 (unsigned long long)sh->sector); 1256 1257 for (i = 0; i < sh->disks; i++) { 1258 if (pd_idx == i) 1259 continue; 1260 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1261 break; 1262 } 1263 if (i >= sh->disks) { 1264 atomic_inc(&sh->count); 1265 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1266 ops_complete_reconstruct(sh); 1267 return; 1268 } 1269 /* check if prexor is active which means only process blocks 1270 * that are part of a read-modify-write (written) 1271 */ 1272 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1273 prexor = 1; 1274 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1275 for (i = disks; i--; ) { 1276 struct r5dev *dev = &sh->dev[i]; 1277 if (dev->written) 1278 xor_srcs[count++] = dev->page; 1279 } 1280 } else { 1281 xor_dest = sh->dev[pd_idx].page; 1282 for (i = disks; i--; ) { 1283 struct r5dev *dev = &sh->dev[i]; 1284 if (i != pd_idx) 1285 xor_srcs[count++] = dev->page; 1286 } 1287 } 1288 1289 /* 1/ if we prexor'd then the dest is reused as a source 1290 * 2/ if we did not prexor then we are redoing the parity 1291 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1292 * for the synchronous xor case 1293 */ 1294 flags = ASYNC_TX_ACK | 1295 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1296 1297 atomic_inc(&sh->count); 1298 1299 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1300 to_addr_conv(sh, percpu)); 1301 if (unlikely(count == 1)) 1302 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1303 else 1304 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1305 } 1306 1307 static void 1308 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1309 struct dma_async_tx_descriptor *tx) 1310 { 1311 struct async_submit_ctl submit; 1312 struct page **blocks = percpu->scribble; 1313 int count, i; 1314 1315 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1316 1317 for (i = 0; i < sh->disks; i++) { 1318 if (sh->pd_idx == i || sh->qd_idx == i) 1319 continue; 1320 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1321 break; 1322 } 1323 if (i >= sh->disks) { 1324 atomic_inc(&sh->count); 1325 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1326 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1327 ops_complete_reconstruct(sh); 1328 return; 1329 } 1330 1331 count = set_syndrome_sources(blocks, sh); 1332 1333 atomic_inc(&sh->count); 1334 1335 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1336 sh, to_addr_conv(sh, percpu)); 1337 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1338 } 1339 1340 static void ops_complete_check(void *stripe_head_ref) 1341 { 1342 struct stripe_head *sh = stripe_head_ref; 1343 1344 pr_debug("%s: stripe %llu\n", __func__, 1345 (unsigned long long)sh->sector); 1346 1347 sh->check_state = check_state_check_result; 1348 set_bit(STRIPE_HANDLE, &sh->state); 1349 release_stripe(sh); 1350 } 1351 1352 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1353 { 1354 int disks = sh->disks; 1355 int pd_idx = sh->pd_idx; 1356 int qd_idx = sh->qd_idx; 1357 struct page *xor_dest; 1358 struct page **xor_srcs = percpu->scribble; 1359 struct 
dma_async_tx_descriptor *tx; 1360 struct async_submit_ctl submit; 1361 int count; 1362 int i; 1363 1364 pr_debug("%s: stripe %llu\n", __func__, 1365 (unsigned long long)sh->sector); 1366 1367 count = 0; 1368 xor_dest = sh->dev[pd_idx].page; 1369 xor_srcs[count++] = xor_dest; 1370 for (i = disks; i--; ) { 1371 if (i == pd_idx || i == qd_idx) 1372 continue; 1373 xor_srcs[count++] = sh->dev[i].page; 1374 } 1375 1376 init_async_submit(&submit, 0, NULL, NULL, NULL, 1377 to_addr_conv(sh, percpu)); 1378 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1379 &sh->ops.zero_sum_result, &submit); 1380 1381 atomic_inc(&sh->count); 1382 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1383 tx = async_trigger_callback(&submit); 1384 } 1385 1386 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1387 { 1388 struct page **srcs = percpu->scribble; 1389 struct async_submit_ctl submit; 1390 int count; 1391 1392 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1393 (unsigned long long)sh->sector, checkp); 1394 1395 count = set_syndrome_sources(srcs, sh); 1396 if (!checkp) 1397 srcs[count] = NULL; 1398 1399 atomic_inc(&sh->count); 1400 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1401 sh, to_addr_conv(sh, percpu)); 1402 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1403 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1404 } 1405 1406 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1407 { 1408 int overlap_clear = 0, i, disks = sh->disks; 1409 struct dma_async_tx_descriptor *tx = NULL; 1410 struct r5conf *conf = sh->raid_conf; 1411 int level = conf->level; 1412 struct raid5_percpu *percpu; 1413 unsigned long cpu; 1414 1415 cpu = get_cpu(); 1416 percpu = per_cpu_ptr(conf->percpu, cpu); 1417 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1418 ops_run_biofill(sh); 1419 overlap_clear++; 1420 } 1421 1422 if (test_bit(STRIPE_OP_COMPUTE_BLK, 
&ops_request)) { 1423 if (level < 6) 1424 tx = ops_run_compute5(sh, percpu); 1425 else { 1426 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1427 tx = ops_run_compute6_1(sh, percpu); 1428 else 1429 tx = ops_run_compute6_2(sh, percpu); 1430 } 1431 /* terminate the chain if reconstruct is not set to be run */ 1432 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1433 async_tx_ack(tx); 1434 } 1435 1436 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1437 tx = ops_run_prexor(sh, percpu, tx); 1438 1439 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1440 tx = ops_run_biodrain(sh, tx); 1441 overlap_clear++; 1442 } 1443 1444 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1445 if (level < 6) 1446 ops_run_reconstruct5(sh, percpu, tx); 1447 else 1448 ops_run_reconstruct6(sh, percpu, tx); 1449 } 1450 1451 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1452 if (sh->check_state == check_state_run) 1453 ops_run_check_p(sh, percpu); 1454 else if (sh->check_state == check_state_run_q) 1455 ops_run_check_pq(sh, percpu, 0); 1456 else if (sh->check_state == check_state_run_pq) 1457 ops_run_check_pq(sh, percpu, 1); 1458 else 1459 BUG(); 1460 } 1461 1462 if (overlap_clear) 1463 for (i = disks; i--; ) { 1464 struct r5dev *dev = &sh->dev[i]; 1465 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1466 wake_up(&sh->raid_conf->wait_for_overlap); 1467 } 1468 put_cpu(); 1469 } 1470 1471 static int grow_one_stripe(struct r5conf *conf) 1472 { 1473 struct stripe_head *sh; 1474 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1475 if (!sh) 1476 return 0; 1477 1478 sh->raid_conf = conf; 1479 1480 spin_lock_init(&sh->stripe_lock); 1481 1482 if (grow_buffers(sh)) { 1483 shrink_buffers(sh); 1484 kmem_cache_free(conf->slab_cache, sh); 1485 return 0; 1486 } 1487 /* we just created an active stripe so... 
*/ 1488 atomic_set(&sh->count, 1); 1489 atomic_inc(&conf->active_stripes); 1490 INIT_LIST_HEAD(&sh->lru); 1491 release_stripe(sh); 1492 return 1; 1493 } 1494 1495 static int grow_stripes(struct r5conf *conf, int num) 1496 { 1497 struct kmem_cache *sc; 1498 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1499 1500 if (conf->mddev->gendisk) 1501 sprintf(conf->cache_name[0], 1502 "raid%d-%s", conf->level, mdname(conf->mddev)); 1503 else 1504 sprintf(conf->cache_name[0], 1505 "raid%d-%p", conf->level, conf->mddev); 1506 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1507 1508 conf->active_name = 0; 1509 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1510 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1511 0, 0, NULL); 1512 if (!sc) 1513 return 1; 1514 conf->slab_cache = sc; 1515 conf->pool_size = devs; 1516 while (num--) 1517 if (!grow_one_stripe(conf)) 1518 return 1; 1519 return 0; 1520 } 1521 1522 /** 1523 * scribble_len - return the required size of the scribble region 1524 * @num - total number of disks in the array 1525 * 1526 * The size must be enough to contain: 1527 * 1/ a struct page pointer for each device in the array +2 1528 * 2/ room to convert each entry in (1) to its corresponding dma 1529 * (dma_map_page()) or page (page_address()) address. 1530 * 1531 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1532 * calculate over all devices (not just the data blocks), using zeros in place 1533 * of the P and Q blocks. 1534 */ 1535 static size_t scribble_len(int num) 1536 { 1537 size_t len; 1538 1539 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1540 1541 return len; 1542 } 1543 1544 static int resize_stripes(struct r5conf *conf, int newsize) 1545 { 1546 /* Make all the stripes able to hold 'newsize' devices. 1547 * New slots in each stripe get 'page' set to a new page. 
1548 * 1549 * This happens in stages: 1550 * 1/ create a new kmem_cache and allocate the required number of 1551 * stripe_heads. 1552 * 2/ gather all the old stripe_heads and transfer the pages across 1553 * to the new stripe_heads. This will have the side effect of 1554 * freezing the array as once all stripe_heads have been collected, 1555 * no IO will be possible. Old stripe heads are freed once their 1556 * pages have been transferred over, and the old kmem_cache is 1557 * freed when all stripes are done. 1558 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 1559 * we simple return a failre status - no need to clean anything up. 1560 * 4/ allocate new pages for the new slots in the new stripe_heads. 1561 * If this fails, we don't bother trying the shrink the 1562 * stripe_heads down again, we just leave them as they are. 1563 * As each stripe_head is processed the new one is released into 1564 * active service. 1565 * 1566 * Once step2 is started, we cannot afford to wait for a write, 1567 * so we use GFP_NOIO allocations. 
1568 */ 1569 struct stripe_head *osh, *nsh; 1570 LIST_HEAD(newstripes); 1571 struct disk_info *ndisks; 1572 unsigned long cpu; 1573 int err; 1574 struct kmem_cache *sc; 1575 int i; 1576 1577 if (newsize <= conf->pool_size) 1578 return 0; /* never bother to shrink */ 1579 1580 err = md_allow_write(conf->mddev); 1581 if (err) 1582 return err; 1583 1584 /* Step 1 */ 1585 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1586 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1587 0, 0, NULL); 1588 if (!sc) 1589 return -ENOMEM; 1590 1591 for (i = conf->max_nr_stripes; i; i--) { 1592 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1593 if (!nsh) 1594 break; 1595 1596 nsh->raid_conf = conf; 1597 spin_lock_init(&nsh->stripe_lock); 1598 1599 list_add(&nsh->lru, &newstripes); 1600 } 1601 if (i) { 1602 /* didn't get enough, give up */ 1603 while (!list_empty(&newstripes)) { 1604 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1605 list_del(&nsh->lru); 1606 kmem_cache_free(sc, nsh); 1607 } 1608 kmem_cache_destroy(sc); 1609 return -ENOMEM; 1610 } 1611 /* Step 2 - Must use GFP_NOIO now. 1612 * OK, we have enough stripes, start collecting inactive 1613 * stripes and copying them over 1614 */ 1615 list_for_each_entry(nsh, &newstripes, lru) { 1616 spin_lock_irq(&conf->device_lock); 1617 wait_event_lock_irq(conf->wait_for_stripe, 1618 !list_empty(&conf->inactive_list), 1619 conf->device_lock); 1620 osh = get_free_stripe(conf); 1621 spin_unlock_irq(&conf->device_lock); 1622 atomic_set(&nsh->count, 1); 1623 for(i=0; i<conf->pool_size; i++) 1624 nsh->dev[i].page = osh->dev[i].page; 1625 for( ; i<newsize; i++) 1626 nsh->dev[i].page = NULL; 1627 kmem_cache_free(conf->slab_cache, osh); 1628 } 1629 kmem_cache_destroy(conf->slab_cache); 1630 1631 /* Step 3. 
1632 * At this point, we are holding all the stripes so the array 1633 * is completely stalled, so now is a good time to resize 1634 * conf->disks and the scribble region 1635 */ 1636 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1637 if (ndisks) { 1638 for (i=0; i<conf->raid_disks; i++) 1639 ndisks[i] = conf->disks[i]; 1640 kfree(conf->disks); 1641 conf->disks = ndisks; 1642 } else 1643 err = -ENOMEM; 1644 1645 get_online_cpus(); 1646 conf->scribble_len = scribble_len(newsize); 1647 for_each_present_cpu(cpu) { 1648 struct raid5_percpu *percpu; 1649 void *scribble; 1650 1651 percpu = per_cpu_ptr(conf->percpu, cpu); 1652 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1653 1654 if (scribble) { 1655 kfree(percpu->scribble); 1656 percpu->scribble = scribble; 1657 } else { 1658 err = -ENOMEM; 1659 break; 1660 } 1661 } 1662 put_online_cpus(); 1663 1664 /* Step 4, return new stripes to service */ 1665 while(!list_empty(&newstripes)) { 1666 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1667 list_del_init(&nsh->lru); 1668 1669 for (i=conf->raid_disks; i < newsize; i++) 1670 if (nsh->dev[i].page == NULL) { 1671 struct page *p = alloc_page(GFP_NOIO); 1672 nsh->dev[i].page = p; 1673 if (!p) 1674 err = -ENOMEM; 1675 } 1676 release_stripe(nsh); 1677 } 1678 /* critical section pass, GFP_NOIO no longer needed */ 1679 1680 conf->slab_cache = sc; 1681 conf->active_name = 1-conf->active_name; 1682 conf->pool_size = newsize; 1683 return err; 1684 } 1685 1686 static int drop_one_stripe(struct r5conf *conf) 1687 { 1688 struct stripe_head *sh; 1689 1690 spin_lock_irq(&conf->device_lock); 1691 sh = get_free_stripe(conf); 1692 spin_unlock_irq(&conf->device_lock); 1693 if (!sh) 1694 return 0; 1695 BUG_ON(atomic_read(&sh->count)); 1696 shrink_buffers(sh); 1697 kmem_cache_free(conf->slab_cache, sh); 1698 atomic_dec(&conf->active_stripes); 1699 return 1; 1700 } 1701 1702 static void shrink_stripes(struct r5conf *conf) 1703 { 1704 while (drop_one_stripe(conf)) 
1705 ; 1706 1707 if (conf->slab_cache) 1708 kmem_cache_destroy(conf->slab_cache); 1709 conf->slab_cache = NULL; 1710 } 1711 1712 static void raid5_end_read_request(struct bio * bi, int error) 1713 { 1714 struct stripe_head *sh = bi->bi_private; 1715 struct r5conf *conf = sh->raid_conf; 1716 int disks = sh->disks, i; 1717 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1718 char b[BDEVNAME_SIZE]; 1719 struct md_rdev *rdev = NULL; 1720 sector_t s; 1721 1722 for (i=0 ; i<disks; i++) 1723 if (bi == &sh->dev[i].req) 1724 break; 1725 1726 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1727 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1728 uptodate); 1729 if (i == disks) { 1730 BUG(); 1731 return; 1732 } 1733 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1734 /* If replacement finished while this request was outstanding, 1735 * 'replacement' might be NULL already. 1736 * In that case it moved down to 'rdev'. 1737 * rdev is not removed until all requests are finished. 1738 */ 1739 rdev = conf->disks[i].replacement; 1740 if (!rdev) 1741 rdev = conf->disks[i].rdev; 1742 1743 if (use_new_offset(conf, sh)) 1744 s = sh->sector + rdev->new_data_offset; 1745 else 1746 s = sh->sector + rdev->data_offset; 1747 if (uptodate) { 1748 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1749 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1750 /* Note that this cannot happen on a 1751 * replacement device. 
We just fail those on 1752 * any error 1753 */ 1754 printk_ratelimited( 1755 KERN_INFO 1756 "md/raid:%s: read error corrected" 1757 " (%lu sectors at %llu on %s)\n", 1758 mdname(conf->mddev), STRIPE_SECTORS, 1759 (unsigned long long)s, 1760 bdevname(rdev->bdev, b)); 1761 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1762 clear_bit(R5_ReadError, &sh->dev[i].flags); 1763 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1764 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1765 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1766 1767 if (atomic_read(&rdev->read_errors)) 1768 atomic_set(&rdev->read_errors, 0); 1769 } else { 1770 const char *bdn = bdevname(rdev->bdev, b); 1771 int retry = 0; 1772 int set_bad = 0; 1773 1774 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1775 atomic_inc(&rdev->read_errors); 1776 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1777 printk_ratelimited( 1778 KERN_WARNING 1779 "md/raid:%s: read error on replacement device " 1780 "(sector %llu on %s).\n", 1781 mdname(conf->mddev), 1782 (unsigned long long)s, 1783 bdn); 1784 else if (conf->mddev->degraded >= conf->max_degraded) { 1785 set_bad = 1; 1786 printk_ratelimited( 1787 KERN_WARNING 1788 "md/raid:%s: read error not correctable " 1789 "(sector %llu on %s).\n", 1790 mdname(conf->mddev), 1791 (unsigned long long)s, 1792 bdn); 1793 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1794 /* Oh, no!!! */ 1795 set_bad = 1; 1796 printk_ratelimited( 1797 KERN_WARNING 1798 "md/raid:%s: read error NOT corrected!! 
" 1799 "(sector %llu on %s).\n", 1800 mdname(conf->mddev), 1801 (unsigned long long)s, 1802 bdn); 1803 } else if (atomic_read(&rdev->read_errors) 1804 > conf->max_nr_stripes) 1805 printk(KERN_WARNING 1806 "md/raid:%s: Too many read errors, failing device %s.\n", 1807 mdname(conf->mddev), bdn); 1808 else 1809 retry = 1; 1810 if (retry) 1811 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1812 set_bit(R5_ReadError, &sh->dev[i].flags); 1813 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1814 } else 1815 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1816 else { 1817 clear_bit(R5_ReadError, &sh->dev[i].flags); 1818 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1819 if (!(set_bad 1820 && test_bit(In_sync, &rdev->flags) 1821 && rdev_set_badblocks( 1822 rdev, sh->sector, STRIPE_SECTORS, 0))) 1823 md_error(conf->mddev, rdev); 1824 } 1825 } 1826 rdev_dec_pending(rdev, conf->mddev); 1827 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1828 set_bit(STRIPE_HANDLE, &sh->state); 1829 release_stripe(sh); 1830 } 1831 1832 static void raid5_end_write_request(struct bio *bi, int error) 1833 { 1834 struct stripe_head *sh = bi->bi_private; 1835 struct r5conf *conf = sh->raid_conf; 1836 int disks = sh->disks, i; 1837 struct md_rdev *uninitialized_var(rdev); 1838 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1839 sector_t first_bad; 1840 int bad_sectors; 1841 int replacement = 0; 1842 1843 for (i = 0 ; i < disks; i++) { 1844 if (bi == &sh->dev[i].req) { 1845 rdev = conf->disks[i].rdev; 1846 break; 1847 } 1848 if (bi == &sh->dev[i].rreq) { 1849 rdev = conf->disks[i].replacement; 1850 if (rdev) 1851 replacement = 1; 1852 else 1853 /* rdev was removed and 'replacement' 1854 * replaced it. rdev is not removed 1855 * until all requests are finished. 
1856 */ 1857 rdev = conf->disks[i].rdev; 1858 break; 1859 } 1860 } 1861 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1862 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1863 uptodate); 1864 if (i == disks) { 1865 BUG(); 1866 return; 1867 } 1868 1869 if (replacement) { 1870 if (!uptodate) 1871 md_error(conf->mddev, rdev); 1872 else if (is_badblock(rdev, sh->sector, 1873 STRIPE_SECTORS, 1874 &first_bad, &bad_sectors)) 1875 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1876 } else { 1877 if (!uptodate) { 1878 set_bit(WriteErrorSeen, &rdev->flags); 1879 set_bit(R5_WriteError, &sh->dev[i].flags); 1880 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1881 set_bit(MD_RECOVERY_NEEDED, 1882 &rdev->mddev->recovery); 1883 } else if (is_badblock(rdev, sh->sector, 1884 STRIPE_SECTORS, 1885 &first_bad, &bad_sectors)) 1886 set_bit(R5_MadeGood, &sh->dev[i].flags); 1887 } 1888 rdev_dec_pending(rdev, conf->mddev); 1889 1890 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1891 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1892 set_bit(STRIPE_HANDLE, &sh->state); 1893 release_stripe(sh); 1894 } 1895 1896 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1897 1898 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1899 { 1900 struct r5dev *dev = &sh->dev[i]; 1901 1902 bio_init(&dev->req); 1903 dev->req.bi_io_vec = &dev->vec; 1904 dev->req.bi_vcnt++; 1905 dev->req.bi_max_vecs++; 1906 dev->req.bi_private = sh; 1907 dev->vec.bv_page = dev->page; 1908 1909 bio_init(&dev->rreq); 1910 dev->rreq.bi_io_vec = &dev->rvec; 1911 dev->rreq.bi_vcnt++; 1912 dev->rreq.bi_max_vecs++; 1913 dev->rreq.bi_private = sh; 1914 dev->rvec.bv_page = dev->page; 1915 1916 dev->flags = 0; 1917 dev->sector = compute_blocknr(sh, i, previous); 1918 } 1919 1920 static void error(struct mddev *mddev, struct md_rdev *rdev) 1921 { 1922 char b[BDEVNAME_SIZE]; 1923 struct r5conf *conf = mddev->private; 1924 unsigned long flags; 
1925 pr_debug("raid456: error called\n"); 1926 1927 spin_lock_irqsave(&conf->device_lock, flags); 1928 clear_bit(In_sync, &rdev->flags); 1929 mddev->degraded = calc_degraded(conf); 1930 spin_unlock_irqrestore(&conf->device_lock, flags); 1931 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1932 1933 set_bit(Blocked, &rdev->flags); 1934 set_bit(Faulty, &rdev->flags); 1935 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1936 printk(KERN_ALERT 1937 "md/raid:%s: Disk failure on %s, disabling device.\n" 1938 "md/raid:%s: Operation continuing on %d devices.\n", 1939 mdname(mddev), 1940 bdevname(rdev->bdev, b), 1941 mdname(mddev), 1942 conf->raid_disks - mddev->degraded); 1943 } 1944 1945 /* 1946 * Input: a 'big' sector number, 1947 * Output: index of the data and parity disk, and the sector # in them. 1948 */ 1949 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1950 int previous, int *dd_idx, 1951 struct stripe_head *sh) 1952 { 1953 sector_t stripe, stripe2; 1954 sector_t chunk_number; 1955 unsigned int chunk_offset; 1956 int pd_idx, qd_idx; 1957 int ddf_layout = 0; 1958 sector_t new_sector; 1959 int algorithm = previous ? conf->prev_algo 1960 : conf->algorithm; 1961 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1962 : conf->chunk_sectors; 1963 int raid_disks = previous ? conf->previous_raid_disks 1964 : conf->raid_disks; 1965 int data_disks = raid_disks - conf->max_degraded; 1966 1967 /* First compute the information on this sector */ 1968 1969 /* 1970 * Compute the chunk number and the sector offset inside the chunk 1971 */ 1972 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1973 chunk_number = r_sector; 1974 1975 /* 1976 * Compute the stripe number 1977 */ 1978 stripe = chunk_number; 1979 *dd_idx = sector_div(stripe, data_disks); 1980 stripe2 = stripe; 1981 /* 1982 * Select the parity disk based on the user selected algorithm. 
1983 */ 1984 pd_idx = qd_idx = -1; 1985 switch(conf->level) { 1986 case 4: 1987 pd_idx = data_disks; 1988 break; 1989 case 5: 1990 switch (algorithm) { 1991 case ALGORITHM_LEFT_ASYMMETRIC: 1992 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1993 if (*dd_idx >= pd_idx) 1994 (*dd_idx)++; 1995 break; 1996 case ALGORITHM_RIGHT_ASYMMETRIC: 1997 pd_idx = sector_div(stripe2, raid_disks); 1998 if (*dd_idx >= pd_idx) 1999 (*dd_idx)++; 2000 break; 2001 case ALGORITHM_LEFT_SYMMETRIC: 2002 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2003 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2004 break; 2005 case ALGORITHM_RIGHT_SYMMETRIC: 2006 pd_idx = sector_div(stripe2, raid_disks); 2007 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2008 break; 2009 case ALGORITHM_PARITY_0: 2010 pd_idx = 0; 2011 (*dd_idx)++; 2012 break; 2013 case ALGORITHM_PARITY_N: 2014 pd_idx = data_disks; 2015 break; 2016 default: 2017 BUG(); 2018 } 2019 break; 2020 case 6: 2021 2022 switch (algorithm) { 2023 case ALGORITHM_LEFT_ASYMMETRIC: 2024 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2025 qd_idx = pd_idx + 1; 2026 if (pd_idx == raid_disks-1) { 2027 (*dd_idx)++; /* Q D D D P */ 2028 qd_idx = 0; 2029 } else if (*dd_idx >= pd_idx) 2030 (*dd_idx) += 2; /* D D P Q D */ 2031 break; 2032 case ALGORITHM_RIGHT_ASYMMETRIC: 2033 pd_idx = sector_div(stripe2, raid_disks); 2034 qd_idx = pd_idx + 1; 2035 if (pd_idx == raid_disks-1) { 2036 (*dd_idx)++; /* Q D D D P */ 2037 qd_idx = 0; 2038 } else if (*dd_idx >= pd_idx) 2039 (*dd_idx) += 2; /* D D P Q D */ 2040 break; 2041 case ALGORITHM_LEFT_SYMMETRIC: 2042 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2043 qd_idx = (pd_idx + 1) % raid_disks; 2044 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2045 break; 2046 case ALGORITHM_RIGHT_SYMMETRIC: 2047 pd_idx = sector_div(stripe2, raid_disks); 2048 qd_idx = (pd_idx + 1) % raid_disks; 2049 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2050 break; 2051 2052 case ALGORITHM_PARITY_0: 
2053 pd_idx = 0; 2054 qd_idx = 1; 2055 (*dd_idx) += 2; 2056 break; 2057 case ALGORITHM_PARITY_N: 2058 pd_idx = data_disks; 2059 qd_idx = data_disks + 1; 2060 break; 2061 2062 case ALGORITHM_ROTATING_ZERO_RESTART: 2063 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2064 * of blocks for computing Q is different. 2065 */ 2066 pd_idx = sector_div(stripe2, raid_disks); 2067 qd_idx = pd_idx + 1; 2068 if (pd_idx == raid_disks-1) { 2069 (*dd_idx)++; /* Q D D D P */ 2070 qd_idx = 0; 2071 } else if (*dd_idx >= pd_idx) 2072 (*dd_idx) += 2; /* D D P Q D */ 2073 ddf_layout = 1; 2074 break; 2075 2076 case ALGORITHM_ROTATING_N_RESTART: 2077 /* Same a left_asymmetric, by first stripe is 2078 * D D D P Q rather than 2079 * Q D D D P 2080 */ 2081 stripe2 += 1; 2082 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2083 qd_idx = pd_idx + 1; 2084 if (pd_idx == raid_disks-1) { 2085 (*dd_idx)++; /* Q D D D P */ 2086 qd_idx = 0; 2087 } else if (*dd_idx >= pd_idx) 2088 (*dd_idx) += 2; /* D D P Q D */ 2089 ddf_layout = 1; 2090 break; 2091 2092 case ALGORITHM_ROTATING_N_CONTINUE: 2093 /* Same as left_symmetric but Q is before P */ 2094 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2095 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2096 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2097 ddf_layout = 1; 2098 break; 2099 2100 case ALGORITHM_LEFT_ASYMMETRIC_6: 2101 /* RAID5 left_asymmetric, with Q on last device */ 2102 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2103 if (*dd_idx >= pd_idx) 2104 (*dd_idx)++; 2105 qd_idx = raid_disks - 1; 2106 break; 2107 2108 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2109 pd_idx = sector_div(stripe2, raid_disks-1); 2110 if (*dd_idx >= pd_idx) 2111 (*dd_idx)++; 2112 qd_idx = raid_disks - 1; 2113 break; 2114 2115 case ALGORITHM_LEFT_SYMMETRIC_6: 2116 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2117 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2118 qd_idx = raid_disks - 1; 2119 break; 2120 2121 case 
ALGORITHM_RIGHT_SYMMETRIC_6: 2122 pd_idx = sector_div(stripe2, raid_disks-1); 2123 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2124 qd_idx = raid_disks - 1; 2125 break; 2126 2127 case ALGORITHM_PARITY_0_6: 2128 pd_idx = 0; 2129 (*dd_idx)++; 2130 qd_idx = raid_disks - 1; 2131 break; 2132 2133 default: 2134 BUG(); 2135 } 2136 break; 2137 } 2138 2139 if (sh) { 2140 sh->pd_idx = pd_idx; 2141 sh->qd_idx = qd_idx; 2142 sh->ddf_layout = ddf_layout; 2143 } 2144 /* 2145 * Finally, compute the new sector number 2146 */ 2147 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2148 return new_sector; 2149 } 2150 2151 2152 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2153 { 2154 struct r5conf *conf = sh->raid_conf; 2155 int raid_disks = sh->disks; 2156 int data_disks = raid_disks - conf->max_degraded; 2157 sector_t new_sector = sh->sector, check; 2158 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2159 : conf->chunk_sectors; 2160 int algorithm = previous ? 
conf->prev_algo 2161 : conf->algorithm; 2162 sector_t stripe; 2163 int chunk_offset; 2164 sector_t chunk_number; 2165 int dummy1, dd_idx = i; 2166 sector_t r_sector; 2167 struct stripe_head sh2; 2168 2169 2170 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2171 stripe = new_sector; 2172 2173 if (i == sh->pd_idx) 2174 return 0; 2175 switch(conf->level) { 2176 case 4: break; 2177 case 5: 2178 switch (algorithm) { 2179 case ALGORITHM_LEFT_ASYMMETRIC: 2180 case ALGORITHM_RIGHT_ASYMMETRIC: 2181 if (i > sh->pd_idx) 2182 i--; 2183 break; 2184 case ALGORITHM_LEFT_SYMMETRIC: 2185 case ALGORITHM_RIGHT_SYMMETRIC: 2186 if (i < sh->pd_idx) 2187 i += raid_disks; 2188 i -= (sh->pd_idx + 1); 2189 break; 2190 case ALGORITHM_PARITY_0: 2191 i -= 1; 2192 break; 2193 case ALGORITHM_PARITY_N: 2194 break; 2195 default: 2196 BUG(); 2197 } 2198 break; 2199 case 6: 2200 if (i == sh->qd_idx) 2201 return 0; /* It is the Q disk */ 2202 switch (algorithm) { 2203 case ALGORITHM_LEFT_ASYMMETRIC: 2204 case ALGORITHM_RIGHT_ASYMMETRIC: 2205 case ALGORITHM_ROTATING_ZERO_RESTART: 2206 case ALGORITHM_ROTATING_N_RESTART: 2207 if (sh->pd_idx == raid_disks-1) 2208 i--; /* Q D D D P */ 2209 else if (i > sh->pd_idx) 2210 i -= 2; /* D D P Q D */ 2211 break; 2212 case ALGORITHM_LEFT_SYMMETRIC: 2213 case ALGORITHM_RIGHT_SYMMETRIC: 2214 if (sh->pd_idx == raid_disks-1) 2215 i--; /* Q D D D P */ 2216 else { 2217 /* D D P Q D */ 2218 if (i < sh->pd_idx) 2219 i += raid_disks; 2220 i -= (sh->pd_idx + 2); 2221 } 2222 break; 2223 case ALGORITHM_PARITY_0: 2224 i -= 2; 2225 break; 2226 case ALGORITHM_PARITY_N: 2227 break; 2228 case ALGORITHM_ROTATING_N_CONTINUE: 2229 /* Like left_symmetric, but P is before Q */ 2230 if (sh->pd_idx == 0) 2231 i--; /* P D D D Q */ 2232 else { 2233 /* D D Q P D */ 2234 if (i < sh->pd_idx) 2235 i += raid_disks; 2236 i -= (sh->pd_idx + 1); 2237 } 2238 break; 2239 case ALGORITHM_LEFT_ASYMMETRIC_6: 2240 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2241 if (i > sh->pd_idx) 2242 i--; 2243 
break; 2244 case ALGORITHM_LEFT_SYMMETRIC_6: 2245 case ALGORITHM_RIGHT_SYMMETRIC_6: 2246 if (i < sh->pd_idx) 2247 i += data_disks + 1; 2248 i -= (sh->pd_idx + 1); 2249 break; 2250 case ALGORITHM_PARITY_0_6: 2251 i -= 1; 2252 break; 2253 default: 2254 BUG(); 2255 } 2256 break; 2257 } 2258 2259 chunk_number = stripe * data_disks + i; 2260 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2261 2262 check = raid5_compute_sector(conf, r_sector, 2263 previous, &dummy1, &sh2); 2264 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2265 || sh2.qd_idx != sh->qd_idx) { 2266 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2267 mdname(conf->mddev)); 2268 return 0; 2269 } 2270 return r_sector; 2271 } 2272 2273 2274 static void 2275 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2276 int rcw, int expand) 2277 { 2278 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2279 struct r5conf *conf = sh->raid_conf; 2280 int level = conf->level; 2281 2282 if (rcw) { 2283 /* if we are not expanding this is a proper write request, and 2284 * there will be bios with new data to be drained into the 2285 * stripe cache 2286 */ 2287 if (!expand) { 2288 sh->reconstruct_state = reconstruct_state_drain_run; 2289 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2290 } else 2291 sh->reconstruct_state = reconstruct_state_run; 2292 2293 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2294 2295 for (i = disks; i--; ) { 2296 struct r5dev *dev = &sh->dev[i]; 2297 2298 if (dev->towrite) { 2299 set_bit(R5_LOCKED, &dev->flags); 2300 set_bit(R5_Wantdrain, &dev->flags); 2301 if (!expand) 2302 clear_bit(R5_UPTODATE, &dev->flags); 2303 s->locked++; 2304 } 2305 } 2306 if (s->locked + conf->max_degraded == disks) 2307 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2308 atomic_inc(&conf->pending_full_writes); 2309 } else { 2310 BUG_ON(level == 6); 2311 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2312 
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2313 2314 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2315 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2316 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2317 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2318 2319 for (i = disks; i--; ) { 2320 struct r5dev *dev = &sh->dev[i]; 2321 if (i == pd_idx) 2322 continue; 2323 2324 if (dev->towrite && 2325 (test_bit(R5_UPTODATE, &dev->flags) || 2326 test_bit(R5_Wantcompute, &dev->flags))) { 2327 set_bit(R5_Wantdrain, &dev->flags); 2328 set_bit(R5_LOCKED, &dev->flags); 2329 clear_bit(R5_UPTODATE, &dev->flags); 2330 s->locked++; 2331 } 2332 } 2333 } 2334 2335 /* keep the parity disk(s) locked while asynchronous operations 2336 * are in flight 2337 */ 2338 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2339 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2340 s->locked++; 2341 2342 if (level == 6) { 2343 int qd_idx = sh->qd_idx; 2344 struct r5dev *dev = &sh->dev[qd_idx]; 2345 2346 set_bit(R5_LOCKED, &dev->flags); 2347 clear_bit(R5_UPTODATE, &dev->flags); 2348 s->locked++; 2349 } 2350 2351 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2352 __func__, (unsigned long long)sh->sector, 2353 s->locked, s->ops_request); 2354 } 2355 2356 /* 2357 * Each stripe/dev can have one or more bion attached. 2358 * toread/towrite point to the first in a chain. 2359 * The bi_next chain must be in order. 2360 */ 2361 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2362 { 2363 struct bio **bip; 2364 struct r5conf *conf = sh->raid_conf; 2365 int firstwrite=0; 2366 2367 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2368 (unsigned long long)bi->bi_sector, 2369 (unsigned long long)sh->sector); 2370 2371 /* 2372 * If several bio share a stripe. The bio bi_phys_segments acts as a 2373 * reference count to avoid race. 
The reference count should already be 2374 * increased before this function is called (for example, in 2375 * make_request()), so other bio sharing this stripe will not free the 2376 * stripe. If a stripe is owned by one stripe, the stripe lock will 2377 * protect it. 2378 */ 2379 spin_lock_irq(&sh->stripe_lock); 2380 if (forwrite) { 2381 bip = &sh->dev[dd_idx].towrite; 2382 if (*bip == NULL) 2383 firstwrite = 1; 2384 } else 2385 bip = &sh->dev[dd_idx].toread; 2386 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2387 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2388 goto overlap; 2389 bip = & (*bip)->bi_next; 2390 } 2391 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2392 goto overlap; 2393 2394 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2395 if (*bip) 2396 bi->bi_next = *bip; 2397 *bip = bi; 2398 raid5_inc_bi_active_stripes(bi); 2399 2400 if (forwrite) { 2401 /* check if page is covered */ 2402 sector_t sector = sh->dev[dd_idx].sector; 2403 for (bi=sh->dev[dd_idx].towrite; 2404 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2405 bi && bi->bi_sector <= sector; 2406 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2407 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2408 sector = bi->bi_sector + (bi->bi_size>>9); 2409 } 2410 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2411 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2412 } 2413 2414 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2415 (unsigned long long)(*bip)->bi_sector, 2416 (unsigned long long)sh->sector, dd_idx); 2417 spin_unlock_irq(&sh->stripe_lock); 2418 2419 if (conf->mddev->bitmap && firstwrite) { 2420 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2421 STRIPE_SECTORS, 0); 2422 sh->bm_seq = conf->seq_flush+1; 2423 set_bit(STRIPE_BIT_DELAY, &sh->state); 2424 } 2425 return 1; 2426 2427 overlap: 2428 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2429 spin_unlock_irq(&sh->stripe_lock); 2430 return 0; 2431 } 2432 2433 static 
void end_reshape(struct r5conf *conf); 2434 2435 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2436 struct stripe_head *sh) 2437 { 2438 int sectors_per_chunk = 2439 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2440 int dd_idx; 2441 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2442 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 2443 2444 raid5_compute_sector(conf, 2445 stripe * (disks - conf->max_degraded) 2446 *sectors_per_chunk + chunk_offset, 2447 previous, 2448 &dd_idx, sh); 2449 } 2450 2451 static void 2452 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2453 struct stripe_head_state *s, int disks, 2454 struct bio **return_bi) 2455 { 2456 int i; 2457 for (i = disks; i--; ) { 2458 struct bio *bi; 2459 int bitmap_end = 0; 2460 2461 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2462 struct md_rdev *rdev; 2463 rcu_read_lock(); 2464 rdev = rcu_dereference(conf->disks[i].rdev); 2465 if (rdev && test_bit(In_sync, &rdev->flags)) 2466 atomic_inc(&rdev->nr_pending); 2467 else 2468 rdev = NULL; 2469 rcu_read_unlock(); 2470 if (rdev) { 2471 if (!rdev_set_badblocks( 2472 rdev, 2473 sh->sector, 2474 STRIPE_SECTORS, 0)) 2475 md_error(conf->mddev, rdev); 2476 rdev_dec_pending(rdev, conf->mddev); 2477 } 2478 } 2479 spin_lock_irq(&sh->stripe_lock); 2480 /* fail all writes first */ 2481 bi = sh->dev[i].towrite; 2482 sh->dev[i].towrite = NULL; 2483 spin_unlock_irq(&sh->stripe_lock); 2484 if (bi) 2485 bitmap_end = 1; 2486 2487 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2488 wake_up(&conf->wait_for_overlap); 2489 2490 while (bi && bi->bi_sector < 2491 sh->dev[i].sector + STRIPE_SECTORS) { 2492 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2493 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2494 if (!raid5_dec_bi_active_stripes(bi)) { 2495 md_write_end(conf->mddev); 2496 bi->bi_next = *return_bi; 2497 *return_bi = bi; 2498 } 2499 bi = nextbi; 2500 } 2501 if 
(bitmap_end) 2502 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2503 STRIPE_SECTORS, 0, 0); 2504 bitmap_end = 0; 2505 /* and fail all 'written' */ 2506 bi = sh->dev[i].written; 2507 sh->dev[i].written = NULL; 2508 if (bi) bitmap_end = 1; 2509 while (bi && bi->bi_sector < 2510 sh->dev[i].sector + STRIPE_SECTORS) { 2511 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2512 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2513 if (!raid5_dec_bi_active_stripes(bi)) { 2514 md_write_end(conf->mddev); 2515 bi->bi_next = *return_bi; 2516 *return_bi = bi; 2517 } 2518 bi = bi2; 2519 } 2520 2521 /* fail any reads if this device is non-operational and 2522 * the data has not reached the cache yet. 2523 */ 2524 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2525 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2526 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2527 spin_lock_irq(&sh->stripe_lock); 2528 bi = sh->dev[i].toread; 2529 sh->dev[i].toread = NULL; 2530 spin_unlock_irq(&sh->stripe_lock); 2531 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2532 wake_up(&conf->wait_for_overlap); 2533 while (bi && bi->bi_sector < 2534 sh->dev[i].sector + STRIPE_SECTORS) { 2535 struct bio *nextbi = 2536 r5_next_bio(bi, sh->dev[i].sector); 2537 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2538 if (!raid5_dec_bi_active_stripes(bi)) { 2539 bi->bi_next = *return_bi; 2540 *return_bi = bi; 2541 } 2542 bi = nextbi; 2543 } 2544 } 2545 if (bitmap_end) 2546 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2547 STRIPE_SECTORS, 0, 0); 2548 /* If we were in the middle of a write the parity block might 2549 * still be locked - so just clear all R5_LOCKED flags 2550 */ 2551 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2552 } 2553 2554 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2555 if (atomic_dec_and_test(&conf->pending_full_writes)) 2556 md_wakeup_thread(conf->mddev->thread); 2557 } 2558 2559 static void 2560 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2561 struct 
stripe_head_state *s) 2562 { 2563 int abort = 0; 2564 int i; 2565 2566 clear_bit(STRIPE_SYNCING, &sh->state); 2567 s->syncing = 0; 2568 s->replacing = 0; 2569 /* There is nothing more to do for sync/check/repair. 2570 * Don't even need to abort as that is handled elsewhere 2571 * if needed, and not always wanted e.g. if there is a known 2572 * bad block here. 2573 * For recover/replace we need to record a bad block on all 2574 * non-sync devices, or abort the recovery 2575 */ 2576 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2577 /* During recovery devices cannot be removed, so 2578 * locking and refcounting of rdevs is not needed 2579 */ 2580 for (i = 0; i < conf->raid_disks; i++) { 2581 struct md_rdev *rdev = conf->disks[i].rdev; 2582 if (rdev 2583 && !test_bit(Faulty, &rdev->flags) 2584 && !test_bit(In_sync, &rdev->flags) 2585 && !rdev_set_badblocks(rdev, sh->sector, 2586 STRIPE_SECTORS, 0)) 2587 abort = 1; 2588 rdev = conf->disks[i].replacement; 2589 if (rdev 2590 && !test_bit(Faulty, &rdev->flags) 2591 && !test_bit(In_sync, &rdev->flags) 2592 && !rdev_set_badblocks(rdev, sh->sector, 2593 STRIPE_SECTORS, 0)) 2594 abort = 1; 2595 } 2596 if (abort) 2597 conf->recovery_disabled = 2598 conf->mddev->recovery_disabled; 2599 } 2600 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2601 } 2602 2603 static int want_replace(struct stripe_head *sh, int disk_idx) 2604 { 2605 struct md_rdev *rdev; 2606 int rv = 0; 2607 /* Doing recovery so rcu locking not required */ 2608 rdev = sh->raid_conf->disks[disk_idx].replacement; 2609 if (rdev 2610 && !test_bit(Faulty, &rdev->flags) 2611 && !test_bit(In_sync, &rdev->flags) 2612 && (rdev->recovery_offset <= sh->sector 2613 || rdev->mddev->recovery_cp <= sh->sector)) 2614 rv = 1; 2615 2616 return rv; 2617 } 2618 2619 /* fetch_block - checks the given member device to see if its data needs 2620 * to be read or computed to satisfy a request. 
2621 * 2622 * Returns 1 when no more member devices need to be checked, otherwise returns 2623 * 0 to tell the loop in handle_stripe_fill to continue 2624 */ 2625 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2626 int disk_idx, int disks) 2627 { 2628 struct r5dev *dev = &sh->dev[disk_idx]; 2629 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2630 &sh->dev[s->failed_num[1]] }; 2631 2632 /* is the data in this block needed, and can we get it? */ 2633 if (!test_bit(R5_LOCKED, &dev->flags) && 2634 !test_bit(R5_UPTODATE, &dev->flags) && 2635 (dev->toread || 2636 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2637 s->syncing || s->expanding || 2638 (s->replacing && want_replace(sh, disk_idx)) || 2639 (s->failed >= 1 && fdev[0]->toread) || 2640 (s->failed >= 2 && fdev[1]->toread) || 2641 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2642 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2643 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2644 /* we would like to get this block, possibly by computing it, 2645 * otherwise read it if the backing disk is insync 2646 */ 2647 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2648 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2649 if ((s->uptodate == disks - 1) && 2650 (s->failed && (disk_idx == s->failed_num[0] || 2651 disk_idx == s->failed_num[1]))) { 2652 /* have disk failed, and we're requested to fetch it; 2653 * do compute it 2654 */ 2655 pr_debug("Computing stripe %llu block %d\n", 2656 (unsigned long long)sh->sector, disk_idx); 2657 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2658 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2659 set_bit(R5_Wantcompute, &dev->flags); 2660 sh->ops.target = disk_idx; 2661 sh->ops.target2 = -1; /* no 2nd target */ 2662 s->req_compute = 1; 2663 /* Careful: from this point on 'uptodate' is in the eye 2664 * of raid_run_ops which services 'compute' operations 2665 * before writes. 
R5_Wantcompute flags a block that will 2666 * be R5_UPTODATE by the time it is needed for a 2667 * subsequent operation. 2668 */ 2669 s->uptodate++; 2670 return 1; 2671 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2672 /* Computing 2-failure is *very* expensive; only 2673 * do it if failed >= 2 2674 */ 2675 int other; 2676 for (other = disks; other--; ) { 2677 if (other == disk_idx) 2678 continue; 2679 if (!test_bit(R5_UPTODATE, 2680 &sh->dev[other].flags)) 2681 break; 2682 } 2683 BUG_ON(other < 0); 2684 pr_debug("Computing stripe %llu blocks %d,%d\n", 2685 (unsigned long long)sh->sector, 2686 disk_idx, other); 2687 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2688 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2689 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2690 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2691 sh->ops.target = disk_idx; 2692 sh->ops.target2 = other; 2693 s->uptodate += 2; 2694 s->req_compute = 1; 2695 return 1; 2696 } else if (test_bit(R5_Insync, &dev->flags)) { 2697 set_bit(R5_LOCKED, &dev->flags); 2698 set_bit(R5_Wantread, &dev->flags); 2699 s->locked++; 2700 pr_debug("Reading block %d (sync=%d)\n", 2701 disk_idx, s->syncing); 2702 } 2703 } 2704 2705 return 0; 2706 } 2707 2708 /** 2709 * handle_stripe_fill - read or compute data to satisfy pending requests. 2710 */ 2711 static void handle_stripe_fill(struct stripe_head *sh, 2712 struct stripe_head_state *s, 2713 int disks) 2714 { 2715 int i; 2716 2717 /* look for blocks to read/compute, skip this if a compute 2718 * is already in flight, or if the stripe contents are in the 2719 * midst of changing due to a write 2720 */ 2721 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2722 !sh->reconstruct_state) 2723 for (i = disks; i--; ) 2724 if (fetch_block(sh, s, i, disks)) 2725 break; 2726 set_bit(STRIPE_HANDLE, &sh->state); 2727 } 2728 2729 2730 /* handle_stripe_clean_event 2731 * any written block on an uptodate or failed drive can be returned. 
2732 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2733 * never LOCKED, so we don't need to test 'failed' directly. 2734 */ 2735 static void handle_stripe_clean_event(struct r5conf *conf, 2736 struct stripe_head *sh, int disks, struct bio **return_bi) 2737 { 2738 int i; 2739 struct r5dev *dev; 2740 2741 for (i = disks; i--; ) 2742 if (sh->dev[i].written) { 2743 dev = &sh->dev[i]; 2744 if (!test_bit(R5_LOCKED, &dev->flags) && 2745 (test_bit(R5_UPTODATE, &dev->flags) || 2746 test_bit(R5_Discard, &dev->flags))) { 2747 /* We can return any write requests */ 2748 struct bio *wbi, *wbi2; 2749 pr_debug("Return write for disc %d\n", i); 2750 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2751 clear_bit(R5_UPTODATE, &dev->flags); 2752 wbi = dev->written; 2753 dev->written = NULL; 2754 while (wbi && wbi->bi_sector < 2755 dev->sector + STRIPE_SECTORS) { 2756 wbi2 = r5_next_bio(wbi, dev->sector); 2757 if (!raid5_dec_bi_active_stripes(wbi)) { 2758 md_write_end(conf->mddev); 2759 wbi->bi_next = *return_bi; 2760 *return_bi = wbi; 2761 } 2762 wbi = wbi2; 2763 } 2764 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2765 STRIPE_SECTORS, 2766 !test_bit(STRIPE_DEGRADED, &sh->state), 2767 0); 2768 } 2769 } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2770 clear_bit(R5_Discard, &sh->dev[i].flags); 2771 2772 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2773 if (atomic_dec_and_test(&conf->pending_full_writes)) 2774 md_wakeup_thread(conf->mddev->thread); 2775 } 2776 2777 static void handle_stripe_dirtying(struct r5conf *conf, 2778 struct stripe_head *sh, 2779 struct stripe_head_state *s, 2780 int disks) 2781 { 2782 int rmw = 0, rcw = 0, i; 2783 sector_t recovery_cp = conf->mddev->recovery_cp; 2784 2785 /* RAID6 requires 'rcw' in current implementation. 2786 * Otherwise, check whether resync is now happening or should start. 
2787 * If yes, then the array is dirty (after unclean shutdown or 2788 * initial creation), so parity in some stripes might be inconsistent. 2789 * In this case, we need to always do reconstruct-write, to ensure 2790 * that in case of drive failure or read-error correction, we 2791 * generate correct data from the parity. 2792 */ 2793 if (conf->max_degraded == 2 || 2794 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2795 /* Calculate the real rcw later - for now make it 2796 * look like rcw is cheaper 2797 */ 2798 rcw = 1; rmw = 2; 2799 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2800 conf->max_degraded, (unsigned long long)recovery_cp, 2801 (unsigned long long)sh->sector); 2802 } else for (i = disks; i--; ) { 2803 /* would I have to read this buffer for read_modify_write */ 2804 struct r5dev *dev = &sh->dev[i]; 2805 if ((dev->towrite || i == sh->pd_idx) && 2806 !test_bit(R5_LOCKED, &dev->flags) && 2807 !(test_bit(R5_UPTODATE, &dev->flags) || 2808 test_bit(R5_Wantcompute, &dev->flags))) { 2809 if (test_bit(R5_Insync, &dev->flags)) 2810 rmw++; 2811 else 2812 rmw += 2*disks; /* cannot read it */ 2813 } 2814 /* Would I have to read this buffer for reconstruct_write */ 2815 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2816 !test_bit(R5_LOCKED, &dev->flags) && 2817 !(test_bit(R5_UPTODATE, &dev->flags) || 2818 test_bit(R5_Wantcompute, &dev->flags))) { 2819 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2820 else 2821 rcw += 2*disks; 2822 } 2823 } 2824 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2825 (unsigned long long)sh->sector, rmw, rcw); 2826 set_bit(STRIPE_HANDLE, &sh->state); 2827 if (rmw < rcw && rmw > 0) { 2828 /* prefer read-modify-write, but need to get some data */ 2829 blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", 2830 (unsigned long long)sh->sector, rmw); 2831 for (i = disks; i--; ) { 2832 struct r5dev *dev = &sh->dev[i]; 2833 if ((dev->towrite || i == sh->pd_idx) && 2834 
!test_bit(R5_LOCKED, &dev->flags) && 2835 !(test_bit(R5_UPTODATE, &dev->flags) || 2836 test_bit(R5_Wantcompute, &dev->flags)) && 2837 test_bit(R5_Insync, &dev->flags)) { 2838 if ( 2839 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2840 pr_debug("Read_old block " 2841 "%d for r-m-w\n", i); 2842 set_bit(R5_LOCKED, &dev->flags); 2843 set_bit(R5_Wantread, &dev->flags); 2844 s->locked++; 2845 } else { 2846 set_bit(STRIPE_DELAYED, &sh->state); 2847 set_bit(STRIPE_HANDLE, &sh->state); 2848 } 2849 } 2850 } 2851 } 2852 if (rcw <= rmw && rcw > 0) { 2853 /* want reconstruct write, but need to get some data */ 2854 int qread =0; 2855 rcw = 0; 2856 for (i = disks; i--; ) { 2857 struct r5dev *dev = &sh->dev[i]; 2858 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2859 i != sh->pd_idx && i != sh->qd_idx && 2860 !test_bit(R5_LOCKED, &dev->flags) && 2861 !(test_bit(R5_UPTODATE, &dev->flags) || 2862 test_bit(R5_Wantcompute, &dev->flags))) { 2863 rcw++; 2864 if (!test_bit(R5_Insync, &dev->flags)) 2865 continue; /* it's a failed drive */ 2866 if ( 2867 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2868 pr_debug("Read_old block " 2869 "%d for Reconstruct\n", i); 2870 set_bit(R5_LOCKED, &dev->flags); 2871 set_bit(R5_Wantread, &dev->flags); 2872 s->locked++; 2873 qread++; 2874 } else { 2875 set_bit(STRIPE_DELAYED, &sh->state); 2876 set_bit(STRIPE_HANDLE, &sh->state); 2877 } 2878 } 2879 } 2880 if (rcw) 2881 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 2882 (unsigned long long)sh->sector, 2883 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 2884 } 2885 /* now if nothing is locked, and if we have enough data, 2886 * we can start a write request 2887 */ 2888 /* since handle_stripe can be called at any time we need to handle the 2889 * case where a compute block operation has been submitted and then a 2890 * subsequent call wants to start a write request. raid_run_ops only 2891 * handles the case where compute block and reconstruct are requested 2892 * simultaneously. 
If this is not the case then new writes need to be 2893 * held off until the compute completes. 2894 */ 2895 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2896 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2897 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2898 schedule_reconstruction(sh, s, rcw == 0, 0); 2899 } 2900 2901 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2902 struct stripe_head_state *s, int disks) 2903 { 2904 struct r5dev *dev = NULL; 2905 2906 set_bit(STRIPE_HANDLE, &sh->state); 2907 2908 switch (sh->check_state) { 2909 case check_state_idle: 2910 /* start a new check operation if there are no failures */ 2911 if (s->failed == 0) { 2912 BUG_ON(s->uptodate != disks); 2913 sh->check_state = check_state_run; 2914 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2915 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2916 s->uptodate--; 2917 break; 2918 } 2919 dev = &sh->dev[s->failed_num[0]]; 2920 /* fall through */ 2921 case check_state_compute_result: 2922 sh->check_state = check_state_idle; 2923 if (!dev) 2924 dev = &sh->dev[sh->pd_idx]; 2925 2926 /* check that a write has not made the stripe insync */ 2927 if (test_bit(STRIPE_INSYNC, &sh->state)) 2928 break; 2929 2930 /* either failed parity check, or recovery is happening */ 2931 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2932 BUG_ON(s->uptodate != disks); 2933 2934 set_bit(R5_LOCKED, &dev->flags); 2935 s->locked++; 2936 set_bit(R5_Wantwrite, &dev->flags); 2937 2938 clear_bit(STRIPE_DEGRADED, &sh->state); 2939 set_bit(STRIPE_INSYNC, &sh->state); 2940 break; 2941 case check_state_run: 2942 break; /* we will be called again upon completion */ 2943 case check_state_check_result: 2944 sh->check_state = check_state_idle; 2945 2946 /* if a failure occurred during the check operation, leave 2947 * STRIPE_INSYNC not set and let the stripe be handled again 2948 */ 2949 if (s->failed) 2950 break; 2951 2952 /* handle a successful check operation, if parity is 
correct 2953 * we are done. Otherwise update the mismatch count and repair 2954 * parity if !MD_RECOVERY_CHECK 2955 */ 2956 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2957 /* parity is correct (on disc, 2958 * not in buffer any more) 2959 */ 2960 set_bit(STRIPE_INSYNC, &sh->state); 2961 else { 2962 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2963 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2964 /* don't try to repair!! */ 2965 set_bit(STRIPE_INSYNC, &sh->state); 2966 else { 2967 sh->check_state = check_state_compute_run; 2968 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2969 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2970 set_bit(R5_Wantcompute, 2971 &sh->dev[sh->pd_idx].flags); 2972 sh->ops.target = sh->pd_idx; 2973 sh->ops.target2 = -1; 2974 s->uptodate++; 2975 } 2976 } 2977 break; 2978 case check_state_compute_run: 2979 break; 2980 default: 2981 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2982 __func__, sh->check_state, 2983 (unsigned long long) sh->sector); 2984 BUG(); 2985 } 2986 } 2987 2988 2989 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2990 struct stripe_head_state *s, 2991 int disks) 2992 { 2993 int pd_idx = sh->pd_idx; 2994 int qd_idx = sh->qd_idx; 2995 struct r5dev *dev; 2996 2997 set_bit(STRIPE_HANDLE, &sh->state); 2998 2999 BUG_ON(s->failed > 2); 3000 3001 /* Want to check and possibly repair P and Q. 3002 * However there could be one 'failed' device, in which 3003 * case we can only check one of them, possibly using the 3004 * other to generate missing data 3005 */ 3006 3007 switch (sh->check_state) { 3008 case check_state_idle: 3009 /* start a new check operation if there are < 2 failures */ 3010 if (s->failed == s->q_failed) { 3011 /* The only possible failed device holds Q, so it 3012 * makes sense to check P (If anything else were failed, 3013 * we would have used P to recreate it). 
3014 */ 3015 sh->check_state = check_state_run; 3016 } 3017 if (!s->q_failed && s->failed < 2) { 3018 /* Q is not failed, and we didn't use it to generate 3019 * anything, so it makes sense to check it 3020 */ 3021 if (sh->check_state == check_state_run) 3022 sh->check_state = check_state_run_pq; 3023 else 3024 sh->check_state = check_state_run_q; 3025 } 3026 3027 /* discard potentially stale zero_sum_result */ 3028 sh->ops.zero_sum_result = 0; 3029 3030 if (sh->check_state == check_state_run) { 3031 /* async_xor_zero_sum destroys the contents of P */ 3032 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3033 s->uptodate--; 3034 } 3035 if (sh->check_state >= check_state_run && 3036 sh->check_state <= check_state_run_pq) { 3037 /* async_syndrome_zero_sum preserves P and Q, so 3038 * no need to mark them !uptodate here 3039 */ 3040 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3041 break; 3042 } 3043 3044 /* we have 2-disk failure */ 3045 BUG_ON(s->failed != 2); 3046 /* fall through */ 3047 case check_state_compute_result: 3048 sh->check_state = check_state_idle; 3049 3050 /* check that a write has not made the stripe insync */ 3051 if (test_bit(STRIPE_INSYNC, &sh->state)) 3052 break; 3053 3054 /* now write out any block on a failed drive, 3055 * or P or Q if they were recomputed 3056 */ 3057 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3058 if (s->failed == 2) { 3059 dev = &sh->dev[s->failed_num[1]]; 3060 s->locked++; 3061 set_bit(R5_LOCKED, &dev->flags); 3062 set_bit(R5_Wantwrite, &dev->flags); 3063 } 3064 if (s->failed >= 1) { 3065 dev = &sh->dev[s->failed_num[0]]; 3066 s->locked++; 3067 set_bit(R5_LOCKED, &dev->flags); 3068 set_bit(R5_Wantwrite, &dev->flags); 3069 } 3070 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3071 dev = &sh->dev[pd_idx]; 3072 s->locked++; 3073 set_bit(R5_LOCKED, &dev->flags); 3074 set_bit(R5_Wantwrite, &dev->flags); 3075 } 3076 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3077 dev = &sh->dev[qd_idx]; 3078 
s->locked++; 3079 set_bit(R5_LOCKED, &dev->flags); 3080 set_bit(R5_Wantwrite, &dev->flags); 3081 } 3082 clear_bit(STRIPE_DEGRADED, &sh->state); 3083 3084 set_bit(STRIPE_INSYNC, &sh->state); 3085 break; 3086 case check_state_run: 3087 case check_state_run_q: 3088 case check_state_run_pq: 3089 break; /* we will be called again upon completion */ 3090 case check_state_check_result: 3091 sh->check_state = check_state_idle; 3092 3093 /* handle a successful check operation, if parity is correct 3094 * we are done. Otherwise update the mismatch count and repair 3095 * parity if !MD_RECOVERY_CHECK 3096 */ 3097 if (sh->ops.zero_sum_result == 0) { 3098 /* both parities are correct */ 3099 if (!s->failed) 3100 set_bit(STRIPE_INSYNC, &sh->state); 3101 else { 3102 /* in contrast to the raid5 case we can validate 3103 * parity, but still have a failure to write 3104 * back 3105 */ 3106 sh->check_state = check_state_compute_result; 3107 /* Returning at this point means that we may go 3108 * off and bring p and/or q uptodate again so 3109 * we make sure to check zero_sum_result again 3110 * to verify if p or q need writeback 3111 */ 3112 } 3113 } else { 3114 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3115 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3116 /* don't try to repair!! 
*/ 3117 set_bit(STRIPE_INSYNC, &sh->state); 3118 else { 3119 int *target = &sh->ops.target; 3120 3121 sh->ops.target = -1; 3122 sh->ops.target2 = -1; 3123 sh->check_state = check_state_compute_run; 3124 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3125 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3126 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3127 set_bit(R5_Wantcompute, 3128 &sh->dev[pd_idx].flags); 3129 *target = pd_idx; 3130 target = &sh->ops.target2; 3131 s->uptodate++; 3132 } 3133 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3134 set_bit(R5_Wantcompute, 3135 &sh->dev[qd_idx].flags); 3136 *target = qd_idx; 3137 s->uptodate++; 3138 } 3139 } 3140 } 3141 break; 3142 case check_state_compute_run: 3143 break; 3144 default: 3145 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3146 __func__, sh->check_state, 3147 (unsigned long long) sh->sector); 3148 BUG(); 3149 } 3150 } 3151 3152 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3153 { 3154 int i; 3155 3156 /* We have read all the blocks in this stripe and now we need to 3157 * copy some of them into a target stripe for expand. 3158 */ 3159 struct dma_async_tx_descriptor *tx = NULL; 3160 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3161 for (i = 0; i < sh->disks; i++) 3162 if (i != sh->pd_idx && i != sh->qd_idx) { 3163 int dd_idx, j; 3164 struct stripe_head *sh2; 3165 struct async_submit_ctl submit; 3166 3167 sector_t bn = compute_blocknr(sh, i, 1); 3168 sector_t s = raid5_compute_sector(conf, bn, 0, 3169 &dd_idx, NULL); 3170 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3171 if (sh2 == NULL) 3172 /* so far only the early blocks of this stripe 3173 * have been requested. 
When later blocks 3174 * get requested, we will try again 3175 */ 3176 continue; 3177 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3178 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3179 /* must have already done this block */ 3180 release_stripe(sh2); 3181 continue; 3182 } 3183 3184 /* place all the copies on one channel */ 3185 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3186 tx = async_memcpy(sh2->dev[dd_idx].page, 3187 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3188 &submit); 3189 3190 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3191 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3192 for (j = 0; j < conf->raid_disks; j++) 3193 if (j != sh2->pd_idx && 3194 j != sh2->qd_idx && 3195 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3196 break; 3197 if (j == conf->raid_disks) { 3198 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3199 set_bit(STRIPE_HANDLE, &sh2->state); 3200 } 3201 release_stripe(sh2); 3202 3203 } 3204 /* done submitting copies, wait for them to complete */ 3205 async_tx_quiesce(&tx); 3206 } 3207 3208 /* 3209 * handle_stripe - do things to a stripe. 3210 * 3211 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3212 * state of various bits to see what needs to be done. 
* Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */

/*
 * analyse_stripe - summarise the per-device state of a stripe into *s
 * @sh: stripe to inspect
 * @s:  zeroed here, then filled with counters (locked, uptodate, compute,
 *      to_fill, to_read, to_write, non_overwrite, written, failed), the
 *      indexes of up to two failed devices, and the expanding/expanded,
 *      syncing/replacing, blocked_rdev and handle_bad_blocks indications.
 *
 * The device scan runs under rcu_read_lock().  A reference (nr_pending) is
 * taken on s->blocked_rdev and on every rdev that causes
 * s->handle_bad_blocks to be set; this function does not drop them.
 * Per-device flags (R5_Wantfill, R5_ReadRepl, R5_NeedReplace, R5_Insync,
 * R5_ReadError, ...) are updated as a side effect.
 */
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	struct r5dev *dev;
	int i;
	int do_recovery = 0;

	memset(s, 0, sizeof(*s));

	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
	s->failed_num[0] = -1;
	s->failed_num[1] = -1;

	/* Now to look around and see what can be done */
	rcu_read_lock();
	for (i=disks; i--; ) {
		struct md_rdev *rdev;
		sector_t first_bad;
		int bad_sectors;
		int is_bad = 0;

		dev = &sh->dev[i];

		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
			 i, dev->flags,
			 dev->toread, dev->towrite, dev->written);
		/* maybe we can reply to a read
		 *
		 * new wantfill requests are only permitted while
		 * ops_complete_biofill is guaranteed to be inactive
		 */
		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
			set_bit(R5_Wantfill, &dev->flags);

		/* now count some things */
		if (test_bit(R5_LOCKED, &dev->flags))
			s->locked++;
		if (test_bit(R5_UPTODATE, &dev->flags))
			s->uptodate++;
		if (test_bit(R5_Wantcompute, &dev->flags)) {
			s->compute++;
			BUG_ON(s->compute > 2);
		}

		if (test_bit(R5_Wantfill, &dev->flags))
			s->to_fill++;
		else if (dev->toread)
			s->to_read++;
		if (dev->towrite) {
			s->to_write++;
			if (!test_bit(R5_OVERWRITE, &dev->flags))
				s->non_overwrite++;
		}
		if (dev->written)
			s->written++;
		/* Prefer to use the replacement for reads, but only
		 * if it is recovered enough and has no bad blocks.
		 */
		rdev = rcu_dereference(conf->disks[i].replacement);
		if (rdev && !test_bit(Faulty, &rdev->flags) &&
		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
				 &first_bad, &bad_sectors))
			set_bit(R5_ReadRepl, &dev->flags);
		else {
			/* A replacement exists but cannot serve this read yet */
			if (rdev)
				set_bit(R5_NeedReplace, &dev->flags);
			rdev = rcu_dereference(conf->disks[i].rdev);
			clear_bit(R5_ReadRepl, &dev->flags);
		}
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev) {
			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					     &first_bad, &bad_sectors);
			/* Only the first blocked device found is recorded,
			 * and a reference is held on it for the caller.
			 */
			if (s->blocked_rdev == NULL
			    && (test_bit(Blocked, &rdev->flags)
				|| is_bad < 0)) {
				if (is_bad < 0)
					set_bit(BlockedBadBlocks,
						&rdev->flags);
				s->blocked_rdev = rdev;
				atomic_inc(&rdev->nr_pending);
			}
		}
		/* Recompute R5_Insync from scratch for this device */
		clear_bit(R5_Insync, &dev->flags);
		if (!rdev)
			/* Not in-sync */;
		else if (is_bad) {
			/* also not in-sync */
			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
			    test_bit(R5_UPTODATE, &dev->flags)) {
				/* treat as in-sync, but with a read error
				 * which we can now try to correct
				 */
				set_bit(R5_Insync, &dev->flags);
				set_bit(R5_ReadError, &dev->flags);
			}
		} else if (test_bit(In_sync, &rdev->flags))
			set_bit(R5_Insync, &dev->flags);
		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
			/* in sync if before recovery_offset */
			set_bit(R5_Insync, &dev->flags);
		else if (test_bit(R5_UPTODATE, &dev->flags) &&
			 test_bit(R5_Expanded, &dev->flags))
			/* If we've reshaped into here, we assume it is Insync.
			 * We will shortly update recovery_offset to make
			 * it official.
			 */
			set_bit(R5_Insync, &dev->flags);

		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
			/* This flag does not apply to '.replacement'
			 * only to .rdev, so make sure to check that*/
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].rdev);
			if (rdev2 == rdev)
				clear_bit(R5_Insync, &dev->flags);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_WriteError, &dev->flags);
		}
		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
			/* This flag does not apply to '.replacement'
			 * only to .rdev, so make sure to check that*/
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].rdev);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_MadeGood, &dev->flags);
		}
		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
			/* Same handling, but for the replacement device */
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].replacement);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_MadeGoodRepl, &dev->flags);
		}
		if (!test_bit(R5_Insync, &dev->flags)) {
			/* The ReadError flag will just be confusing now */
			clear_bit(R5_ReadError, &dev->flags);
			clear_bit(R5_ReWrite, &dev->flags);
		}
		if (test_bit(R5_ReadError, &dev->flags))
			clear_bit(R5_Insync, &dev->flags);
		if (!test_bit(R5_Insync, &dev->flags)) {
			/* Only two failed indexes are remembered, but
			 * s->failed keeps counting beyond that.
			 */
			if (s->failed < 2)
				s->failed_num[s->failed] = i;
			s->failed++;
			if (rdev && !test_bit(Faulty, &rdev->flags))
				do_recovery = 1;
		}
	}
	if (test_bit(STRIPE_SYNCING, &sh->state)) {
		/* If there is a failed device being replaced,
		 *     we must be recovering.
		 * else if we are after recovery_cp, we must be syncing
		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
		 * else we can only be replacing
		 * sync and recovery both need to read all devices, and so
		 * use the same flag.
		 */
		if (do_recovery ||
		    sh->sector >= conf->mddev->recovery_cp ||
		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
			s->syncing = 1;
		else
			s->replacing = 1;
	}
	rcu_read_unlock();
}

/*
 * handle_stripe - run one pass of the state machine for a stripe
 * @sh: stripe to process
 *
 * If STRIPE_ACTIVE is already set the stripe is being handled elsewhere:
 * STRIPE_HANDLE is set again and the function returns.  Otherwise the
 * stripe is analysed and the fill, write, parity-check, replacement and
 * expansion steps are considered in turn.  At "finish" the references
 * taken by analyse_stripe() are released, the requested operations and
 * I/O are started, completed bios are returned and STRIPE_ACTIVE is
 * cleared.
 */
static void handle_stripe(struct stripe_head *sh)
{
	struct stripe_head_state s;
	struct r5conf *conf = sh->raid_conf;
	int i;
	int prexor;
	int disks = sh->disks;
	struct r5dev *pdev, *qdev;

	clear_bit(STRIPE_HANDLE, &sh->state);
	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled, ensure it gets handled
		 * again when current action finishes */
		set_bit(STRIPE_HANDLE, &sh->state);
		return;
	}

	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
		set_bit(STRIPE_SYNCING, &sh->state);
		clear_bit(STRIPE_INSYNC, &sh->state);
	}
	clear_bit(STRIPE_DELAYED, &sh->state);

	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
	       "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
	       (unsigned long long)sh->sector, sh->state,
	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
	       sh->check_state, sh->reconstruct_state);

	analyse_stripe(sh, &s);

	/* Bad-block bookkeeping is done first; the stripe is re-queued */
	if (s.handle_bad_blocks) {
		set_bit(STRIPE_HANDLE, &sh->state);
		goto finish;
	}

	if (unlikely(s.blocked_rdev)) {
		if (s.syncing || s.expanding || s.expanded ||
		    s.replacing || s.to_write || s.written) {
			set_bit(STRIPE_HANDLE, &sh->state);
			goto finish;
		}
		/* There is nothing for the blocked_rdev to block */
		rdev_dec_pending(s.blocked_rdev, conf->mddev);
		s.blocked_rdev = NULL;
	}

	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
	}

	pr_debug("locked=%d uptodate=%d to_read=%d"
	       " to_write=%d failed=%d failed_num=%d,%d\n",
	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
	       s.failed_num[0], s.failed_num[1]);
	/* check if the array has lost more than max_degraded devices and,
	 * if so, some requests might need to be failed.
	 */
	if (s.failed > conf->max_degraded) {
		sh->check_state = 0;
		sh->reconstruct_state = 0;
		if (s.to_read+s.to_write+s.written)
			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
		if (s.syncing + s.replacing)
			handle_failed_sync(conf, sh, &s);
	}

	/* Now we check to see if any write operations have recently
	 * completed
	 */
	prexor = 0;
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
		prexor = 1;
	if (sh->reconstruct_state == reconstruct_state_drain_result ||
	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
		sh->reconstruct_state = reconstruct_state_idle;

		/* All the 'written' buffers and the parity block are ready to
		 * be written back to disk
		 */
		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
		BUG_ON(sh->qd_idx >= 0 &&
		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_LOCKED, &dev->flags) &&
				(i == sh->pd_idx || i == sh->qd_idx ||
				 dev->written)) {
				pr_debug("Writing block %d\n", i);
				set_bit(R5_Wantwrite, &dev->flags);
				if (prexor)
					continue;
				if (!test_bit(R5_Insync, &dev->flags) ||
				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
				     s.failed == 0))
					set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			s.dec_preread_active = 1;
	}

	/*
	 * might be able to return some write requests if the parity blocks
	 * are safe, or on a failed drive
	 */
	pdev = &sh->dev[sh->pd_idx];
	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
	qdev = &sh->dev[sh->qd_idx];
	/* Below level 6, Q is always treated as failed so that qdev is
	 * never examined in the test that follows.
	 */
	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
		|| conf->level < 6;

	if (s.written &&
	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
			     && !test_bit(R5_LOCKED, &pdev->flags)
			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
				 test_bit(R5_Discard, &pdev->flags))))) &&
	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
			     && !test_bit(R5_LOCKED, &qdev->flags)
			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
				 test_bit(R5_Discard, &qdev->flags))))))
		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
	if (s.to_read || s.non_overwrite
	    || (conf->level == 6 && s.to_write && s.failed)
	    || (s.syncing && (s.uptodate + s.compute < disks))
	    || s.replacing
	    || s.expanding)
		handle_stripe_fill(sh, &s, disks);

	/* Now to consider new write requests and what else, if anything
	 * should be read.  We do not handle new writes when:
	 * 1/ A 'write' operation (copy+xor) is already in flight.
	 * 2/ A 'check' operation is in flight, as it may clobber the parity
	 *    block.
	 */
	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
		handle_stripe_dirtying(conf, sh, &s, disks);

	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough
	 * data is available.  The parity check is held off while parity
	 * dependent operations are in flight.
	 */
	if (sh->check_state ||
	    (s.syncing && s.locked == 0 &&
	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
	     !test_bit(STRIPE_INSYNC, &sh->state))) {
		if (conf->level == 6)
			handle_parity_checks6(conf, sh, &s, disks);
		else
			handle_parity_checks5(conf, sh, &s, disks);
	}

	if (s.replacing && s.locked == 0
	    && !test_bit(STRIPE_INSYNC, &sh->state)) {
		/* Write out to replacement devices where possible */
		for (i = 0; i < conf->raid_disks; i++)
			if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
			    test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
				set_bit(R5_WantReplace, &sh->dev[i].flags);
				set_bit(R5_LOCKED, &sh->dev[i].flags);
				s.locked++;
			}
		set_bit(STRIPE_INSYNC, &sh->state);
	}
	if ((s.syncing || s.replacing) && s.locked == 0 &&
	    test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}

	/* If the failed drives are just a ReadError, then we might need
	 * to progress the repair/check process
	 */
	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
		for (i = 0; i < s.failed; i++) {
			struct r5dev *dev = &sh->dev[s.failed_num[i]];
			if (test_bit(R5_ReadError, &dev->flags)
			    && !test_bit(R5_LOCKED, &dev->flags)
			    && test_bit(R5_UPTODATE, &dev->flags)
				) {
				/* First pass rewrites the block, the next
				 * pass reads it back.
				 */
				if (!test_bit(R5_ReWrite, &dev->flags)) {
					set_bit(R5_Wantwrite, &dev->flags);
					set_bit(R5_ReWrite, &dev->flags);
					set_bit(R5_LOCKED, &dev->flags);
					s.locked++;
				} else {
					/* let's read it back */
					set_bit(R5_Wantread, &dev->flags);
					set_bit(R5_LOCKED, &dev->flags);
					s.locked++;
				}
			}
		}


	/* Finish reconstruct operations initiated by the expansion process */
	if (sh->reconstruct_state == reconstruct_state_result) {
		struct stripe_head *sh_src
			= get_active_stripe(conf, sh->sector, 1, 1, 1);
		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
			/* sh cannot be written until sh_src has been read.
			 * so arrange for sh to be delayed a little
			 */
			set_bit(STRIPE_DELAYED, &sh->state);
			set_bit(STRIPE_HANDLE, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
					      &sh_src->state))
				atomic_inc(&conf->preread_active_stripes);
			release_stripe(sh_src);
			goto finish;
		}
		if (sh_src)
			release_stripe(sh_src);

		sh->reconstruct_state = reconstruct_state_idle;
		clear_bit(STRIPE_EXPANDING, &sh->state);
		for (i = conf->raid_disks; i--; ) {
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
			set_bit(R5_LOCKED, &sh->dev[i].flags);
			s.locked++;
		}
	}

	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
	    !sh->reconstruct_state) {
		/* Need to write out all blocks after computing parity */
		sh->disks = conf->raid_disks;
		stripe_set_idx(sh->sector, conf, 0, sh);
		schedule_reconstruction(sh, &s, 1, 1);
	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
		clear_bit(STRIPE_EXPAND_READY, &sh->state);
		atomic_dec(&conf->reshape_stripes);
		wake_up(&conf->wait_for_overlap);
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
	}

	if (s.expanding && s.locked == 0 &&
	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
		handle_stripe_expansion(conf, sh);

finish:
	/* wait for this device to become unblocked */
	if (unlikely(s.blocked_rdev)) {
		if (conf->mddev->external)
			md_wait_for_blocked_rdev(s.blocked_rdev,
						 conf->mddev);
		else
			/* Internal metadata will immediately
			 * be written by raid5d, so we don't
			 * need to wait here.
			 */
			rdev_dec_pending(s.blocked_rdev,
					 conf->mddev);
	}

	/* Drop the references analyse_stripe() took for bad-block work */
	if (s.handle_bad_blocks)
		for (i = disks; i--; ) {
			struct md_rdev *rdev;
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
				/* We own a safe reference to the rdev */
				rdev = conf->disks[i].rdev;
				if (!rdev_set_badblocks(rdev, sh->sector,
							STRIPE_SECTORS, 0))
					md_error(conf->mddev, rdev);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
				rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
				rdev = conf->disks[i].replacement;
				if (!rdev)
					/* rdev has been moved down */
					rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}

	if (s.ops_request)
		raid_run_ops(sh, s.ops_request);

	ops_run_io(sh, &s);

	if (s.dec_preread_active) {
		/* We delay this until after ops_run_io so that if make_request
		 * is waiting on a flush, it won't continue until the writes
		 * have actually been submitted.
		 */
		atomic_dec(&conf->preread_active_stripes);
		if (atomic_read(&conf->preread_active_stripes) <
		    IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);
	}

	return_io(s.return_bi);

	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}

/*
 * When fewer than IO_THRESHOLD stripes are preread-active, move every
 * stripe on conf->delayed_list to the tail of conf->hold_list, clearing
 * STRIPE_DELAYED and marking each one STRIPE_PREREAD_ACTIVE (counting it
 * in preread_active_stripes if it was not already marked).
 */
static void raid5_activate_delayed(struct r5conf *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->hold_list);
		}
	}
}

/*
 * Take every stripe off conf->bitmap_list and pass it to
 * __release_stripe() after taking a reference on it.
 */
static void activate_bit_delay(struct r5conf *conf)
{
	/* device_lock is held */
	struct list_head head;
	/* Splice the whole list onto the local head, leaving
	 * conf->bitmap_list empty.
	 */
	list_add(&head, &conf->bitmap_list);
	list_del_init(&conf->bitmap_list);
	while (!list_empty(&head)) {
		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
		list_del_init(&sh->lru);
		atomic_inc(&sh->count);
		__release_stripe(conf, sh);
	}
}

/*
 * md_raid5_congested - report whether the stripe cache is congested
 * @mddev: array to query
 * @bits:  not used by this function
 *
 * Returns 1 if conf->inactive_blocked or conf->quiesce is set, or if the
 * inactive list is empty; 0 otherwise.
 */
int md_raid5_congested(struct mddev *mddev, int bits)
{
	struct r5conf *conf = mddev->private;

	/* No difference between reads and writes.  Just check
	 * how busy the stripe_cache is
	 */

	if (conf->inactive_blocked)
		return 1;
	if (conf->quiesce)
		return 1;
	if (list_empty_careful(&conf->inactive_list))
		return 1;

	return 0;
}
EXPORT_SYMBOL_GPL(md_raid5_congested);

/*
 * Congestion callback taking the mddev as opaque @data: congested if
 * either mddev_congested() or md_raid5_congested() says so.
 */
static int raid5_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	return mddev_congested(mddev, bits) ||
		md_raid5_congested(mddev, bits);
}

/* We want read requests to align with chunks where possible,
 * but write requests don't need to.
 *
 * Returns how many bytes of @biovec may be added: the whole vector for
 * writes, otherwise the room left before the chunk boundary (never
 * negative).  The whole vector is also accepted when the bio is still
 * empty, even if it would not otherwise fit.
 */
static int raid5_mergeable_bvec(struct request_queue *q,
				struct bvec_merge_data *bvm,
				struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;

	if ((bvm->bi_rw & 1) == WRITE)
		return biovec->bv_len; /* always allow writes to be mergeable */

	/* Use the smaller of the current and the new chunk size */
	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
		chunk_sectors = mddev->new_chunk_sectors;
	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0) max = 0;
	if (max <= biovec->bv_len && bio_sectors == 0)
		return biovec->bv_len;
	else
		return max;
}


/*
 * Return non-zero if @bio does not extend past the end of the chunk it
 * starts in, using the smaller of the current and the new chunk size.
 */
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bio->bi_size >> 9;

	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
		chunk_sectors = mddev->new_chunk_sectors;
	return  chunk_sectors >=
		((sector & (chunk_sectors - 1)) + bio_sectors);
}

/*
 *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
 *  later sampled by raid5d.
3818 */ 3819 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3820 { 3821 unsigned long flags; 3822 3823 spin_lock_irqsave(&conf->device_lock, flags); 3824 3825 bi->bi_next = conf->retry_read_aligned_list; 3826 conf->retry_read_aligned_list = bi; 3827 3828 spin_unlock_irqrestore(&conf->device_lock, flags); 3829 md_wakeup_thread(conf->mddev->thread); 3830 } 3831 3832 3833 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3834 { 3835 struct bio *bi; 3836 3837 bi = conf->retry_read_aligned; 3838 if (bi) { 3839 conf->retry_read_aligned = NULL; 3840 return bi; 3841 } 3842 bi = conf->retry_read_aligned_list; 3843 if(bi) { 3844 conf->retry_read_aligned_list = bi->bi_next; 3845 bi->bi_next = NULL; 3846 /* 3847 * this sets the active strip count to 1 and the processed 3848 * strip count to zero (upper 8 bits) 3849 */ 3850 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3851 } 3852 3853 return bi; 3854 } 3855 3856 3857 /* 3858 * The "raid5_align_endio" should check if the read succeeded and if it 3859 * did, call bio_endio on the original bio (having bio_put the new bio 3860 * first). 3861 * If the read failed.. 
3862 */ 3863 static void raid5_align_endio(struct bio *bi, int error) 3864 { 3865 struct bio* raid_bi = bi->bi_private; 3866 struct mddev *mddev; 3867 struct r5conf *conf; 3868 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3869 struct md_rdev *rdev; 3870 3871 bio_put(bi); 3872 3873 rdev = (void*)raid_bi->bi_next; 3874 raid_bi->bi_next = NULL; 3875 mddev = rdev->mddev; 3876 conf = mddev->private; 3877 3878 rdev_dec_pending(rdev, conf->mddev); 3879 3880 if (!error && uptodate) { 3881 bio_endio(raid_bi, 0); 3882 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3883 wake_up(&conf->wait_for_stripe); 3884 return; 3885 } 3886 3887 3888 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3889 3890 add_bio_to_retry(raid_bi, conf); 3891 } 3892 3893 static int bio_fits_rdev(struct bio *bi) 3894 { 3895 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3896 3897 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3898 return 0; 3899 blk_recount_segments(q, bi); 3900 if (bi->bi_phys_segments > queue_max_segments(q)) 3901 return 0; 3902 3903 if (q->merge_bvec_fn) 3904 /* it's too hard to apply the merge_bvec_fn at this stage, 3905 * just just give up 3906 */ 3907 return 0; 3908 3909 return 1; 3910 } 3911 3912 3913 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3914 { 3915 struct r5conf *conf = mddev->private; 3916 int dd_idx; 3917 struct bio* align_bi; 3918 struct md_rdev *rdev; 3919 sector_t end_sector; 3920 3921 if (!in_chunk_boundary(mddev, raid_bio)) { 3922 pr_debug("chunk_aligned_read : non aligned\n"); 3923 return 0; 3924 } 3925 /* 3926 * use bio_clone_mddev to make a copy of the bio 3927 */ 3928 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3929 if (!align_bi) 3930 return 0; 3931 /* 3932 * set bi_end_io to a new function, and set bi_private to the 3933 * original bio. 
3934 */ 3935 align_bi->bi_end_io = raid5_align_endio; 3936 align_bi->bi_private = raid_bio; 3937 /* 3938 * compute position 3939 */ 3940 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3941 0, 3942 &dd_idx, NULL); 3943 3944 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3945 rcu_read_lock(); 3946 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3947 if (!rdev || test_bit(Faulty, &rdev->flags) || 3948 rdev->recovery_offset < end_sector) { 3949 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3950 if (rdev && 3951 (test_bit(Faulty, &rdev->flags) || 3952 !(test_bit(In_sync, &rdev->flags) || 3953 rdev->recovery_offset >= end_sector))) 3954 rdev = NULL; 3955 } 3956 if (rdev) { 3957 sector_t first_bad; 3958 int bad_sectors; 3959 3960 atomic_inc(&rdev->nr_pending); 3961 rcu_read_unlock(); 3962 raid_bio->bi_next = (void*)rdev; 3963 align_bi->bi_bdev = rdev->bdev; 3964 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3965 3966 if (!bio_fits_rdev(align_bi) || 3967 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3968 &first_bad, &bad_sectors)) { 3969 /* too big in some way, or has a known bad block */ 3970 bio_put(align_bi); 3971 rdev_dec_pending(rdev, mddev); 3972 return 0; 3973 } 3974 3975 /* No reshape active, so we can trust rdev->data_offset */ 3976 align_bi->bi_sector += rdev->data_offset; 3977 3978 spin_lock_irq(&conf->device_lock); 3979 wait_event_lock_irq(conf->wait_for_stripe, 3980 conf->quiesce == 0, 3981 conf->device_lock); 3982 atomic_inc(&conf->active_aligned_reads); 3983 spin_unlock_irq(&conf->device_lock); 3984 3985 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 3986 align_bi, disk_devt(mddev->gendisk), 3987 raid_bio->bi_sector); 3988 generic_make_request(align_bi); 3989 return 1; 3990 } else { 3991 rcu_read_unlock(); 3992 bio_put(align_bi); 3993 return 0; 3994 } 3995 } 3996 3997 /* __get_priority_stripe - get the next stripe to process 3998 * 3999 * Full stripe writes are allowed to pass preread 
active stripes up until 4000 * the bypass_threshold is exceeded. In general the bypass_count 4001 * increments when the handle_list is handled before the hold_list; however, it 4002 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4003 * stripe with in flight i/o. The bypass_count will be reset when the 4004 * head of the hold_list has changed, i.e. the head was promoted to the 4005 * handle_list. 4006 */ 4007 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4008 { 4009 struct stripe_head *sh; 4010 4011 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4012 __func__, 4013 list_empty(&conf->handle_list) ? "empty" : "busy", 4014 list_empty(&conf->hold_list) ? "empty" : "busy", 4015 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4016 4017 if (!list_empty(&conf->handle_list)) { 4018 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4019 4020 if (list_empty(&conf->hold_list)) 4021 conf->bypass_count = 0; 4022 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4023 if (conf->hold_list.next == conf->last_hold) 4024 conf->bypass_count++; 4025 else { 4026 conf->last_hold = conf->hold_list.next; 4027 conf->bypass_count -= conf->bypass_threshold; 4028 if (conf->bypass_count < 0) 4029 conf->bypass_count = 0; 4030 } 4031 } 4032 } else if (!list_empty(&conf->hold_list) && 4033 ((conf->bypass_threshold && 4034 conf->bypass_count > conf->bypass_threshold) || 4035 atomic_read(&conf->pending_full_writes) == 0)) { 4036 sh = list_entry(conf->hold_list.next, 4037 typeof(*sh), lru); 4038 conf->bypass_count -= conf->bypass_threshold; 4039 if (conf->bypass_count < 0) 4040 conf->bypass_count = 0; 4041 } else 4042 return NULL; 4043 4044 list_del_init(&sh->lru); 4045 atomic_inc(&sh->count); 4046 BUG_ON(atomic_read(&sh->count) != 1); 4047 return sh; 4048 } 4049 4050 struct raid5_plug_cb { 4051 struct blk_plug_cb cb; 4052 struct list_head list; 4053 }; 4054 4055 static void raid5_unplug(struct 
blk_plug_cb *blk_cb, bool from_schedule) 4056 { 4057 struct raid5_plug_cb *cb = container_of( 4058 blk_cb, struct raid5_plug_cb, cb); 4059 struct stripe_head *sh; 4060 struct mddev *mddev = cb->cb.data; 4061 struct r5conf *conf = mddev->private; 4062 int cnt = 0; 4063 4064 if (cb->list.next && !list_empty(&cb->list)) { 4065 spin_lock_irq(&conf->device_lock); 4066 while (!list_empty(&cb->list)) { 4067 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4068 list_del_init(&sh->lru); 4069 /* 4070 * avoid race release_stripe_plug() sees 4071 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4072 * is still in our list 4073 */ 4074 smp_mb__before_clear_bit(); 4075 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4076 __release_stripe(conf, sh); 4077 cnt++; 4078 } 4079 spin_unlock_irq(&conf->device_lock); 4080 } 4081 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4082 kfree(cb); 4083 } 4084 4085 static void release_stripe_plug(struct mddev *mddev, 4086 struct stripe_head *sh) 4087 { 4088 struct blk_plug_cb *blk_cb = blk_check_plugged( 4089 raid5_unplug, mddev, 4090 sizeof(struct raid5_plug_cb)); 4091 struct raid5_plug_cb *cb; 4092 4093 if (!blk_cb) { 4094 release_stripe(sh); 4095 return; 4096 } 4097 4098 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4099 4100 if (cb->list.next == NULL) 4101 INIT_LIST_HEAD(&cb->list); 4102 4103 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4104 list_add_tail(&sh->lru, &cb->list); 4105 else 4106 release_stripe(sh); 4107 } 4108 4109 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4110 { 4111 struct r5conf *conf = mddev->private; 4112 sector_t logical_sector, last_sector; 4113 struct stripe_head *sh; 4114 int remaining; 4115 int stripe_sectors; 4116 4117 if (mddev->reshape_position != MaxSector) 4118 /* Skip discard while reshape is happening */ 4119 return; 4120 4121 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4122 last_sector = bi->bi_sector + (bi->bi_size>>9); 4123 4124 
bi->bi_next = NULL; 4125 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4126 4127 stripe_sectors = conf->chunk_sectors * 4128 (conf->raid_disks - conf->max_degraded); 4129 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4130 stripe_sectors); 4131 sector_div(last_sector, stripe_sectors); 4132 4133 logical_sector *= conf->chunk_sectors; 4134 last_sector *= conf->chunk_sectors; 4135 4136 for (; logical_sector < last_sector; 4137 logical_sector += STRIPE_SECTORS) { 4138 DEFINE_WAIT(w); 4139 int d; 4140 again: 4141 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4142 prepare_to_wait(&conf->wait_for_overlap, &w, 4143 TASK_UNINTERRUPTIBLE); 4144 spin_lock_irq(&sh->stripe_lock); 4145 for (d = 0; d < conf->raid_disks; d++) { 4146 if (d == sh->pd_idx || d == sh->qd_idx) 4147 continue; 4148 if (sh->dev[d].towrite || sh->dev[d].toread) { 4149 set_bit(R5_Overlap, &sh->dev[d].flags); 4150 spin_unlock_irq(&sh->stripe_lock); 4151 release_stripe(sh); 4152 schedule(); 4153 goto again; 4154 } 4155 } 4156 finish_wait(&conf->wait_for_overlap, &w); 4157 for (d = 0; d < conf->raid_disks; d++) { 4158 if (d == sh->pd_idx || d == sh->qd_idx) 4159 continue; 4160 sh->dev[d].towrite = bi; 4161 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4162 raid5_inc_bi_active_stripes(bi); 4163 } 4164 spin_unlock_irq(&sh->stripe_lock); 4165 if (conf->mddev->bitmap) { 4166 for (d = 0; 4167 d < conf->raid_disks - conf->max_degraded; 4168 d++) 4169 bitmap_startwrite(mddev->bitmap, 4170 sh->sector, 4171 STRIPE_SECTORS, 4172 0); 4173 sh->bm_seq = conf->seq_flush + 1; 4174 set_bit(STRIPE_BIT_DELAY, &sh->state); 4175 } 4176 4177 set_bit(STRIPE_HANDLE, &sh->state); 4178 clear_bit(STRIPE_DELAYED, &sh->state); 4179 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4180 atomic_inc(&conf->preread_active_stripes); 4181 release_stripe_plug(mddev, sh); 4182 } 4183 4184 remaining = raid5_dec_bi_active_stripes(bi); 4185 if (remaining == 0) { 4186 md_write_end(mddev); 4187 bio_endio(bi, 
/*
 * make_request - entry point for a bio submitted to a RAID-4/5/6 array.
 *
 * Flush bios are passed to md_flush_request() and discards to
 * make_discard_request().  A read issued while no reshape is recorded
 * (mddev->reshape_position == MaxSector) is first offered to
 * chunk_aligned_read(); if that returns non-zero nothing more is done here.
 *
 * Everything else is walked in STRIPE_SECTORS steps.  For each step the
 * logical sector is mapped with raid5_compute_sector(), a stripe_head is
 * obtained with get_active_stripe() and the bio is attached to it with
 * add_stripe_bio().  Whenever a step cannot proceed (sector inside the
 * not-yet-safe part of a reshape, reshape moved on while we waited, write
 * inside the suspend_lo..suspend_hi range, stripe marked STRIPE_EXPANDING,
 * or add_stripe_bio() failing) the task sleeps on conf->wait_for_overlap
 * and the same logical sector is retried.
 *
 * bi->bi_phys_segments is over-loaded as a count of active stripes.  It is
 * initialised to 1 here and dropped again at the end of the function; the
 * bio is ended here only if raid5_dec_bi_active_stripes() then reports
 * that nothing else is outstanding.
 */
static void make_request(struct mddev *mddev, struct bio * bi)
{
	struct r5conf *conf = mddev->private;
	int dd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;
	const int rw = bio_data_dir(bi);
	int remaining;

	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bi);
		return;
	}

	md_write_start(mddev, bi);

	/* Fast path: only attempted for reads while no reshape is recorded. */
	if (rw == READ &&
	     mddev->reshape_position == MaxSector &&
	     chunk_aligned_read(mddev,bi))
		return;

	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
		make_discard_request(mddev, bi);
		return;
	}

	/* Round the start down to a stripe boundary; last_sector is exclusive. */
	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */

	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
		DEFINE_WAIT(w);
		int previous;

	retry:
		/* 'previous' selects the pre-reshape geometry for the mapping
		 * below.  We queue ourselves on wait_for_overlap *before*
		 * testing any condition so that a wake-up issued between the
		 * test and schedule() is not lost.
		 */
		previous = 0;
		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
		if (unlikely(conf->reshape_progress != MaxSector)) {
			/* The spinlock is needed as reshape_progress may be
			 * 64bit on a 32bit platform, and so it might be
			 * possible to see a half-updated value.
			 * Of course reshape_progress could change after
			 * the lock is dropped, so once we get a reference
			 * to the stripe that we think it is, we will have
			 * to check again.
			 */
			spin_lock_irq(&conf->device_lock);
			if (mddev->reshape_backwards
			    ? logical_sector < conf->reshape_progress
			    : logical_sector >= conf->reshape_progress) {
				/* Not yet reached by the reshape: old layout. */
				previous = 1;
			} else {
				/* Already reshaped, but not yet covered by
				 * reshape_safe: wait and look again.
				 */
				if (mddev->reshape_backwards
				    ? logical_sector < conf->reshape_safe
				    : logical_sector >= conf->reshape_safe) {
					spin_unlock_irq(&conf->device_lock);
					schedule();
					goto retry;
				}
			}
			spin_unlock_irq(&conf->device_lock);
		}

		new_sector = raid5_compute_sector(conf, logical_sector,
						  previous,
						  &dd_idx, NULL);
		pr_debug("raid456: make_request, sector %llu logical %llu\n",
			(unsigned long long)new_sector,
			(unsigned long long)logical_sector);

		/* The RWA_MASK bit of the bio is passed through; the NULL
		 * return handled at the bottom of the loop is described there
		 * as the read-ahead case.
		 */
		sh = get_active_stripe(conf, new_sector, previous,
				       (bi->bi_rw&RWA_MASK), 0);
		if (sh) {
			if (unlikely(previous)) {
				/* expansion might have moved on while waiting for a
				 * stripe, so we must do the range check again.
				 * Expansion could still move past after this
				 * test, but as we are holding a reference to
				 * 'sh', we know that if that happens,
				 * STRIPE_EXPANDING will get set and the expansion
				 * won't proceed until we finish with the stripe.
				 */
				int must_retry = 0;
				spin_lock_irq(&conf->device_lock);
				if (mddev->reshape_backwards
				    ? logical_sector >= conf->reshape_progress
				    : logical_sector < conf->reshape_progress)
					/* mismatch, need to try again */
					must_retry = 1;
				spin_unlock_irq(&conf->device_lock);
				if (must_retry) {
					release_stripe(sh);
					schedule();
					goto retry;
				}
			}

			if (rw == WRITE &&
			    logical_sector >= mddev->suspend_lo &&
			    logical_sector < mddev->suspend_hi) {
				release_stripe(sh);
				/* As the suspend_* range is controlled by
				 * userspace, we want an interruptible
				 * wait.
				 */
				flush_signals(current);
				prepare_to_wait(&conf->wait_for_overlap,
						&w, TASK_INTERRUPTIBLE);
				/* Re-test after queueing: the range may have
				 * been moved in the meantime.
				 */
				if (logical_sector >= mddev->suspend_lo &&
				    logical_sector < mddev->suspend_hi)
					schedule();
				goto retry;
			}

			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
				/* Stripe is busy expanding or
				 * add failed due to overlap.  Flush everything
				 * and wait a while
				 */
				md_wakeup_thread(mddev->thread);
				release_stripe(sh);
				schedule();
				goto retry;
			}
			/* The bio is attached: stop waiting and queue the stripe. */
			finish_wait(&conf->wait_for_overlap, &w);
			set_bit(STRIPE_HANDLE, &sh->state);
			clear_bit(STRIPE_DELAYED, &sh->state);
			/* Count the stripe as preread-active once only, hence
			 * the test_and_set_bit().
			 */
			if ((bi->bi_rw & REQ_SYNC) &&
			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			release_stripe_plug(mddev, sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			finish_wait(&conf->wait_for_overlap, &w);
			break;
		}
	}

	/* Drop the count taken when bi_phys_segments was set to 1 above. */
	remaining = raid5_dec_bi_active_stripes(bi);
	if (remaining == 0) {

		if ( rw == WRITE )
			md_write_end(mddev);

		bio_endio(bi, 0);
	}
}

/* Defined further down; declared here because reshape_request() uses it. */
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
4355 */ 4356 struct r5conf *conf = mddev->private; 4357 struct stripe_head *sh; 4358 sector_t first_sector, last_sector; 4359 int raid_disks = conf->previous_raid_disks; 4360 int data_disks = raid_disks - conf->max_degraded; 4361 int new_data_disks = conf->raid_disks - conf->max_degraded; 4362 int i; 4363 int dd_idx; 4364 sector_t writepos, readpos, safepos; 4365 sector_t stripe_addr; 4366 int reshape_sectors; 4367 struct list_head stripes; 4368 4369 if (sector_nr == 0) { 4370 /* If restarting in the middle, skip the initial sectors */ 4371 if (mddev->reshape_backwards && 4372 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4373 sector_nr = raid5_size(mddev, 0, 0) 4374 - conf->reshape_progress; 4375 } else if (!mddev->reshape_backwards && 4376 conf->reshape_progress > 0) 4377 sector_nr = conf->reshape_progress; 4378 sector_div(sector_nr, new_data_disks); 4379 if (sector_nr) { 4380 mddev->curr_resync_completed = sector_nr; 4381 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4382 *skipped = 1; 4383 return sector_nr; 4384 } 4385 } 4386 4387 /* We need to process a full chunk at a time. 4388 * If old and new chunk sizes differ, we need to process the 4389 * largest of these 4390 */ 4391 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4392 reshape_sectors = mddev->new_chunk_sectors; 4393 else 4394 reshape_sectors = mddev->chunk_sectors; 4395 4396 /* We update the metadata at least every 10 seconds, or when 4397 * the data about to be copied would over-write the source of 4398 * the data at the front of the range. i.e. 
one new_stripe 4399 * along from reshape_progress new_maps to after where 4400 * reshape_safe old_maps to 4401 */ 4402 writepos = conf->reshape_progress; 4403 sector_div(writepos, new_data_disks); 4404 readpos = conf->reshape_progress; 4405 sector_div(readpos, data_disks); 4406 safepos = conf->reshape_safe; 4407 sector_div(safepos, data_disks); 4408 if (mddev->reshape_backwards) { 4409 writepos -= min_t(sector_t, reshape_sectors, writepos); 4410 readpos += reshape_sectors; 4411 safepos += reshape_sectors; 4412 } else { 4413 writepos += reshape_sectors; 4414 readpos -= min_t(sector_t, reshape_sectors, readpos); 4415 safepos -= min_t(sector_t, reshape_sectors, safepos); 4416 } 4417 4418 /* Having calculated the 'writepos' possibly use it 4419 * to set 'stripe_addr' which is where we will write to. 4420 */ 4421 if (mddev->reshape_backwards) { 4422 BUG_ON(conf->reshape_progress == 0); 4423 stripe_addr = writepos; 4424 BUG_ON((mddev->dev_sectors & 4425 ~((sector_t)reshape_sectors - 1)) 4426 - reshape_sectors - stripe_addr 4427 != sector_nr); 4428 } else { 4429 BUG_ON(writepos != sector_nr + reshape_sectors); 4430 stripe_addr = sector_nr; 4431 } 4432 4433 /* 'writepos' is the most advanced device address we might write. 4434 * 'readpos' is the least advanced device address we might read. 4435 * 'safepos' is the least address recorded in the metadata as having 4436 * been reshaped. 4437 * If there is a min_offset_diff, these are adjusted either by 4438 * increasing the safepos/readpos if diff is negative, or 4439 * increasing writepos if diff is positive. 4440 * If 'readpos' is then behind 'writepos', there is no way that we can 4441 * ensure safety in the face of a crash - that must be done by userspace 4442 * making a backup of the data. So in that case there is no particular 4443 * rush to update metadata. 
4444 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4445 * update the metadata to advance 'safepos' to match 'readpos' so that 4446 * we can be safe in the event of a crash. 4447 * So we insist on updating metadata if safepos is behind writepos and 4448 * readpos is beyond writepos. 4449 * In any case, update the metadata every 10 seconds. 4450 * Maybe that number should be configurable, but I'm not sure it is 4451 * worth it.... maybe it could be a multiple of safemode_delay??? 4452 */ 4453 if (conf->min_offset_diff < 0) { 4454 safepos += -conf->min_offset_diff; 4455 readpos += -conf->min_offset_diff; 4456 } else 4457 writepos += conf->min_offset_diff; 4458 4459 if ((mddev->reshape_backwards 4460 ? (safepos > writepos && readpos < writepos) 4461 : (safepos < writepos && readpos > writepos)) || 4462 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4463 /* Cannot proceed until we've updated the superblock... */ 4464 wait_event(conf->wait_for_overlap, 4465 atomic_read(&conf->reshape_stripes)==0); 4466 mddev->reshape_position = conf->reshape_progress; 4467 mddev->curr_resync_completed = sector_nr; 4468 conf->reshape_checkpoint = jiffies; 4469 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4470 md_wakeup_thread(mddev->thread); 4471 wait_event(mddev->sb_wait, mddev->flags == 0 || 4472 kthread_should_stop()); 4473 spin_lock_irq(&conf->device_lock); 4474 conf->reshape_safe = mddev->reshape_position; 4475 spin_unlock_irq(&conf->device_lock); 4476 wake_up(&conf->wait_for_overlap); 4477 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4478 } 4479 4480 INIT_LIST_HEAD(&stripes); 4481 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4482 int j; 4483 int skipped_disk = 0; 4484 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4485 set_bit(STRIPE_EXPANDING, &sh->state); 4486 atomic_inc(&conf->reshape_stripes); 4487 /* If any of this stripe is beyond the end of the old 4488 * array, then we need to zero those blocks 4489 */ 4490 for 
(j=sh->disks; j--;) { 4491 sector_t s; 4492 if (j == sh->pd_idx) 4493 continue; 4494 if (conf->level == 6 && 4495 j == sh->qd_idx) 4496 continue; 4497 s = compute_blocknr(sh, j, 0); 4498 if (s < raid5_size(mddev, 0, 0)) { 4499 skipped_disk = 1; 4500 continue; 4501 } 4502 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4503 set_bit(R5_Expanded, &sh->dev[j].flags); 4504 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4505 } 4506 if (!skipped_disk) { 4507 set_bit(STRIPE_EXPAND_READY, &sh->state); 4508 set_bit(STRIPE_HANDLE, &sh->state); 4509 } 4510 list_add(&sh->lru, &stripes); 4511 } 4512 spin_lock_irq(&conf->device_lock); 4513 if (mddev->reshape_backwards) 4514 conf->reshape_progress -= reshape_sectors * new_data_disks; 4515 else 4516 conf->reshape_progress += reshape_sectors * new_data_disks; 4517 spin_unlock_irq(&conf->device_lock); 4518 /* Ok, those stripe are ready. We can start scheduling 4519 * reads on the source stripes. 4520 * The source stripes are determined by mapping the first and last 4521 * block on the destination stripes. 
4522 */ 4523 first_sector = 4524 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4525 1, &dd_idx, NULL); 4526 last_sector = 4527 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4528 * new_data_disks - 1), 4529 1, &dd_idx, NULL); 4530 if (last_sector >= mddev->dev_sectors) 4531 last_sector = mddev->dev_sectors - 1; 4532 while (first_sector <= last_sector) { 4533 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4534 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4535 set_bit(STRIPE_HANDLE, &sh->state); 4536 release_stripe(sh); 4537 first_sector += STRIPE_SECTORS; 4538 } 4539 /* Now that the sources are clearly marked, we can release 4540 * the destination stripes 4541 */ 4542 while (!list_empty(&stripes)) { 4543 sh = list_entry(stripes.next, struct stripe_head, lru); 4544 list_del_init(&sh->lru); 4545 release_stripe(sh); 4546 } 4547 /* If this takes us to the resync_max point where we have to pause, 4548 * then we need to write out the superblock. 4549 */ 4550 sector_nr += reshape_sectors; 4551 if ((sector_nr - mddev->curr_resync_completed) * 2 4552 >= mddev->resync_max - mddev->curr_resync_completed) { 4553 /* Cannot proceed until we've updated the superblock... 
/*
 * sync_request - perform one step of resync/recovery, or of a reshape,
 * starting at sector_nr.
 *
 * @mddev:     array being synchronised
 * @sector_nr: device sector at which to continue
 * @skipped:   set to 1 when the returned range was skipped rather than
 *             handled through a stripe
 * @go_faster: unused
 *
 * Returns the number of sectors dealt with by this call.  A call with
 * sector_nr at or beyond mddev->dev_sectors only finishes up (ends the
 * reshape, or closes the bitmap sync) and returns 0.  While
 * MD_RECOVERY_RESHAPE is set the work and the return value are those of
 * reshape_request().
 */
/* FIXME go_faster isn't used */
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
	struct r5conf *conf = mddev->private;
	struct stripe_head *sh;
	sector_t max_sector = mddev->dev_sectors;
	sector_t sync_blocks;
	int still_degraded = 0;
	int i;

	if (sector_nr >= max_sector) {
		/* just being told to finish up .. nothing much to do */

		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
			end_reshape(conf);
			return 0;
		}

		if (mddev->curr_resync < max_sector) /* aborted */
			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
					&sync_blocks, 1);
		else /* completed sync */
			conf->fullsync = 0;
		bitmap_close_sync(mddev->bitmap);

		return 0;
	}

	/* Allow raid5_quiesce to complete */
	wait_event(conf->wait_for_overlap, conf->quiesce != 2);

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return reshape_request(mddev, sector_nr, skipped);

	/* No need to check resync_max as we never do more than one
	 * stripe, and as resync_max will always be on a chunk boundary,
	 * if the check in md_do_sync didn't fire, there is no chance
	 * of overstepping resync_max here
	 */

	/* If there are too many failed drives and we are trying
	 * to resync, then assert that we are finished, because there is
	 * nothing we can do.
	 */
	if (mddev->degraded >= conf->max_degraded &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* Report everything up to the end of the device as skipped. */
		sector_t rv = mddev->dev_sectors - sector_nr;
		*skipped = 1;
		return rv;
	}
	/* Skipping is only considered when this was not an explicitly
	 * requested pass (MD_RECOVERY_REQUESTED), no full sync is pending,
	 * and at least one whole stripe is covered.
	 */
	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
		/* we can skip this block, and probably more */
		sync_blocks /= STRIPE_SECTORS;
		*skipped = 1;
		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
	}

	bitmap_cond_end_sync(mddev->bitmap, sector_nr);

	/* First attempt passes 1 as the fourth argument and may come back
	 * empty-handed; the retry passes 0 there.
	 * NOTE(review): presumably that argument means "do not block" -
	 * confirm against get_active_stripe().
	 */
	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
	if (sh == NULL) {
		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
		/* make sure we don't swamp the stripe cache if someone else
		 * is trying to get access
		 */
		schedule_timeout_uninterruptible(1);
	}
	/* Need to check if array will still be degraded after recovery/resync
	 * We don't need to check the 'failed' flag as when that gets set,
	 * recovery aborts.
	 */
	for (i = 0; i < conf->raid_disks; i++)
		if (conf->disks[i].rdev == NULL)
			still_degraded = 1;

	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);

	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);

	/* The stripe is handled synchronously here rather than being queued. */
	handle_stripe(sh);
	release_stripe(sh);

	return STRIPE_SECTORS;
}
4668 * 4669 * We *know* that this entire raid_bio is in one chunk, so 4670 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4671 */ 4672 struct stripe_head *sh; 4673 int dd_idx; 4674 sector_t sector, logical_sector, last_sector; 4675 int scnt = 0; 4676 int remaining; 4677 int handled = 0; 4678 4679 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4680 sector = raid5_compute_sector(conf, logical_sector, 4681 0, &dd_idx, NULL); 4682 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4683 4684 for (; logical_sector < last_sector; 4685 logical_sector += STRIPE_SECTORS, 4686 sector += STRIPE_SECTORS, 4687 scnt++) { 4688 4689 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4690 /* already done this stripe */ 4691 continue; 4692 4693 sh = get_active_stripe(conf, sector, 0, 1, 0); 4694 4695 if (!sh) { 4696 /* failed to get a stripe - must wait */ 4697 raid5_set_bi_processed_stripes(raid_bio, scnt); 4698 conf->retry_read_aligned = raid_bio; 4699 return handled; 4700 } 4701 4702 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4703 release_stripe(sh); 4704 raid5_set_bi_processed_stripes(raid_bio, scnt); 4705 conf->retry_read_aligned = raid_bio; 4706 return handled; 4707 } 4708 4709 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4710 handle_stripe(sh); 4711 release_stripe(sh); 4712 handled++; 4713 } 4714 remaining = raid5_dec_bi_active_stripes(raid_bio); 4715 if (remaining == 0) 4716 bio_endio(raid_bio, 0); 4717 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4718 wake_up(&conf->wait_for_stripe); 4719 return handled; 4720 } 4721 4722 #define MAX_STRIPE_BATCH 8 4723 static int handle_active_stripes(struct r5conf *conf) 4724 { 4725 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4726 int i, batch_size = 0; 4727 4728 while (batch_size < MAX_STRIPE_BATCH && 4729 (sh = __get_priority_stripe(conf)) != NULL) 4730 batch[batch_size++] = sh; 4731 4732 if (batch_size == 0) 4733 return batch_size; 4734 
spin_unlock_irq(&conf->device_lock); 4735 4736 for (i = 0; i < batch_size; i++) 4737 handle_stripe(batch[i]); 4738 4739 cond_resched(); 4740 4741 spin_lock_irq(&conf->device_lock); 4742 for (i = 0; i < batch_size; i++) 4743 __release_stripe(conf, batch[i]); 4744 return batch_size; 4745 } 4746 4747 /* 4748 * This is our raid5 kernel thread. 4749 * 4750 * We scan the hash table for stripes which can be handled now. 4751 * During the scan, completed stripes are saved for us by the interrupt 4752 * handler, so that they will not have to wait for our next wakeup. 4753 */ 4754 static void raid5d(struct md_thread *thread) 4755 { 4756 struct mddev *mddev = thread->mddev; 4757 struct r5conf *conf = mddev->private; 4758 int handled; 4759 struct blk_plug plug; 4760 4761 pr_debug("+++ raid5d active\n"); 4762 4763 md_check_recovery(mddev); 4764 4765 blk_start_plug(&plug); 4766 handled = 0; 4767 spin_lock_irq(&conf->device_lock); 4768 while (1) { 4769 struct bio *bio; 4770 int batch_size; 4771 4772 if ( 4773 !list_empty(&conf->bitmap_list)) { 4774 /* Now is a good time to flush some bitmap updates */ 4775 conf->seq_flush++; 4776 spin_unlock_irq(&conf->device_lock); 4777 bitmap_unplug(mddev->bitmap); 4778 spin_lock_irq(&conf->device_lock); 4779 conf->seq_write = conf->seq_flush; 4780 activate_bit_delay(conf); 4781 } 4782 raid5_activate_delayed(conf); 4783 4784 while ((bio = remove_bio_from_retry(conf))) { 4785 int ok; 4786 spin_unlock_irq(&conf->device_lock); 4787 ok = retry_aligned_read(conf, bio); 4788 spin_lock_irq(&conf->device_lock); 4789 if (!ok) 4790 break; 4791 handled++; 4792 } 4793 4794 batch_size = handle_active_stripes(conf); 4795 if (!batch_size) 4796 break; 4797 handled += batch_size; 4798 4799 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4800 spin_unlock_irq(&conf->device_lock); 4801 md_check_recovery(mddev); 4802 spin_lock_irq(&conf->device_lock); 4803 } 4804 } 4805 pr_debug("%d stripes handled\n", handled); 4806 4807 spin_unlock_irq(&conf->device_lock); 4808 
4809 async_tx_issue_pending_all(); 4810 blk_finish_plug(&plug); 4811 4812 pr_debug("--- raid5d inactive\n"); 4813 } 4814 4815 static ssize_t 4816 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4817 { 4818 struct r5conf *conf = mddev->private; 4819 if (conf) 4820 return sprintf(page, "%d\n", conf->max_nr_stripes); 4821 else 4822 return 0; 4823 } 4824 4825 int 4826 raid5_set_cache_size(struct mddev *mddev, int size) 4827 { 4828 struct r5conf *conf = mddev->private; 4829 int err; 4830 4831 if (size <= 16 || size > 32768) 4832 return -EINVAL; 4833 while (size < conf->max_nr_stripes) { 4834 if (drop_one_stripe(conf)) 4835 conf->max_nr_stripes--; 4836 else 4837 break; 4838 } 4839 err = md_allow_write(mddev); 4840 if (err) 4841 return err; 4842 while (size > conf->max_nr_stripes) { 4843 if (grow_one_stripe(conf)) 4844 conf->max_nr_stripes++; 4845 else break; 4846 } 4847 return 0; 4848 } 4849 EXPORT_SYMBOL(raid5_set_cache_size); 4850 4851 static ssize_t 4852 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4853 { 4854 struct r5conf *conf = mddev->private; 4855 unsigned long new; 4856 int err; 4857 4858 if (len >= PAGE_SIZE) 4859 return -EINVAL; 4860 if (!conf) 4861 return -ENODEV; 4862 4863 if (strict_strtoul(page, 10, &new)) 4864 return -EINVAL; 4865 err = raid5_set_cache_size(mddev, new); 4866 if (err) 4867 return err; 4868 return len; 4869 } 4870 4871 static struct md_sysfs_entry 4872 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4873 raid5_show_stripe_cache_size, 4874 raid5_store_stripe_cache_size); 4875 4876 static ssize_t 4877 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4878 { 4879 struct r5conf *conf = mddev->private; 4880 if (conf) 4881 return sprintf(page, "%d\n", conf->bypass_threshold); 4882 else 4883 return 0; 4884 } 4885 4886 static ssize_t 4887 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4888 { 4889 struct r5conf *conf = 
mddev->private; 4890 unsigned long new; 4891 if (len >= PAGE_SIZE) 4892 return -EINVAL; 4893 if (!conf) 4894 return -ENODEV; 4895 4896 if (strict_strtoul(page, 10, &new)) 4897 return -EINVAL; 4898 if (new > conf->max_nr_stripes) 4899 return -EINVAL; 4900 conf->bypass_threshold = new; 4901 return len; 4902 } 4903 4904 static struct md_sysfs_entry 4905 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4906 S_IRUGO | S_IWUSR, 4907 raid5_show_preread_threshold, 4908 raid5_store_preread_threshold); 4909 4910 static ssize_t 4911 stripe_cache_active_show(struct mddev *mddev, char *page) 4912 { 4913 struct r5conf *conf = mddev->private; 4914 if (conf) 4915 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4916 else 4917 return 0; 4918 } 4919 4920 static struct md_sysfs_entry 4921 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4922 4923 static struct attribute *raid5_attrs[] = { 4924 &raid5_stripecache_size.attr, 4925 &raid5_stripecache_active.attr, 4926 &raid5_preread_bypass_threshold.attr, 4927 NULL, 4928 }; 4929 static struct attribute_group raid5_attrs_group = { 4930 .name = NULL, 4931 .attrs = raid5_attrs, 4932 }; 4933 4934 static sector_t 4935 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4936 { 4937 struct r5conf *conf = mddev->private; 4938 4939 if (!sectors) 4940 sectors = mddev->dev_sectors; 4941 if (!raid_disks) 4942 /* size is defined by the smallest of previous and new size */ 4943 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4944 4945 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4946 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4947 return sectors * (raid_disks - conf->max_degraded); 4948 } 4949 4950 static void raid5_free_percpu(struct r5conf *conf) 4951 { 4952 struct raid5_percpu *percpu; 4953 unsigned long cpu; 4954 4955 if (!conf->percpu) 4956 return; 4957 4958 get_online_cpus(); 4959 for_each_possible_cpu(cpu) { 4960 percpu = per_cpu_ptr(conf->percpu, cpu); 
4961 safe_put_page(percpu->spare_page); 4962 kfree(percpu->scribble); 4963 } 4964 #ifdef CONFIG_HOTPLUG_CPU 4965 unregister_cpu_notifier(&conf->cpu_notify); 4966 #endif 4967 put_online_cpus(); 4968 4969 free_percpu(conf->percpu); 4970 } 4971 4972 static void free_conf(struct r5conf *conf) 4973 { 4974 shrink_stripes(conf); 4975 raid5_free_percpu(conf); 4976 kfree(conf->disks); 4977 kfree(conf->stripe_hashtbl); 4978 kfree(conf); 4979 } 4980 4981 #ifdef CONFIG_HOTPLUG_CPU 4982 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4983 void *hcpu) 4984 { 4985 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4986 long cpu = (long)hcpu; 4987 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4988 4989 switch (action) { 4990 case CPU_UP_PREPARE: 4991 case CPU_UP_PREPARE_FROZEN: 4992 if (conf->level == 6 && !percpu->spare_page) 4993 percpu->spare_page = alloc_page(GFP_KERNEL); 4994 if (!percpu->scribble) 4995 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4996 4997 if (!percpu->scribble || 4998 (conf->level == 6 && !percpu->spare_page)) { 4999 safe_put_page(percpu->spare_page); 5000 kfree(percpu->scribble); 5001 pr_err("%s: failed memory allocation for cpu%ld\n", 5002 __func__, cpu); 5003 return notifier_from_errno(-ENOMEM); 5004 } 5005 break; 5006 case CPU_DEAD: 5007 case CPU_DEAD_FROZEN: 5008 safe_put_page(percpu->spare_page); 5009 kfree(percpu->scribble); 5010 percpu->spare_page = NULL; 5011 percpu->scribble = NULL; 5012 break; 5013 default: 5014 break; 5015 } 5016 return NOTIFY_OK; 5017 } 5018 #endif 5019 5020 static int raid5_alloc_percpu(struct r5conf *conf) 5021 { 5022 unsigned long cpu; 5023 struct page *spare_page; 5024 struct raid5_percpu __percpu *allcpus; 5025 void *scribble; 5026 int err; 5027 5028 allcpus = alloc_percpu(struct raid5_percpu); 5029 if (!allcpus) 5030 return -ENOMEM; 5031 conf->percpu = allcpus; 5032 5033 get_online_cpus(); 5034 err = 0; 5035 for_each_present_cpu(cpu) { 5036 if 
/*
 * setup_conf - validate the requested geometry and build the r5conf for
 * an array.
 *
 * Checks mddev->new_level (4, 5 or 6), the layout for levels 5 and 6, the
 * minimum device count for level 6 and the new chunk size, then allocates
 * and initialises the r5conf, attaches every member device that has a
 * raid_disk slot within range, grows the stripe cache to NR_STRIPES and
 * registers the raid5d thread in conf->thread.
 *
 * Returns the new conf, or an ERR_PTR():
 *   -EIO     unsupported level or layout
 *   -EINVAL  fewer than 4 devices for level 6, or an invalid chunk size
 *   -ENOMEM  the r5conf itself could not be allocated
 *   -EIO     any later failure, including later allocation failures and
 *            two devices claiming the same slot; everything set up so
 *            far is released with free_conf()
 */
static struct r5conf *setup_conf(struct mddev *mddev)
{
	struct r5conf *conf;
	int raid_disk, memory, max_disks;
	struct md_rdev *rdev;
	struct disk_info *disk;
	/* "raid" + one digit + NUL; new_level is checked to be 4, 5 or 6 below */
	char pers_name[6];

	if (mddev->new_level != 5
	    && mddev->new_level != 4
	    && mddev->new_level != 6) {
		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
		       mdname(mddev), mddev->new_level);
		return ERR_PTR(-EIO);
	}
	if ((mddev->new_level == 5
	     && !algorithm_valid_raid5(mddev->new_layout)) ||
	    (mddev->new_level == 6
	     && !algorithm_valid_raid6(mddev->new_layout))) {
		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
		       mdname(mddev), mddev->new_layout);
		return ERR_PTR(-EIO);
	}
	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
		       mdname(mddev), mddev->raid_disks);
		return ERR_PTR(-EINVAL);
	}

	/* The chunk must be non-empty, a whole number of pages, and a
	 * power of two sectors.
	 */
	if (!mddev->new_chunk_sectors ||
	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
	    !is_power_of_2(mddev->new_chunk_sectors)) {
		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
		       mdname(mddev), mddev->new_chunk_sectors << 9);
		return ERR_PTR(-EINVAL);
	}

	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
	if (conf == NULL)
		goto abort;
	spin_lock_init(&conf->device_lock);
	init_waitqueue_head(&conf->wait_for_stripe);
	init_waitqueue_head(&conf->wait_for_overlap);
	INIT_LIST_HEAD(&conf->handle_list);
	INIT_LIST_HEAD(&conf->hold_list);
	INIT_LIST_HEAD(&conf->delayed_list);
	INIT_LIST_HEAD(&conf->bitmap_list);
	INIT_LIST_HEAD(&conf->inactive_list);
	atomic_set(&conf->active_stripes, 0);
	atomic_set(&conf->preread_active_stripes, 0);
	atomic_set(&conf->active_aligned_reads, 0);
	conf->bypass_threshold = BYPASS_THRESHOLD;
	conf->recovery_disabled = mddev->recovery_disabled - 1;

	/* While a reshape is recorded the previous device count differs from
	 * the current one by delta_disks; size everything for the larger.
	 */
	conf->raid_disks = mddev->raid_disks;
	if (mddev->reshape_position == MaxSector)
		conf->previous_raid_disks = mddev->raid_disks;
	else
		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
	conf->scribble_len = scribble_len(max_disks);

	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
			      GFP_KERNEL);
	if (!conf->disks)
		goto abort;

	conf->mddev = mddev;

	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
		goto abort;

	/* raid5_alloc_percpu() reads conf->level and conf->scribble_len, so
	 * both must be set before it is called.
	 */
	conf->level = mddev->new_level;
	if (raid5_alloc_percpu(conf) != 0)
		goto abort;

	pr_debug("raid456: run(%s) called.\n", mdname(mddev));

	rdev_for_each(rdev, mddev) {
		raid_disk = rdev->raid_disk;
		/* Devices without a slot in range are not attached. */
		if (raid_disk >= max_disks
		    || raid_disk < 0)
			continue;
		disk = conf->disks + raid_disk;

		/* Each slot holds at most one device and one replacement;
		 * a second claimant aborts the setup.
		 */
		if (test_bit(Replacement, &rdev->flags)) {
			if (disk->replacement)
				goto abort;
			disk->replacement = rdev;
		} else {
			if (disk->rdev)
				goto abort;
			disk->rdev = rdev;
		}

		if (test_bit(In_sync, &rdev->flags)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
			       " disk %d\n",
			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
		} else if (rdev->saved_raid_disk != raid_disk)
			/* Cannot rely on bitmap to complete recovery */
			conf->fullsync = 1;
	}

	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->level = mddev->new_level;
	if (conf->level == 6)
		conf->max_degraded = 2;
	else
		conf->max_degraded = 1;
	conf->algorithm = mddev->new_layout;
	conf->max_nr_stripes = NR_STRIPES;
	conf->reshape_progress = mddev->reshape_position;
	/* The previous chunk size and layout are only recorded while a
	 * reshape is in progress.
	 */
	if (conf->reshape_progress != MaxSector) {
		conf->prev_chunk_sectors = mddev->chunk_sectors;
		conf->prev_algo = mddev->layout;
	}

	/* 'memory' is only an estimate in kB for the messages below. */
	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
	if (grow_stripes(conf, conf->max_nr_stripes)) {
		printk(KERN_ERR
		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
		       mdname(mddev), memory);
		goto abort;
	} else
		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
		       mdname(mddev), memory);

	sprintf(pers_name, "raid%d", mddev->new_level);
	conf->thread = md_register_thread(raid5d, mddev, pers_name);
	if (!conf->thread) {
		printk(KERN_ERR
		       "md/raid:%s: couldn't allocate thread.\n",
		       mdname(mddev));
		goto abort;
	}

	return conf;

 abort:
	/* conf is NULL only when its own allocation failed; every other
	 * failure is reported as -EIO after releasing what was set up.
	 */
	if (conf) {
		free_conf(conf);
		return ERR_PTR(-EIO);
	} else
		return ERR_PTR(-ENOMEM);
}
case ALGORITHM_RIGHT_SYMMETRIC_6: 5232 if (raid_disk == raid_disks - 1) 5233 return 1; 5234 } 5235 return 0; 5236 } 5237 5238 static int run(struct mddev *mddev) 5239 { 5240 struct r5conf *conf; 5241 int working_disks = 0; 5242 int dirty_parity_disks = 0; 5243 struct md_rdev *rdev; 5244 sector_t reshape_offset = 0; 5245 int i; 5246 long long min_offset_diff = 0; 5247 int first = 1; 5248 5249 if (mddev->recovery_cp != MaxSector) 5250 printk(KERN_NOTICE "md/raid:%s: not clean" 5251 " -- starting background reconstruction\n", 5252 mdname(mddev)); 5253 5254 rdev_for_each(rdev, mddev) { 5255 long long diff; 5256 if (rdev->raid_disk < 0) 5257 continue; 5258 diff = (rdev->new_data_offset - rdev->data_offset); 5259 if (first) { 5260 min_offset_diff = diff; 5261 first = 0; 5262 } else if (mddev->reshape_backwards && 5263 diff < min_offset_diff) 5264 min_offset_diff = diff; 5265 else if (!mddev->reshape_backwards && 5266 diff > min_offset_diff) 5267 min_offset_diff = diff; 5268 } 5269 5270 if (mddev->reshape_position != MaxSector) { 5271 /* Check that we can continue the reshape. 5272 * Difficulties arise if the stripe we would write to 5273 * next is at or after the stripe we would read from next. 5274 * For a reshape that changes the number of devices, this 5275 * is only possible for a very short time, and mdadm makes 5276 * sure that time appears to have past before assembling 5277 * the array. So we fail if that time hasn't passed. 5278 * For a reshape that keeps the number of devices the same 5279 * mdadm must be monitoring the reshape can keeping the 5280 * critical areas read-only and backed up. It will start 5281 * the array in read-only mode, so we check for that. 5282 */ 5283 sector_t here_new, here_old; 5284 int old_disks; 5285 int max_degraded = (mddev->level == 6 ? 
2 : 1); 5286 5287 if (mddev->new_level != mddev->level) { 5288 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5289 "required - aborting.\n", 5290 mdname(mddev)); 5291 return -EINVAL; 5292 } 5293 old_disks = mddev->raid_disks - mddev->delta_disks; 5294 /* reshape_position must be on a new-stripe boundary, and one 5295 * further up in new geometry must map after here in old 5296 * geometry. 5297 */ 5298 here_new = mddev->reshape_position; 5299 if (sector_div(here_new, mddev->new_chunk_sectors * 5300 (mddev->raid_disks - max_degraded))) { 5301 printk(KERN_ERR "md/raid:%s: reshape_position not " 5302 "on a stripe boundary\n", mdname(mddev)); 5303 return -EINVAL; 5304 } 5305 reshape_offset = here_new * mddev->new_chunk_sectors; 5306 /* here_new is the stripe we will write to */ 5307 here_old = mddev->reshape_position; 5308 sector_div(here_old, mddev->chunk_sectors * 5309 (old_disks-max_degraded)); 5310 /* here_old is the first stripe that we might need to read 5311 * from */ 5312 if (mddev->delta_disks == 0) { 5313 if ((here_new * mddev->new_chunk_sectors != 5314 here_old * mddev->chunk_sectors)) { 5315 printk(KERN_ERR "md/raid:%s: reshape position is" 5316 " confused - aborting\n", mdname(mddev)); 5317 return -EINVAL; 5318 } 5319 /* We cannot be sure it is safe to start an in-place 5320 * reshape. It is only safe if user-space is monitoring 5321 * and taking constant backups. 5322 * mdadm always starts a situation like this in 5323 * readonly mode so it can take control before 5324 * allowing any writes. So just check for that. 5325 */ 5326 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5327 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5328 /* not really in-place - so OK */; 5329 else if (mddev->ro == 0) { 5330 printk(KERN_ERR "md/raid:%s: in-place reshape " 5331 "must be started in read-only mode " 5332 "- aborting\n", 5333 mdname(mddev)); 5334 return -EINVAL; 5335 } 5336 } else if (mddev->reshape_backwards 5337 ? 
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5338 here_old * mddev->chunk_sectors) 5339 : (here_new * mddev->new_chunk_sectors >= 5340 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5341 /* Reading from the same stripe as writing to - bad */ 5342 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5343 "auto-recovery - aborting.\n", 5344 mdname(mddev)); 5345 return -EINVAL; 5346 } 5347 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5348 mdname(mddev)); 5349 /* OK, we should be able to continue; */ 5350 } else { 5351 BUG_ON(mddev->level != mddev->new_level); 5352 BUG_ON(mddev->layout != mddev->new_layout); 5353 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5354 BUG_ON(mddev->delta_disks != 0); 5355 } 5356 5357 if (mddev->private == NULL) 5358 conf = setup_conf(mddev); 5359 else 5360 conf = mddev->private; 5361 5362 if (IS_ERR(conf)) 5363 return PTR_ERR(conf); 5364 5365 conf->min_offset_diff = min_offset_diff; 5366 mddev->thread = conf->thread; 5367 conf->thread = NULL; 5368 mddev->private = conf; 5369 5370 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5371 i++) { 5372 rdev = conf->disks[i].rdev; 5373 if (!rdev && conf->disks[i].replacement) { 5374 /* The replacement is all we have yet */ 5375 rdev = conf->disks[i].replacement; 5376 conf->disks[i].replacement = NULL; 5377 clear_bit(Replacement, &rdev->flags); 5378 conf->disks[i].rdev = rdev; 5379 } 5380 if (!rdev) 5381 continue; 5382 if (conf->disks[i].replacement && 5383 conf->reshape_progress != MaxSector) { 5384 /* replacements and reshape simply do not mix. */ 5385 printk(KERN_ERR "md: cannot handle concurrent " 5386 "replacement and reshape.\n"); 5387 goto abort; 5388 } 5389 if (test_bit(In_sync, &rdev->flags)) { 5390 working_disks++; 5391 continue; 5392 } 5393 /* This disc is not fully in-sync. However if it 5394 * just stored parity (beyond the recovery_offset), 5395 * when we don't need to be concerned about the 5396 * array being dirty. 
5397 * When reshape goes 'backwards', we never have 5398 * partially completed devices, so we only need 5399 * to worry about reshape going forwards. 5400 */ 5401 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5402 if (mddev->major_version == 0 && 5403 mddev->minor_version > 90) 5404 rdev->recovery_offset = reshape_offset; 5405 5406 if (rdev->recovery_offset < reshape_offset) { 5407 /* We need to check old and new layout */ 5408 if (!only_parity(rdev->raid_disk, 5409 conf->algorithm, 5410 conf->raid_disks, 5411 conf->max_degraded)) 5412 continue; 5413 } 5414 if (!only_parity(rdev->raid_disk, 5415 conf->prev_algo, 5416 conf->previous_raid_disks, 5417 conf->max_degraded)) 5418 continue; 5419 dirty_parity_disks++; 5420 } 5421 5422 /* 5423 * 0 for a fully functional array, 1 or 2 for a degraded array. 5424 */ 5425 mddev->degraded = calc_degraded(conf); 5426 5427 if (has_failed(conf)) { 5428 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5429 " (%d/%d failed)\n", 5430 mdname(mddev), mddev->degraded, conf->raid_disks); 5431 goto abort; 5432 } 5433 5434 /* device size must be a multiple of chunk size */ 5435 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5436 mddev->resync_max_sectors = mddev->dev_sectors; 5437 5438 if (mddev->degraded > dirty_parity_disks && 5439 mddev->recovery_cp != MaxSector) { 5440 if (mddev->ok_start_degraded) 5441 printk(KERN_WARNING 5442 "md/raid:%s: starting dirty degraded array" 5443 " - data corruption possible.\n", 5444 mdname(mddev)); 5445 else { 5446 printk(KERN_ERR 5447 "md/raid:%s: cannot start dirty degraded array.\n", 5448 mdname(mddev)); 5449 goto abort; 5450 } 5451 } 5452 5453 if (mddev->degraded == 0) 5454 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5455 " devices, algorithm %d\n", mdname(mddev), conf->level, 5456 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5457 mddev->new_layout); 5458 else 5459 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5460 " 
out of %d devices, algorithm %d\n", 5461 mdname(mddev), conf->level, 5462 mddev->raid_disks - mddev->degraded, 5463 mddev->raid_disks, mddev->new_layout); 5464 5465 print_raid5_conf(conf); 5466 5467 if (conf->reshape_progress != MaxSector) { 5468 conf->reshape_safe = conf->reshape_progress; 5469 atomic_set(&conf->reshape_stripes, 0); 5470 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5471 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5472 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5473 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5474 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5475 "reshape"); 5476 } 5477 5478 5479 /* Ok, everything is just fine now */ 5480 if (mddev->to_remove == &raid5_attrs_group) 5481 mddev->to_remove = NULL; 5482 else if (mddev->kobj.sd && 5483 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5484 printk(KERN_WARNING 5485 "raid5: failed to create sysfs attributes for %s\n", 5486 mdname(mddev)); 5487 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5488 5489 if (mddev->queue) { 5490 int chunk_size; 5491 bool discard_supported = true; 5492 /* read-ahead size must cover two whole stripes, which 5493 * is 2 * (datadisks) * chunksize where 'n' is the 5494 * number of raid devices 5495 */ 5496 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5497 int stripe = data_disks * 5498 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5499 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5500 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5501 5502 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5503 5504 mddev->queue->backing_dev_info.congested_data = mddev; 5505 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5506 5507 chunk_size = mddev->chunk_sectors << 9; 5508 blk_queue_io_min(mddev->queue, chunk_size); 5509 blk_queue_io_opt(mddev->queue, chunk_size * 5510 (conf->raid_disks - conf->max_degraded)); 5511 /* 5512 * We can only discard a whole stripe. 
It doesn't make sense to 5513 * discard data disk but write parity disk 5514 */ 5515 stripe = stripe * PAGE_SIZE; 5516 /* Round up to power of 2, as discard handling 5517 * currently assumes that */ 5518 while ((stripe-1) & stripe) 5519 stripe = (stripe | (stripe-1)) + 1; 5520 mddev->queue->limits.discard_alignment = stripe; 5521 mddev->queue->limits.discard_granularity = stripe; 5522 /* 5523 * unaligned part of discard request will be ignored, so can't 5524 * guarantee discard_zerors_data 5525 */ 5526 mddev->queue->limits.discard_zeroes_data = 0; 5527 5528 rdev_for_each(rdev, mddev) { 5529 disk_stack_limits(mddev->gendisk, rdev->bdev, 5530 rdev->data_offset << 9); 5531 disk_stack_limits(mddev->gendisk, rdev->bdev, 5532 rdev->new_data_offset << 9); 5533 /* 5534 * discard_zeroes_data is required, otherwise data 5535 * could be lost. Consider a scenario: discard a stripe 5536 * (the stripe could be inconsistent if 5537 * discard_zeroes_data is 0); write one disk of the 5538 * stripe (the stripe could be inconsistent again 5539 * depending on which disks are used to calculate 5540 * parity); the disk is broken; The stripe data of this 5541 * disk is lost. 
5542 */ 5543 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5544 !bdev_get_queue(rdev->bdev)-> 5545 limits.discard_zeroes_data) 5546 discard_supported = false; 5547 } 5548 5549 if (discard_supported && 5550 mddev->queue->limits.max_discard_sectors >= stripe && 5551 mddev->queue->limits.discard_granularity >= stripe) 5552 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5553 mddev->queue); 5554 else 5555 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5556 mddev->queue); 5557 } 5558 5559 return 0; 5560 abort: 5561 md_unregister_thread(&mddev->thread); 5562 print_raid5_conf(conf); 5563 free_conf(conf); 5564 mddev->private = NULL; 5565 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5566 return -EIO; 5567 } 5568 5569 static int stop(struct mddev *mddev) 5570 { 5571 struct r5conf *conf = mddev->private; 5572 5573 md_unregister_thread(&mddev->thread); 5574 if (mddev->queue) 5575 mddev->queue->backing_dev_info.congested_fn = NULL; 5576 free_conf(conf); 5577 mddev->private = NULL; 5578 mddev->to_remove = &raid5_attrs_group; 5579 return 0; 5580 } 5581 5582 static void status(struct seq_file *seq, struct mddev *mddev) 5583 { 5584 struct r5conf *conf = mddev->private; 5585 int i; 5586 5587 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5588 mddev->chunk_sectors / 2, mddev->layout); 5589 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5590 for (i = 0; i < conf->raid_disks; i++) 5591 seq_printf (seq, "%s", 5592 conf->disks[i].rdev && 5593 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5594 seq_printf (seq, "]"); 5595 } 5596 5597 static void print_raid5_conf (struct r5conf *conf) 5598 { 5599 int i; 5600 struct disk_info *tmp; 5601 5602 printk(KERN_DEBUG "RAID conf printout:\n"); 5603 if (!conf) { 5604 printk("(conf==NULL)\n"); 5605 return; 5606 } 5607 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5608 conf->raid_disks, 5609 conf->raid_disks - conf->mddev->degraded); 5610 5611 for (i = 0; i < conf->raid_disks; i++) { 5612 char b[BDEVNAME_SIZE]; 5613 tmp = conf->disks + i; 5614 if (tmp->rdev) 5615 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5616 i, !test_bit(Faulty, &tmp->rdev->flags), 5617 bdevname(tmp->rdev->bdev, b)); 5618 } 5619 } 5620 5621 static int raid5_spare_active(struct mddev *mddev) 5622 { 5623 int i; 5624 struct r5conf *conf = mddev->private; 5625 struct disk_info *tmp; 5626 int count = 0; 5627 unsigned long flags; 5628 5629 for (i = 0; i < conf->raid_disks; i++) { 5630 tmp = conf->disks + i; 5631 if (tmp->replacement 5632 && tmp->replacement->recovery_offset == MaxSector 5633 && !test_bit(Faulty, &tmp->replacement->flags) 5634 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5635 /* Replacement has just become active. */ 5636 if (!tmp->rdev 5637 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5638 count++; 5639 if (tmp->rdev) { 5640 /* Replaced device not technically faulty, 5641 * but we need to be sure it gets removed 5642 * and never re-added. 
5643 */ 5644 set_bit(Faulty, &tmp->rdev->flags); 5645 sysfs_notify_dirent_safe( 5646 tmp->rdev->sysfs_state); 5647 } 5648 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5649 } else if (tmp->rdev 5650 && tmp->rdev->recovery_offset == MaxSector 5651 && !test_bit(Faulty, &tmp->rdev->flags) 5652 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5653 count++; 5654 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5655 } 5656 } 5657 spin_lock_irqsave(&conf->device_lock, flags); 5658 mddev->degraded = calc_degraded(conf); 5659 spin_unlock_irqrestore(&conf->device_lock, flags); 5660 print_raid5_conf(conf); 5661 return count; 5662 } 5663 5664 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5665 { 5666 struct r5conf *conf = mddev->private; 5667 int err = 0; 5668 int number = rdev->raid_disk; 5669 struct md_rdev **rdevp; 5670 struct disk_info *p = conf->disks + number; 5671 5672 print_raid5_conf(conf); 5673 if (rdev == p->rdev) 5674 rdevp = &p->rdev; 5675 else if (rdev == p->replacement) 5676 rdevp = &p->replacement; 5677 else 5678 return 0; 5679 5680 if (number >= conf->raid_disks && 5681 conf->reshape_progress == MaxSector) 5682 clear_bit(In_sync, &rdev->flags); 5683 5684 if (test_bit(In_sync, &rdev->flags) || 5685 atomic_read(&rdev->nr_pending)) { 5686 err = -EBUSY; 5687 goto abort; 5688 } 5689 /* Only remove non-faulty devices if recovery 5690 * isn't possible. 
5691 */ 5692 if (!test_bit(Faulty, &rdev->flags) && 5693 mddev->recovery_disabled != conf->recovery_disabled && 5694 !has_failed(conf) && 5695 (!p->replacement || p->replacement == rdev) && 5696 number < conf->raid_disks) { 5697 err = -EBUSY; 5698 goto abort; 5699 } 5700 *rdevp = NULL; 5701 synchronize_rcu(); 5702 if (atomic_read(&rdev->nr_pending)) { 5703 /* lost the race, try later */ 5704 err = -EBUSY; 5705 *rdevp = rdev; 5706 } else if (p->replacement) { 5707 /* We must have just cleared 'rdev' */ 5708 p->rdev = p->replacement; 5709 clear_bit(Replacement, &p->replacement->flags); 5710 smp_mb(); /* Make sure other CPUs may see both as identical 5711 * but will never see neither - if they are careful 5712 */ 5713 p->replacement = NULL; 5714 clear_bit(WantReplacement, &rdev->flags); 5715 } else 5716 /* We might have just removed the Replacement as faulty- 5717 * clear the bit just in case 5718 */ 5719 clear_bit(WantReplacement, &rdev->flags); 5720 abort: 5721 5722 print_raid5_conf(conf); 5723 return err; 5724 } 5725 5726 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5727 { 5728 struct r5conf *conf = mddev->private; 5729 int err = -EEXIST; 5730 int disk; 5731 struct disk_info *p; 5732 int first = 0; 5733 int last = conf->raid_disks - 1; 5734 5735 if (mddev->recovery_disabled == conf->recovery_disabled) 5736 return -EBUSY; 5737 5738 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 5739 /* no point adding a device */ 5740 return -EINVAL; 5741 5742 if (rdev->raid_disk >= 0) 5743 first = last = rdev->raid_disk; 5744 5745 /* 5746 * find the disk ... but prefer rdev->saved_raid_disk 5747 * if possible. 
5748 */ 5749 if (rdev->saved_raid_disk >= 0 && 5750 rdev->saved_raid_disk >= first && 5751 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5752 first = rdev->saved_raid_disk; 5753 5754 for (disk = first; disk <= last; disk++) { 5755 p = conf->disks + disk; 5756 if (p->rdev == NULL) { 5757 clear_bit(In_sync, &rdev->flags); 5758 rdev->raid_disk = disk; 5759 err = 0; 5760 if (rdev->saved_raid_disk != disk) 5761 conf->fullsync = 1; 5762 rcu_assign_pointer(p->rdev, rdev); 5763 goto out; 5764 } 5765 } 5766 for (disk = first; disk <= last; disk++) { 5767 p = conf->disks + disk; 5768 if (test_bit(WantReplacement, &p->rdev->flags) && 5769 p->replacement == NULL) { 5770 clear_bit(In_sync, &rdev->flags); 5771 set_bit(Replacement, &rdev->flags); 5772 rdev->raid_disk = disk; 5773 err = 0; 5774 conf->fullsync = 1; 5775 rcu_assign_pointer(p->replacement, rdev); 5776 break; 5777 } 5778 } 5779 out: 5780 print_raid5_conf(conf); 5781 return err; 5782 } 5783 5784 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5785 { 5786 /* no resync is happening, and there is enough space 5787 * on all devices, so we can resize. 5788 * We need to make sure resync covers any new space. 5789 * If the array is shrinking we should possibly wait until 5790 * any io in the removed space completes, but it hardly seems 5791 * worth it. 
5792 */ 5793 sector_t newsize; 5794 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5795 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5796 if (mddev->external_size && 5797 mddev->array_sectors > newsize) 5798 return -EINVAL; 5799 if (mddev->bitmap) { 5800 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5801 if (ret) 5802 return ret; 5803 } 5804 md_set_array_sectors(mddev, newsize); 5805 set_capacity(mddev->gendisk, mddev->array_sectors); 5806 revalidate_disk(mddev->gendisk); 5807 if (sectors > mddev->dev_sectors && 5808 mddev->recovery_cp > mddev->dev_sectors) { 5809 mddev->recovery_cp = mddev->dev_sectors; 5810 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5811 } 5812 mddev->dev_sectors = sectors; 5813 mddev->resync_max_sectors = sectors; 5814 return 0; 5815 } 5816 5817 static int check_stripe_cache(struct mddev *mddev) 5818 { 5819 /* Can only proceed if there are plenty of stripe_heads. 5820 * We need a minimum of one full stripe,, and for sensible progress 5821 * it is best to have about 4 times that. 5822 * If we require 4 times, then the default 256 4K stripe_heads will 5823 * allow for chunk sizes up to 256K, which is probably OK. 5824 * If the chunk size is greater, user-space should request more 5825 * stripe_heads first. 5826 */ 5827 struct r5conf *conf = mddev->private; 5828 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5829 > conf->max_nr_stripes || 5830 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5831 > conf->max_nr_stripes) { 5832 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. 
Needed %lu\n", 5833 mdname(mddev), 5834 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5835 / STRIPE_SIZE)*4); 5836 return 0; 5837 } 5838 return 1; 5839 } 5840 5841 static int check_reshape(struct mddev *mddev) 5842 { 5843 struct r5conf *conf = mddev->private; 5844 5845 if (mddev->delta_disks == 0 && 5846 mddev->new_layout == mddev->layout && 5847 mddev->new_chunk_sectors == mddev->chunk_sectors) 5848 return 0; /* nothing to do */ 5849 if (has_failed(conf)) 5850 return -EINVAL; 5851 if (mddev->delta_disks < 0) { 5852 /* We might be able to shrink, but the devices must 5853 * be made bigger first. 5854 * For raid6, 4 is the minimum size. 5855 * Otherwise 2 is the minimum 5856 */ 5857 int min = 2; 5858 if (mddev->level == 6) 5859 min = 4; 5860 if (mddev->raid_disks + mddev->delta_disks < min) 5861 return -EINVAL; 5862 } 5863 5864 if (!check_stripe_cache(mddev)) 5865 return -ENOSPC; 5866 5867 return resize_stripes(conf, (conf->previous_raid_disks 5868 + mddev->delta_disks)); 5869 } 5870 5871 static int raid5_start_reshape(struct mddev *mddev) 5872 { 5873 struct r5conf *conf = mddev->private; 5874 struct md_rdev *rdev; 5875 int spares = 0; 5876 unsigned long flags; 5877 5878 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5879 return -EBUSY; 5880 5881 if (!check_stripe_cache(mddev)) 5882 return -ENOSPC; 5883 5884 if (has_failed(conf)) 5885 return -EINVAL; 5886 5887 rdev_for_each(rdev, mddev) { 5888 if (!test_bit(In_sync, &rdev->flags) 5889 && !test_bit(Faulty, &rdev->flags)) 5890 spares++; 5891 } 5892 5893 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5894 /* Not enough devices even to make a degraded array 5895 * of that size 5896 */ 5897 return -EINVAL; 5898 5899 /* Refuse to reduce size of the array. Any reductions in 5900 * array size must be through explicit setting of array_size 5901 * attribute. 
5902 */ 5903 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5904 < mddev->array_sectors) { 5905 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5906 "before number of disks\n", mdname(mddev)); 5907 return -EINVAL; 5908 } 5909 5910 atomic_set(&conf->reshape_stripes, 0); 5911 spin_lock_irq(&conf->device_lock); 5912 conf->previous_raid_disks = conf->raid_disks; 5913 conf->raid_disks += mddev->delta_disks; 5914 conf->prev_chunk_sectors = conf->chunk_sectors; 5915 conf->chunk_sectors = mddev->new_chunk_sectors; 5916 conf->prev_algo = conf->algorithm; 5917 conf->algorithm = mddev->new_layout; 5918 conf->generation++; 5919 /* Code that selects data_offset needs to see the generation update 5920 * if reshape_progress has been set - so a memory barrier needed. 5921 */ 5922 smp_mb(); 5923 if (mddev->reshape_backwards) 5924 conf->reshape_progress = raid5_size(mddev, 0, 0); 5925 else 5926 conf->reshape_progress = 0; 5927 conf->reshape_safe = conf->reshape_progress; 5928 spin_unlock_irq(&conf->device_lock); 5929 5930 /* Add some new drives, as many as will fit. 5931 * We know there are enough to make the newly sized array work. 5932 * Don't add devices if we are reducing the number of 5933 * devices in the array. This is because it is not possible 5934 * to correctly record the "partially reconstructed" state of 5935 * such devices during the reshape and confusion could result. 
5936 */ 5937 if (mddev->delta_disks >= 0) { 5938 rdev_for_each(rdev, mddev) 5939 if (rdev->raid_disk < 0 && 5940 !test_bit(Faulty, &rdev->flags)) { 5941 if (raid5_add_disk(mddev, rdev) == 0) { 5942 if (rdev->raid_disk 5943 >= conf->previous_raid_disks) 5944 set_bit(In_sync, &rdev->flags); 5945 else 5946 rdev->recovery_offset = 0; 5947 5948 if (sysfs_link_rdev(mddev, rdev)) 5949 /* Failure here is OK */; 5950 } 5951 } else if (rdev->raid_disk >= conf->previous_raid_disks 5952 && !test_bit(Faulty, &rdev->flags)) { 5953 /* This is a spare that was manually added */ 5954 set_bit(In_sync, &rdev->flags); 5955 } 5956 5957 /* When a reshape changes the number of devices, 5958 * ->degraded is measured against the larger of the 5959 * pre and post number of devices. 5960 */ 5961 spin_lock_irqsave(&conf->device_lock, flags); 5962 mddev->degraded = calc_degraded(conf); 5963 spin_unlock_irqrestore(&conf->device_lock, flags); 5964 } 5965 mddev->raid_disks = conf->raid_disks; 5966 mddev->reshape_position = conf->reshape_progress; 5967 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5968 5969 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5970 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5971 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5972 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5973 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5974 "reshape"); 5975 if (!mddev->sync_thread) { 5976 mddev->recovery = 0; 5977 spin_lock_irq(&conf->device_lock); 5978 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5979 rdev_for_each(rdev, mddev) 5980 rdev->new_data_offset = rdev->data_offset; 5981 smp_wmb(); 5982 conf->reshape_progress = MaxSector; 5983 mddev->reshape_position = MaxSector; 5984 spin_unlock_irq(&conf->device_lock); 5985 return -EAGAIN; 5986 } 5987 conf->reshape_checkpoint = jiffies; 5988 md_wakeup_thread(mddev->sync_thread); 5989 md_new_event(mddev); 5990 return 0; 5991 } 5992 5993 /* This is called from the reshape thread and should make any 5994 
* changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{
	struct mddev *mddev = conf->mddev;
	struct md_rdev *member;
	int stripe_pages;

	/* An interrupted reshape leaves 'conf' as it is. */
	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	/* Commit the new geometry: the old device count and the old data
	 * offsets are replaced before reshape_progress is cleared.
	 */
	spin_lock_irq(&conf->device_lock);
	conf->previous_raid_disks = conf->raid_disks;
	rdev_for_each(member, mddev)
		member->data_offset = member->new_data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;
	spin_unlock_irq(&conf->device_lock);
	wake_up(&conf->wait_for_overlap);

	if (!mddev->queue)
		return;

	/* read-ahead size must cover two whole stripes, which is
	 * 2 * (datadisks) * chunksize, counted in pages
	 */
	stripe_pages = (conf->raid_disks - conf->max_degraded) *
		((conf->chunk_sectors << 9) / PAGE_SIZE);
	if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe_pages)
		mddev->queue->backing_dev_info.ra_pages = 2 * stripe_pages;
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
*/
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	/* Nothing is committed when the reshape was interrupted. */
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			/* Array grew: publish the new size. */
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			/* Devices in slots beyond the new device count are
			 * no longer in-sync members.
			 */
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		/* Copy the new geometry into mddev and clear the reshape
		 * bookkeeping.
		 */
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

/*
 * raid5_quiesce() - pause or resume I/O on the array.
 *
 * @state: 1 waits until there are no active stripes or aligned reads and
 *         leaves conf->quiesce set to 1; 0 clears conf->quiesce and wakes
 *         waiters; 2 only wakes wait_for_overlap.  Other values are ignored.
 */
static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


/*
 * raid45_takeover_raid0() - prepare @mddev, currently a single-zone raid0,
 * to become a raid4/raid5 array of the given @level with one extra device
 * and the PARITY_N layout.
 *
 * Returns the result of setup_conf(), or ERR_PTR(-EINVAL) when the raid0
 * array has more than one strip zone.
 */
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	/* Per-device size is the zone end divided by its device count. */
	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will be not marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}


/*
 * raid5_takeover_raid1() - prepare a two-device raid1 @mddev to become
 * raid5 (left-symmetric), choosing the largest chunk size up to 64K that
 * divides the array size.
 *
 * Returns the result of setup_conf(), or ERR_PTR(-EINVAL) when the array
 * does not have exactly two devices, has more than one missing, or no chunk
 * size of at least STRIPE_SIZE fits.
 */
static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

/*
 * raid5_takeover_raid6() - prepare a raid6 @mddev to become raid5 with one
 * device fewer, mapping each supported raid6 layout to its raid5
 * counterpart.
 *
 * Returns the result of setup_conf(), or ERR_PTR(-EINVAL) for any other
 * layout.
 */
static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}


/*
 * raid5_check_reshape() - validate mddev->new_layout and
 * mddev->new_chunk_sectors (used as the check_reshape method of the raid4
 * and raid5 personalities below).
 *
 * Returns -EINVAL for an invalid layout or for a chunk size that is not a
 * power of two, is smaller than a page, or does not divide the array size;
 * otherwise the result of check_reshape().
 */
static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk ;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

/*
 * raid6_check_reshape() - raid6 counterpart of raid5_check_reshape(): the
 * same layout and chunk-size validation (against algorithm_valid_raid6()),
 * without the immediate two-device shortcut.
 */
static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

/*
 * raid5_takeover() - takeover method of the raid5 personality; dispatches
 * on the current mddev->level.  Returns a conf from setup_conf() (directly
 * or via a helper) or ERR_PTR(-EINVAL) for an unsupported level.
 */
static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

/*
 * raid4_takeover() - takeover method of the raid4 personality.  Returns a
 * conf from setup_conf() (directly or via a helper) or ERR_PTR(-EINVAL).
 */
static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

/* Forward declaration: raid6_takeover() compares against this table. */
static struct md_personality raid5_personality;

/*
 * raid6_takeover() - takeover method of the raid6 personality.  Accepts
 * only a raid5 array with at most one missing device and between 3 and 253
 * devices, and adds one device.
 */
static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}


/* Method table for level 6.  It shares every method with the raid5 table
 * except check_reshape and takeover.
 */
static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};
/* Method table for level 5. */
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

/* Method table for level 4.  It differs from the raid5 table only in name,
 * level and takeover.
 */
static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

/* Module entry point: register the three personalities.  Always returns 0. */
static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

/* Module exit point: unregister the three personalities. */
static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* RAID5 and RAID6 used to be two separate modules; their old module names are kept as aliases: */ 6431 MODULE_ALIAS("raid5"); 6432 MODULE_ALIAS("raid6"); 6433