1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <trace/events/block.h> 57 58 #include "md.h" 59 #include "raid5.h" 60 #include "raid0.h" 61 #include "bitmap.h" 62 63 /* 64 * Stripe cache 65 */ 66 67 #define NR_STRIPES 256 68 #define STRIPE_SIZE PAGE_SIZE 69 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 70 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 71 #define IO_THRESHOLD 1 72 #define BYPASS_THRESHOLD 1 73 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 74 #define HASH_MASK (NR_HASH - 1) 75 76 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 77 { 78 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 79 return &conf->stripe_hashtbl[hash]; 80 } 81 82 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 83 * order without overlap. There may be several bio's per stripe+device, and 84 * a bio could span several devices. 85 * When walking this list for a particular stripe+device, we must never proceed 86 * beyond a bio that extends past this device, as the next bio might no longer 87 * be valid. 88 * This function is used to determine the 'next' bio in the list, given the sector 89 * of the current stripe+device 90 */ 91 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 92 { 93 int sectors = bio_sectors(bio); 94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 95 return bio->bi_next; 96 else 97 return NULL; 98 } 99 100 /* 101 * We maintain a biased count of active stripes in the bottom 16 bits of 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 103 */ 104 static inline int raid5_bi_processed_stripes(struct bio *bio) 105 { 106 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 107 return (atomic_read(segments) >> 16) & 0xffff; 108 } 109 110 static inline int raid5_dec_bi_active_stripes(struct bio *bio) 111 { 112 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 113 return atomic_sub_return(1, segments) & 0xffff; 114 } 115 116 static inline void raid5_inc_bi_active_stripes(struct bio *bio) 117 { 118 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 119 atomic_inc(segments); 120 } 121 122 static inline void raid5_set_bi_processed_stripes(struct bio *bio, 123 unsigned int cnt) 124 { 125 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 126 int old, new; 127 128 do { 129 old = atomic_read(segments); 130 new = (old & 0xffff) | (cnt << 16); 131 } while (atomic_cmpxchg(segments, old, new) != old); 132 } 133 134 static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 135 { 136 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 137 atomic_set(segments, cnt); 138 } 139 140 /* Find first data disk in a raid6 stripe */ 141 static inline int raid6_d0(struct stripe_head *sh) 142 { 143 if (sh->ddf_layout) 144 /* ddf always start from first device */ 145 return 0; 146 /* md starts just after Q block */ 147 if (sh->qd_idx == sh->disks - 1) 148 return 0; 149 else 150 return sh->qd_idx + 1; 151 } 152 static inline int raid6_next_disk(int disk, int raid_disks) 153 { 154 disk++; 155 return (disk < raid_disks) ? disk : 0; 156 } 157 158 /* When walking through the disks in a raid5, starting at raid6_d0, 159 * We need to map each disk to a 'slot', where the data disks are slot 160 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 161 * is raid_disks-1. This help does that mapping. 162 */ 163 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 164 int *count, int syndrome_disks) 165 { 166 int slot = *count; 167 168 if (sh->ddf_layout) 169 (*count)++; 170 if (idx == sh->pd_idx) 171 return syndrome_disks; 172 if (idx == sh->qd_idx) 173 return syndrome_disks + 1; 174 if (!sh->ddf_layout) 175 (*count)++; 176 return slot; 177 } 178 179 static void return_io(struct bio *return_bi) 180 { 181 struct bio *bi = return_bi; 182 while (bi) { 183 184 return_bi = bi->bi_next; 185 bi->bi_next = NULL; 186 bi->bi_size = 0; 187 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 188 bi, 0); 189 bio_endio(bi, 0); 190 bi = return_bi; 191 } 192 } 193 194 static void print_raid5_conf (struct r5conf *conf); 195 196 static int stripe_operations_active(struct stripe_head *sh) 197 { 198 return sh->check_state || sh->reconstruct_state || 199 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 201 } 202 203 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 204 { 205 BUG_ON(!list_empty(&sh->lru)); 206 BUG_ON(atomic_read(&conf->active_stripes)==0); 207 if (test_bit(STRIPE_HANDLE, &sh->state)) { 208 if (test_bit(STRIPE_DELAYED, &sh->state) && 209 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 210 list_add_tail(&sh->lru, &conf->delayed_list); 211 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 212 sh->bm_seq - conf->seq_write > 0) 213 list_add_tail(&sh->lru, &conf->bitmap_list); 214 else { 215 clear_bit(STRIPE_DELAYED, &sh->state); 216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 217 list_add_tail(&sh->lru, &conf->handle_list); 218 } 219 md_wakeup_thread(conf->mddev->thread); 220 } else { 221 BUG_ON(stripe_operations_active(sh)); 222 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 223 if (atomic_dec_return(&conf->preread_active_stripes) 224 < IO_THRESHOLD) 225 md_wakeup_thread(conf->mddev->thread); 226 atomic_dec(&conf->active_stripes); 227 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 228 list_add_tail(&sh->lru, &conf->inactive_list); 229 wake_up(&conf->wait_for_stripe); 230 if (conf->retry_read_aligned) 231 md_wakeup_thread(conf->mddev->thread); 232 } 233 } 234 } 235 236 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 237 { 238 if (atomic_dec_and_test(&sh->count)) 239 do_release_stripe(conf, sh); 240 } 241 242 static void release_stripe(struct stripe_head *sh) 243 { 244 struct r5conf *conf = sh->raid_conf; 245 unsigned long flags; 246 247 local_irq_save(flags); 248 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 249 do_release_stripe(conf, sh); 250 spin_unlock(&conf->device_lock); 251 } 252 local_irq_restore(flags); 253 } 254 255 static inline void remove_hash(struct stripe_head *sh) 256 { 257 pr_debug("remove_hash(), stripe %llu\n", 258 (unsigned long long)sh->sector); 259 260 hlist_del_init(&sh->hash); 261 } 262 263 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 264 { 265 struct hlist_head *hp = stripe_hash(conf, sh->sector); 266 267 pr_debug("insert_hash(), stripe %llu\n", 268 (unsigned long long)sh->sector); 269 270 hlist_add_head(&sh->hash, hp); 271 } 272 273 274 /* find an idle stripe, make sure it is unhashed, and return it. */ 275 static struct stripe_head *get_free_stripe(struct r5conf *conf) 276 { 277 struct stripe_head *sh = NULL; 278 struct list_head *first; 279 280 if (list_empty(&conf->inactive_list)) 281 goto out; 282 first = conf->inactive_list.next; 283 sh = list_entry(first, struct stripe_head, lru); 284 list_del_init(first); 285 remove_hash(sh); 286 atomic_inc(&conf->active_stripes); 287 out: 288 return sh; 289 } 290 291 static void shrink_buffers(struct stripe_head *sh) 292 { 293 struct page *p; 294 int i; 295 int num = sh->raid_conf->pool_size; 296 297 for (i = 0; i < num ; i++) { 298 p = sh->dev[i].page; 299 if (!p) 300 continue; 301 sh->dev[i].page = NULL; 302 put_page(p); 303 } 304 } 305 306 static int grow_buffers(struct stripe_head *sh) 307 { 308 int i; 309 int num = sh->raid_conf->pool_size; 310 311 for (i = 0; i < num; i++) { 312 struct page *page; 313 314 if (!(page = alloc_page(GFP_KERNEL))) { 315 return 1; 316 } 317 sh->dev[i].page = page; 318 } 319 return 0; 320 } 321 322 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 323 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 324 struct stripe_head *sh); 325 326 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 327 { 328 struct r5conf *conf = sh->raid_conf; 329 int i; 330 331 BUG_ON(atomic_read(&sh->count) != 0); 332 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 333 BUG_ON(stripe_operations_active(sh)); 334 335 pr_debug("init_stripe called, stripe %llu\n", 336 (unsigned long long)sh->sector); 337 338 remove_hash(sh); 339 340 sh->generation = conf->generation - previous; 341 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 342 sh->sector = sector; 343 stripe_set_idx(sector, conf, previous, sh); 344 sh->state = 0; 345 346 347 for (i = sh->disks; i--; ) { 348 struct r5dev *dev = &sh->dev[i]; 349 350 if (dev->toread || dev->read || dev->towrite || dev->written || 351 test_bit(R5_LOCKED, &dev->flags)) { 352 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 353 (unsigned long long)sh->sector, i, dev->toread, 354 dev->read, dev->towrite, dev->written, 355 test_bit(R5_LOCKED, &dev->flags)); 356 WARN_ON(1); 357 } 358 dev->flags = 0; 359 raid5_build_block(sh, i, previous); 360 } 361 insert_hash(conf, sh); 362 } 363 364 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 365 short generation) 366 { 367 struct stripe_head *sh; 368 369 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 370 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 371 if (sh->sector == sector && sh->generation == generation) 372 return sh; 373 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 374 return NULL; 375 } 376 377 /* 378 * Need to check if array has failed when deciding whether to: 379 * - start an array 380 * - remove non-faulty devices 381 * - add a spare 382 * - allow a reshape 383 * This determination is simple when no reshape is happening. 384 * However if there is a reshape, we need to carefully check 385 * both the before and after sections. 386 * This is because some failed devices may only affect one 387 * of the two sections, and some non-in_sync devices may 388 * be insync in the section most affected by failed devices. 389 */ 390 static int calc_degraded(struct r5conf *conf) 391 { 392 int degraded, degraded2; 393 int i; 394 395 rcu_read_lock(); 396 degraded = 0; 397 for (i = 0; i < conf->previous_raid_disks; i++) { 398 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 399 if (rdev && test_bit(Faulty, &rdev->flags)) 400 rdev = rcu_dereference(conf->disks[i].replacement); 401 if (!rdev || test_bit(Faulty, &rdev->flags)) 402 degraded++; 403 else if (test_bit(In_sync, &rdev->flags)) 404 ; 405 else 406 /* not in-sync or faulty. 407 * If the reshape increases the number of devices, 408 * this is being recovered by the reshape, so 409 * this 'previous' section is not in_sync. 410 * If the number of devices is being reduced however, 411 * the device can only be part of the array if 412 * we are reverting a reshape, so this section will 413 * be in-sync. 414 */ 415 if (conf->raid_disks >= conf->previous_raid_disks) 416 degraded++; 417 } 418 rcu_read_unlock(); 419 if (conf->raid_disks == conf->previous_raid_disks) 420 return degraded; 421 rcu_read_lock(); 422 degraded2 = 0; 423 for (i = 0; i < conf->raid_disks; i++) { 424 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 425 if (rdev && test_bit(Faulty, &rdev->flags)) 426 rdev = rcu_dereference(conf->disks[i].replacement); 427 if (!rdev || test_bit(Faulty, &rdev->flags)) 428 degraded2++; 429 else if (test_bit(In_sync, &rdev->flags)) 430 ; 431 else 432 /* not in-sync or faulty. 433 * If reshape increases the number of devices, this 434 * section has already been recovered, else it 435 * almost certainly hasn't. 436 */ 437 if (conf->raid_disks <= conf->previous_raid_disks) 438 degraded2++; 439 } 440 rcu_read_unlock(); 441 if (degraded2 > degraded) 442 return degraded2; 443 return degraded; 444 } 445 446 static int has_failed(struct r5conf *conf) 447 { 448 int degraded; 449 450 if (conf->mddev->reshape_position == MaxSector) 451 return conf->mddev->degraded > conf->max_degraded; 452 453 degraded = calc_degraded(conf); 454 if (degraded > conf->max_degraded) 455 return 1; 456 return 0; 457 } 458 459 static struct stripe_head * 460 get_active_stripe(struct r5conf *conf, sector_t sector, 461 int previous, int noblock, int noquiesce) 462 { 463 struct stripe_head *sh; 464 465 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 466 467 spin_lock_irq(&conf->device_lock); 468 469 do { 470 wait_event_lock_irq(conf->wait_for_stripe, 471 conf->quiesce == 0 || noquiesce, 472 conf->device_lock); 473 sh = __find_stripe(conf, sector, conf->generation - previous); 474 if (!sh) { 475 if (!conf->inactive_blocked) 476 sh = get_free_stripe(conf); 477 if (noblock && sh == NULL) 478 break; 479 if (!sh) { 480 conf->inactive_blocked = 1; 481 wait_event_lock_irq(conf->wait_for_stripe, 482 !list_empty(&conf->inactive_list) && 483 (atomic_read(&conf->active_stripes) 484 < (conf->max_nr_stripes *3/4) 485 || !conf->inactive_blocked), 486 conf->device_lock); 487 conf->inactive_blocked = 0; 488 } else 489 init_stripe(sh, sector, previous); 490 } else { 491 if (atomic_read(&sh->count)) { 492 BUG_ON(!list_empty(&sh->lru) 493 && !test_bit(STRIPE_EXPANDING, &sh->state) 494 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); 495 } else { 496 if (!test_bit(STRIPE_HANDLE, &sh->state)) 497 atomic_inc(&conf->active_stripes); 498 if (list_empty(&sh->lru) && 499 !test_bit(STRIPE_EXPANDING, &sh->state)) 500 BUG(); 501 list_del_init(&sh->lru); 502 } 503 } 504 } while (sh == NULL); 505 506 if (sh) 507 atomic_inc(&sh->count); 508 509 spin_unlock_irq(&conf->device_lock); 510 return sh; 511 } 512 513 /* Determine if 'data_offset' or 'new_data_offset' should be used 514 * in this stripe_head. 515 */ 516 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 517 { 518 sector_t progress = conf->reshape_progress; 519 /* Need a memory barrier to make sure we see the value 520 * of conf->generation, or ->data_offset that was set before 521 * reshape_progress was updated. 522 */ 523 smp_rmb(); 524 if (progress == MaxSector) 525 return 0; 526 if (sh->generation == conf->generation - 1) 527 return 0; 528 /* We are in a reshape, and this is a new-generation stripe, 529 * so use new_data_offset. 530 */ 531 return 1; 532 } 533 534 static void 535 raid5_end_read_request(struct bio *bi, int error); 536 static void 537 raid5_end_write_request(struct bio *bi, int error); 538 539 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 540 { 541 struct r5conf *conf = sh->raid_conf; 542 int i, disks = sh->disks; 543 544 might_sleep(); 545 546 for (i = disks; i--; ) { 547 int rw; 548 int replace_only = 0; 549 struct bio *bi, *rbi; 550 struct md_rdev *rdev, *rrdev = NULL; 551 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 552 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 553 rw = WRITE_FUA; 554 else 555 rw = WRITE; 556 if (test_bit(R5_Discard, &sh->dev[i].flags)) 557 rw |= REQ_DISCARD; 558 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 559 rw = READ; 560 else if (test_and_clear_bit(R5_WantReplace, 561 &sh->dev[i].flags)) { 562 rw = WRITE; 563 replace_only = 1; 564 } else 565 continue; 566 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 567 rw |= REQ_SYNC; 568 569 bi = &sh->dev[i].req; 570 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 571 572 rcu_read_lock(); 573 rrdev = rcu_dereference(conf->disks[i].replacement); 574 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 575 rdev = rcu_dereference(conf->disks[i].rdev); 576 if (!rdev) { 577 rdev = rrdev; 578 rrdev = NULL; 579 } 580 if (rw & WRITE) { 581 if (replace_only) 582 rdev = NULL; 583 if (rdev == rrdev) 584 /* We raced and saw duplicates */ 585 rrdev = NULL; 586 } else { 587 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 588 rdev = rrdev; 589 rrdev = NULL; 590 } 591 592 if (rdev && test_bit(Faulty, &rdev->flags)) 593 rdev = NULL; 594 if (rdev) 595 atomic_inc(&rdev->nr_pending); 596 if (rrdev && test_bit(Faulty, &rrdev->flags)) 597 rrdev = NULL; 598 if (rrdev) 599 atomic_inc(&rrdev->nr_pending); 600 rcu_read_unlock(); 601 602 /* We have already checked bad blocks for reads. Now 603 * need to check for writes. We never accept write errors 604 * on the replacement, so we don't to check rrdev. 605 */ 606 while ((rw & WRITE) && rdev && 607 test_bit(WriteErrorSeen, &rdev->flags)) { 608 sector_t first_bad; 609 int bad_sectors; 610 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 611 &first_bad, &bad_sectors); 612 if (!bad) 613 break; 614 615 if (bad < 0) { 616 set_bit(BlockedBadBlocks, &rdev->flags); 617 if (!conf->mddev->external && 618 conf->mddev->flags) { 619 /* It is very unlikely, but we might 620 * still need to write out the 621 * bad block log - better give it 622 * a chance*/ 623 md_check_recovery(conf->mddev); 624 } 625 /* 626 * Because md_wait_for_blocked_rdev 627 * will dec nr_pending, we must 628 * increment it first. 629 */ 630 atomic_inc(&rdev->nr_pending); 631 md_wait_for_blocked_rdev(rdev, conf->mddev); 632 } else { 633 /* Acknowledged bad block - skip the write */ 634 rdev_dec_pending(rdev, conf->mddev); 635 rdev = NULL; 636 } 637 } 638 639 if (rdev) { 640 if (s->syncing || s->expanding || s->expanded 641 || s->replacing) 642 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 643 644 set_bit(STRIPE_IO_STARTED, &sh->state); 645 646 bio_reset(bi); 647 bi->bi_bdev = rdev->bdev; 648 bi->bi_rw = rw; 649 bi->bi_end_io = (rw & WRITE) 650 ? raid5_end_write_request 651 : raid5_end_read_request; 652 bi->bi_private = sh; 653 654 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 655 __func__, (unsigned long long)sh->sector, 656 bi->bi_rw, i); 657 atomic_inc(&sh->count); 658 if (use_new_offset(conf, sh)) 659 bi->bi_sector = (sh->sector 660 + rdev->new_data_offset); 661 else 662 bi->bi_sector = (sh->sector 663 + rdev->data_offset); 664 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 665 bi->bi_rw |= REQ_FLUSH; 666 667 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 668 bi->bi_io_vec[0].bv_offset = 0; 669 bi->bi_size = STRIPE_SIZE; 670 if (rrdev) 671 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 672 673 if (conf->mddev->gendisk) 674 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 675 bi, disk_devt(conf->mddev->gendisk), 676 sh->dev[i].sector); 677 generic_make_request(bi); 678 } 679 if (rrdev) { 680 if (s->syncing || s->expanding || s->expanded 681 || s->replacing) 682 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 683 684 set_bit(STRIPE_IO_STARTED, &sh->state); 685 686 bio_reset(rbi); 687 rbi->bi_bdev = rrdev->bdev; 688 rbi->bi_rw = rw; 689 BUG_ON(!(rw & WRITE)); 690 rbi->bi_end_io = raid5_end_write_request; 691 rbi->bi_private = sh; 692 693 pr_debug("%s: for %llu schedule op %ld on " 694 "replacement disc %d\n", 695 __func__, (unsigned long long)sh->sector, 696 rbi->bi_rw, i); 697 atomic_inc(&sh->count); 698 if (use_new_offset(conf, sh)) 699 rbi->bi_sector = (sh->sector 700 + rrdev->new_data_offset); 701 else 702 rbi->bi_sector = (sh->sector 703 + rrdev->data_offset); 704 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 705 rbi->bi_io_vec[0].bv_offset = 0; 706 rbi->bi_size = STRIPE_SIZE; 707 if (conf->mddev->gendisk) 708 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 709 rbi, disk_devt(conf->mddev->gendisk), 710 sh->dev[i].sector); 711 generic_make_request(rbi); 712 } 713 if (!rdev && !rrdev) { 714 if (rw & WRITE) 715 set_bit(STRIPE_DEGRADED, &sh->state); 716 pr_debug("skip op %ld on disc %d for sector %llu\n", 717 bi->bi_rw, i, (unsigned long long)sh->sector); 718 clear_bit(R5_LOCKED, &sh->dev[i].flags); 719 set_bit(STRIPE_HANDLE, &sh->state); 720 } 721 } 722 } 723 724 static struct dma_async_tx_descriptor * 725 async_copy_data(int frombio, struct bio *bio, struct page *page, 726 sector_t sector, struct dma_async_tx_descriptor *tx) 727 { 728 struct bio_vec *bvl; 729 struct page *bio_page; 730 int i; 731 int page_offset; 732 struct async_submit_ctl submit; 733 enum async_tx_flags flags = 0; 734 735 if (bio->bi_sector >= sector) 736 page_offset = (signed)(bio->bi_sector - sector) * 512; 737 else 738 page_offset = (signed)(sector - bio->bi_sector) * -512; 739 740 if (frombio) 741 flags |= ASYNC_TX_FENCE; 742 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 743 744 bio_for_each_segment(bvl, bio, i) { 745 int len = bvl->bv_len; 746 int clen; 747 int b_offset = 0; 748 749 if (page_offset < 0) { 750 b_offset = -page_offset; 751 page_offset += b_offset; 752 len -= b_offset; 753 } 754 755 if (len > 0 && page_offset + len > STRIPE_SIZE) 756 clen = STRIPE_SIZE - page_offset; 757 else 758 clen = len; 759 760 if (clen > 0) { 761 b_offset += bvl->bv_offset; 762 bio_page = bvl->bv_page; 763 if (frombio) 764 tx = async_memcpy(page, bio_page, page_offset, 765 b_offset, clen, &submit); 766 else 767 tx = async_memcpy(bio_page, page, b_offset, 768 page_offset, clen, &submit); 769 } 770 /* chain the operations */ 771 submit.depend_tx = tx; 772 773 if (clen < len) /* hit end of page */ 774 break; 775 page_offset += len; 776 } 777 778 return tx; 779 } 780 781 static void ops_complete_biofill(void *stripe_head_ref) 782 { 783 struct stripe_head *sh = stripe_head_ref; 784 struct bio *return_bi = NULL; 785 int i; 786 787 pr_debug("%s: stripe %llu\n", __func__, 788 (unsigned long long)sh->sector); 789 790 /* clear completed biofills */ 791 for (i = sh->disks; i--; ) { 792 struct r5dev *dev = &sh->dev[i]; 793 794 /* acknowledge completion of a biofill operation */ 795 /* and check if we need to reply to a read request, 796 * new R5_Wantfill requests are held off until 797 * !STRIPE_BIOFILL_RUN 798 */ 799 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 800 struct bio *rbi, *rbi2; 801 802 BUG_ON(!dev->read); 803 rbi = dev->read; 804 dev->read = NULL; 805 while (rbi && rbi->bi_sector < 806 dev->sector + STRIPE_SECTORS) { 807 rbi2 = r5_next_bio(rbi, dev->sector); 808 if (!raid5_dec_bi_active_stripes(rbi)) { 809 rbi->bi_next = return_bi; 810 return_bi = rbi; 811 } 812 rbi = rbi2; 813 } 814 } 815 } 816 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 817 818 return_io(return_bi); 819 820 set_bit(STRIPE_HANDLE, &sh->state); 821 release_stripe(sh); 822 } 823 824 static void ops_run_biofill(struct stripe_head *sh) 825 { 826 struct dma_async_tx_descriptor *tx = NULL; 827 struct async_submit_ctl submit; 828 int i; 829 830 pr_debug("%s: stripe %llu\n", __func__, 831 (unsigned long long)sh->sector); 832 833 for (i = sh->disks; i--; ) { 834 struct r5dev *dev = &sh->dev[i]; 835 if (test_bit(R5_Wantfill, &dev->flags)) { 836 struct bio *rbi; 837 spin_lock_irq(&sh->stripe_lock); 838 dev->read = rbi = dev->toread; 839 dev->toread = NULL; 840 spin_unlock_irq(&sh->stripe_lock); 841 while (rbi && rbi->bi_sector < 842 dev->sector + STRIPE_SECTORS) { 843 tx = async_copy_data(0, rbi, dev->page, 844 dev->sector, tx); 845 rbi = r5_next_bio(rbi, dev->sector); 846 } 847 } 848 } 849 850 atomic_inc(&sh->count); 851 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 852 async_trigger_callback(&submit); 853 } 854 855 static void mark_target_uptodate(struct stripe_head *sh, int target) 856 { 857 struct r5dev *tgt; 858 859 if (target < 0) 860 return; 861 862 tgt = &sh->dev[target]; 863 set_bit(R5_UPTODATE, &tgt->flags); 864 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 865 clear_bit(R5_Wantcompute, &tgt->flags); 866 } 867 868 static void ops_complete_compute(void *stripe_head_ref) 869 { 870 struct stripe_head *sh = stripe_head_ref; 871 872 pr_debug("%s: stripe %llu\n", __func__, 873 (unsigned long long)sh->sector); 874 875 /* mark the computed target(s) as uptodate */ 876 mark_target_uptodate(sh, sh->ops.target); 877 mark_target_uptodate(sh, sh->ops.target2); 878 879 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 880 if (sh->check_state == check_state_compute_run) 881 sh->check_state = check_state_compute_result; 882 set_bit(STRIPE_HANDLE, &sh->state); 883 release_stripe(sh); 884 } 885 886 /* return a pointer to the address conversion region of the scribble buffer */ 887 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 888 struct raid5_percpu *percpu) 889 { 890 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 891 } 892 893 static struct dma_async_tx_descriptor * 894 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 895 { 896 int disks = sh->disks; 897 struct page **xor_srcs = percpu->scribble; 898 int target = sh->ops.target; 899 struct r5dev *tgt = &sh->dev[target]; 900 struct page *xor_dest = tgt->page; 901 int count = 0; 902 struct dma_async_tx_descriptor *tx; 903 struct async_submit_ctl submit; 904 int i; 905 906 pr_debug("%s: stripe %llu block: %d\n", 907 __func__, (unsigned long long)sh->sector, target); 908 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 909 910 for (i = disks; i--; ) 911 if (i != target) 912 xor_srcs[count++] = sh->dev[i].page; 913 914 atomic_inc(&sh->count); 915 916 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 917 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 918 if (unlikely(count == 1)) 919 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 920 else 921 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 922 923 return tx; 924 } 925 926 /* set_syndrome_sources - populate source buffers for gen_syndrome 927 * @srcs - (struct page *) array of size sh->disks 928 * @sh - stripe_head to parse 929 * 930 * Populates srcs in proper layout order for the stripe and returns the 931 * 'count' of sources to be used in a call to async_gen_syndrome. The P 932 * destination buffer is recorded in srcs[count] and the Q destination 933 * is recorded in srcs[count+1]]. 934 */ 935 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 936 { 937 int disks = sh->disks; 938 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 939 int d0_idx = raid6_d0(sh); 940 int count; 941 int i; 942 943 for (i = 0; i < disks; i++) 944 srcs[i] = NULL; 945 946 count = 0; 947 i = d0_idx; 948 do { 949 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 950 951 srcs[slot] = sh->dev[i].page; 952 i = raid6_next_disk(i, disks); 953 } while (i != d0_idx); 954 955 return syndrome_disks; 956 } 957 958 static struct dma_async_tx_descriptor * 959 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 960 { 961 int disks = sh->disks; 962 struct page **blocks = percpu->scribble; 963 int target; 964 int qd_idx = sh->qd_idx; 965 struct dma_async_tx_descriptor *tx; 966 struct async_submit_ctl submit; 967 struct r5dev *tgt; 968 struct page *dest; 969 int i; 970 int count; 971 972 if (sh->ops.target < 0) 973 target = sh->ops.target2; 974 else if (sh->ops.target2 < 0) 975 target = sh->ops.target; 976 else 977 /* we should only have one valid target */ 978 BUG(); 979 BUG_ON(target < 0); 980 pr_debug("%s: stripe %llu block: %d\n", 981 __func__, (unsigned long long)sh->sector, target); 982 983 tgt = &sh->dev[target]; 984 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 985 dest = tgt->page; 986 987 atomic_inc(&sh->count); 988 989 if (target == qd_idx) { 990 count = set_syndrome_sources(blocks, sh); 991 blocks[count] = NULL; /* regenerating p is not necessary */ 992 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 993 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 994 ops_complete_compute, sh, 995 to_addr_conv(sh, percpu)); 996 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 997 } else { 998 /* Compute any data- or p-drive using XOR */ 999 count = 0; 1000 for (i = disks; i-- ; ) { 1001 if (i == target || i == qd_idx) 1002 continue; 1003 blocks[count++] = sh->dev[i].page; 1004 } 1005 1006 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1007 NULL, ops_complete_compute, sh, 1008 to_addr_conv(sh, percpu)); 1009 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1010 } 1011 1012 return tx; 1013 } 1014 1015 static struct dma_async_tx_descriptor * 1016 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1017 { 1018 int i, count, disks = sh->disks; 1019 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1020 int d0_idx = raid6_d0(sh); 1021 int faila = -1, failb = -1; 1022 int target = sh->ops.target; 1023 int target2 = sh->ops.target2; 1024 struct r5dev *tgt = &sh->dev[target]; 1025 struct r5dev *tgt2 = &sh->dev[target2]; 1026 struct dma_async_tx_descriptor *tx; 1027 struct page **blocks = percpu->scribble; 1028 struct async_submit_ctl submit; 1029 1030 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1031 __func__, (unsigned long long)sh->sector, target, target2); 1032 BUG_ON(target < 0 || target2 < 0); 1033 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1034 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1035 1036 /* we need to open-code set_syndrome_sources to handle the 1037 * slot number conversion for 'faila' and 'failb' 1038 */ 1039 for (i = 0; i < disks ; i++) 1040 blocks[i] = NULL; 1041 count = 0; 1042 i = d0_idx; 1043 do { 1044 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1045 1046 blocks[slot] = sh->dev[i].page; 1047 1048 if (i == target) 1049 faila = slot; 1050 if (i == target2) 1051 failb = slot; 1052 i = raid6_next_disk(i, disks); 1053 } while (i != d0_idx); 1054 1055 BUG_ON(faila == failb); 1056 if (failb < faila) 1057 swap(faila, failb); 1058 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1059 __func__, (unsigned long long)sh->sector, faila, failb); 1060 1061 atomic_inc(&sh->count); 1062 1063 if (failb == syndrome_disks+1) { 1064 /* Q disk is one of the missing disks */ 1065 if (faila == syndrome_disks) { 1066 /* Missing P+Q, just recompute */ 1067 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1068 ops_complete_compute, sh, 1069 to_addr_conv(sh, percpu)); 1070 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1071 STRIPE_SIZE, &submit); 1072 } else { 1073 struct page *dest; 1074 int data_target; 1075 int qd_idx = sh->qd_idx; 1076 1077 /* Missing D+Q: recompute D from P, then recompute Q */ 1078 if (target == qd_idx) 1079 data_target = target2; 1080 else 1081 data_target = target; 1082 1083 count = 0; 1084 for (i = disks; i-- ; ) { 1085 if (i == data_target || i == qd_idx) 1086 continue; 1087 blocks[count++] = sh->dev[i].page; 1088 } 1089 dest = sh->dev[data_target].page; 1090 init_async_submit(&submit, 1091 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1092 NULL, NULL, NULL, 1093 to_addr_conv(sh, percpu)); 1094 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1095 &submit); 1096 1097 count = set_syndrome_sources(blocks, sh); 1098 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1099 ops_complete_compute, sh, 1100 to_addr_conv(sh, percpu)); 1101 return async_gen_syndrome(blocks, 0, count+2, 1102 STRIPE_SIZE, &submit); 1103 } 1104 } else { 1105 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1106 ops_complete_compute, sh, 1107 to_addr_conv(sh, percpu)); 1108 if (failb == syndrome_disks) { 1109 /* We're missing D+P. */ 1110 return async_raid6_datap_recov(syndrome_disks+2, 1111 STRIPE_SIZE, faila, 1112 blocks, &submit); 1113 } else { 1114 /* We're missing D+D. */ 1115 return async_raid6_2data_recov(syndrome_disks+2, 1116 STRIPE_SIZE, faila, failb, 1117 blocks, &submit); 1118 } 1119 } 1120 } 1121 1122 1123 static void ops_complete_prexor(void *stripe_head_ref) 1124 { 1125 struct stripe_head *sh = stripe_head_ref; 1126 1127 pr_debug("%s: stripe %llu\n", __func__, 1128 (unsigned long long)sh->sector); 1129 } 1130 1131 static struct dma_async_tx_descriptor * 1132 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1133 struct dma_async_tx_descriptor *tx) 1134 { 1135 int disks = sh->disks; 1136 struct page **xor_srcs = percpu->scribble; 1137 int count = 0, pd_idx = sh->pd_idx, i; 1138 struct async_submit_ctl submit; 1139 1140 /* existing parity data subtracted */ 1141 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1142 1143 pr_debug("%s: stripe %llu\n", __func__, 1144 (unsigned long long)sh->sector); 1145 1146 for (i = disks; i--; ) { 1147 struct r5dev *dev = &sh->dev[i]; 1148 /* Only process blocks that are known to be uptodate */ 1149 if (test_bit(R5_Wantdrain, &dev->flags)) 1150 xor_srcs[count++] = dev->page; 1151 } 1152 1153 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1154 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1155 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1156 1157 return tx; 1158 } 1159 1160 static struct dma_async_tx_descriptor * 1161 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1162 { 1163 int disks = sh->disks; 1164 int i; 1165 1166 pr_debug("%s: stripe %llu\n", __func__, 1167 (unsigned long long)sh->sector); 1168 1169 for (i = disks; i--; ) { 1170 struct r5dev *dev = &sh->dev[i]; 1171 struct bio *chosen; 1172 1173 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1174 struct bio *wbi; 1175 1176 spin_lock_irq(&sh->stripe_lock); 1177 chosen = dev->towrite; 1178 dev->towrite = NULL; 1179 BUG_ON(dev->written); 1180 wbi = dev->written = chosen; 1181 spin_unlock_irq(&sh->stripe_lock); 1182 1183 while (wbi && wbi->bi_sector < 1184 dev->sector + STRIPE_SECTORS) { 1185 if (wbi->bi_rw & REQ_FUA) 1186 set_bit(R5_WantFUA, &dev->flags); 1187 if (wbi->bi_rw & REQ_SYNC) 1188 set_bit(R5_SyncIO, &dev->flags); 1189 if (wbi->bi_rw & REQ_DISCARD) 1190 set_bit(R5_Discard, &dev->flags); 1191 else 1192 tx = async_copy_data(1, wbi, dev->page, 1193 dev->sector, tx); 1194 wbi = r5_next_bio(wbi, dev->sector); 1195 } 1196 } 1197 } 1198 1199 return tx; 1200 } 1201 1202 static void ops_complete_reconstruct(void *stripe_head_ref) 1203 { 1204 struct stripe_head *sh = stripe_head_ref; 1205 int disks = sh->disks; 1206 int pd_idx = sh->pd_idx; 1207 int qd_idx = sh->qd_idx; 1208 int i; 1209 bool fua = false, sync = false, discard = false; 1210 1211 pr_debug("%s: stripe %llu\n", __func__, 1212 (unsigned long long)sh->sector); 1213 1214 for (i = disks; i--; ) { 1215 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1216 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1217 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1218 } 1219 1220 for (i = disks; i--; ) { 1221 struct r5dev *dev = &sh->dev[i]; 1222 1223 if (dev->written || i == pd_idx || i == qd_idx) { 1224 if (!discard) 1225 set_bit(R5_UPTODATE, &dev->flags); 1226 if (fua) 1227 set_bit(R5_WantFUA, &dev->flags); 1228 if (sync) 1229 set_bit(R5_SyncIO, &dev->flags); 1230 } 1231 } 1232 1233 if (sh->reconstruct_state == reconstruct_state_drain_run) 1234 sh->reconstruct_state = reconstruct_state_drain_result; 1235 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1236 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1237 else { 1238 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1239 sh->reconstruct_state = reconstruct_state_result; 1240 } 1241 1242 set_bit(STRIPE_HANDLE, &sh->state); 1243 release_stripe(sh); 1244 } 1245 1246 static void 1247 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1248 struct dma_async_tx_descriptor *tx) 1249 { 1250 int disks = sh->disks; 1251 struct page **xor_srcs = percpu->scribble; 1252 struct async_submit_ctl submit; 1253 int count = 0, pd_idx = sh->pd_idx, i; 1254 struct page *xor_dest; 1255 int prexor = 0; 1256 unsigned long flags; 1257 1258 pr_debug("%s: stripe %llu\n", __func__, 1259 (unsigned long long)sh->sector); 1260 1261 for (i = 0; i < sh->disks; i++) { 1262 if (pd_idx == i) 1263 continue; 1264 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1265 break; 1266 } 1267 if (i >= sh->disks) { 1268 atomic_inc(&sh->count); 1269 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1270 ops_complete_reconstruct(sh); 1271 return; 1272 } 1273 /* check if prexor is active which means only process blocks 1274 * that are part of a read-modify-write (written) 1275 */ 1276 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1277 prexor = 1; 1278 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1279 for (i = disks; i--; ) { 1280 struct r5dev *dev = &sh->dev[i]; 1281 if (dev->written) 1282 xor_srcs[count++] = dev->page; 1283 } 1284 } else { 1285 xor_dest = sh->dev[pd_idx].page; 1286 for (i = disks; i--; ) { 1287 struct r5dev *dev = &sh->dev[i]; 1288 if (i != pd_idx) 1289 xor_srcs[count++] = dev->page; 1290 } 1291 } 1292 1293 /* 1/ if we prexor'd then the dest is reused as a source 1294 * 2/ if we did not prexor then we are redoing the parity 1295 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1296 * for the synchronous xor case 1297 */ 1298 flags = ASYNC_TX_ACK | 1299 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1300 1301 atomic_inc(&sh->count); 1302 1303 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1304 to_addr_conv(sh, percpu)); 1305 if (unlikely(count == 1)) 1306 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1307 else 1308 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1309 } 1310 1311 static void 1312 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1313 struct dma_async_tx_descriptor *tx) 1314 { 1315 struct async_submit_ctl submit; 1316 struct page **blocks = percpu->scribble; 1317 int count, i; 1318 1319 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1320 1321 for (i = 0; i < sh->disks; i++) { 1322 if (sh->pd_idx == i || sh->qd_idx == i) 1323 continue; 1324 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1325 break; 1326 } 1327 if (i >= sh->disks) { 1328 atomic_inc(&sh->count); 1329 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1330 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1331 ops_complete_reconstruct(sh); 1332 return; 1333 } 1334 1335 count = set_syndrome_sources(blocks, sh); 1336 1337 atomic_inc(&sh->count); 1338 1339 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1340 sh, to_addr_conv(sh, percpu)); 1341 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1342 } 1343 1344 static void ops_complete_check(void *stripe_head_ref) 1345 { 1346 struct stripe_head *sh = stripe_head_ref; 1347 1348 pr_debug("%s: stripe %llu\n", __func__, 1349 (unsigned long long)sh->sector); 1350 1351 sh->check_state = check_state_check_result; 1352 set_bit(STRIPE_HANDLE, &sh->state); 1353 release_stripe(sh); 1354 } 1355 1356 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1357 { 1358 int disks = sh->disks; 1359 int pd_idx = sh->pd_idx; 1360 int qd_idx = sh->qd_idx; 1361 struct page *xor_dest; 1362 struct page **xor_srcs = percpu->scribble; 1363 struct dma_async_tx_descriptor *tx; 1364 struct async_submit_ctl submit; 1365 int count; 1366 int i; 1367 1368 pr_debug("%s: stripe %llu\n", __func__, 1369 (unsigned long long)sh->sector); 1370 1371 count = 0; 1372 xor_dest = sh->dev[pd_idx].page; 1373 xor_srcs[count++] = xor_dest; 1374 for (i = disks; i--; ) { 1375 if (i == pd_idx || i == qd_idx) 1376 continue; 1377 xor_srcs[count++] = sh->dev[i].page; 1378 } 1379 1380 init_async_submit(&submit, 0, NULL, NULL, NULL, 1381 to_addr_conv(sh, percpu)); 1382 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1383 &sh->ops.zero_sum_result, &submit); 1384 1385 atomic_inc(&sh->count); 1386 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1387 tx = async_trigger_callback(&submit); 1388 } 1389 1390 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1391 { 1392 struct page **srcs = percpu->scribble; 1393 struct async_submit_ctl submit; 1394 int count; 1395 1396 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1397 (unsigned long long)sh->sector, checkp); 1398 1399 count = set_syndrome_sources(srcs, sh); 1400 if (!checkp) 1401 srcs[count] = NULL; 1402 1403 atomic_inc(&sh->count); 1404 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1405 sh, to_addr_conv(sh, percpu)); 1406 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1407 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1408 } 1409 1410 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1411 { 1412 int overlap_clear = 0, i, disks = sh->disks; 1413 struct dma_async_tx_descriptor *tx = NULL; 1414 struct r5conf *conf = sh->raid_conf; 1415 int level = conf->level; 1416 struct raid5_percpu *percpu; 1417 unsigned long cpu; 1418 1419 cpu = get_cpu(); 1420 percpu = per_cpu_ptr(conf->percpu, cpu); 1421 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1422 ops_run_biofill(sh); 1423 overlap_clear++; 1424 } 1425 1426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1427 if (level < 6) 1428 tx = ops_run_compute5(sh, percpu); 1429 else { 1430 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1431 tx = ops_run_compute6_1(sh, percpu); 1432 else 1433 tx = ops_run_compute6_2(sh, percpu); 1434 } 1435 /* terminate the chain if reconstruct is not set to be run */ 1436 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1437 async_tx_ack(tx); 1438 } 1439 1440 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1441 tx = ops_run_prexor(sh, percpu, tx); 1442 1443 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1444 tx = ops_run_biodrain(sh, tx); 1445 overlap_clear++; 1446 } 1447 1448 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1449 if (level < 6) 1450 ops_run_reconstruct5(sh, percpu, tx); 1451 else 1452 ops_run_reconstruct6(sh, percpu, tx); 1453 } 1454 1455 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1456 if (sh->check_state == check_state_run) 1457 ops_run_check_p(sh, percpu); 1458 else if (sh->check_state == check_state_run_q) 1459 ops_run_check_pq(sh, percpu, 0); 1460 else if (sh->check_state == check_state_run_pq) 1461 ops_run_check_pq(sh, percpu, 1); 1462 else 1463 BUG(); 1464 } 1465 1466 if (overlap_clear) 1467 for (i = disks; i--; ) { 1468 struct r5dev *dev = &sh->dev[i]; 1469 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1470 wake_up(&sh->raid_conf->wait_for_overlap); 1471 } 1472 put_cpu(); 1473 } 1474 1475 static int grow_one_stripe(struct r5conf *conf) 1476 { 1477 struct stripe_head *sh; 1478 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1479 if (!sh) 1480 return 0; 1481 1482 sh->raid_conf = conf; 1483 1484 spin_lock_init(&sh->stripe_lock); 1485 1486 if (grow_buffers(sh)) { 1487 shrink_buffers(sh); 1488 kmem_cache_free(conf->slab_cache, sh); 1489 return 0; 1490 } 1491 /* we just created an active stripe so... */ 1492 atomic_set(&sh->count, 1); 1493 atomic_inc(&conf->active_stripes); 1494 INIT_LIST_HEAD(&sh->lru); 1495 release_stripe(sh); 1496 return 1; 1497 } 1498 1499 static int grow_stripes(struct r5conf *conf, int num) 1500 { 1501 struct kmem_cache *sc; 1502 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1503 1504 if (conf->mddev->gendisk) 1505 sprintf(conf->cache_name[0], 1506 "raid%d-%s", conf->level, mdname(conf->mddev)); 1507 else 1508 sprintf(conf->cache_name[0], 1509 "raid%d-%p", conf->level, conf->mddev); 1510 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1511 1512 conf->active_name = 0; 1513 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1514 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1515 0, 0, NULL); 1516 if (!sc) 1517 return 1; 1518 conf->slab_cache = sc; 1519 conf->pool_size = devs; 1520 while (num--) 1521 if (!grow_one_stripe(conf)) 1522 return 1; 1523 return 0; 1524 } 1525 1526 /** 1527 * scribble_len - return the required size of the scribble region 1528 * @num - total number of disks in the array 1529 * 1530 * The size must be enough to contain: 1531 * 1/ a struct page pointer for each device in the array +2 1532 * 2/ room to convert each entry in (1) to its corresponding dma 1533 * (dma_map_page()) or page (page_address()) address. 1534 * 1535 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1536 * calculate over all devices (not just the data blocks), using zeros in place 1537 * of the P and Q blocks. 1538 */ 1539 static size_t scribble_len(int num) 1540 { 1541 size_t len; 1542 1543 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1544 1545 return len; 1546 } 1547 1548 static int resize_stripes(struct r5conf *conf, int newsize) 1549 { 1550 /* Make all the stripes able to hold 'newsize' devices. 1551 * New slots in each stripe get 'page' set to a new page. 1552 * 1553 * This happens in stages: 1554 * 1/ create a new kmem_cache and allocate the required number of 1555 * stripe_heads. 1556 * 2/ gather all the old stripe_heads and transfer the pages across 1557 * to the new stripe_heads. This will have the side effect of 1558 * freezing the array as once all stripe_heads have been collected, 1559 * no IO will be possible. Old stripe heads are freed once their 1560 * pages have been transferred over, and the old kmem_cache is 1561 * freed when all stripes are done. 1562 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 1563 * we simple return a failre status - no need to clean anything up. 1564 * 4/ allocate new pages for the new slots in the new stripe_heads. 1565 * If this fails, we don't bother trying the shrink the 1566 * stripe_heads down again, we just leave them as they are. 1567 * As each stripe_head is processed the new one is released into 1568 * active service. 1569 * 1570 * Once step2 is started, we cannot afford to wait for a write, 1571 * so we use GFP_NOIO allocations. 1572 */ 1573 struct stripe_head *osh, *nsh; 1574 LIST_HEAD(newstripes); 1575 struct disk_info *ndisks; 1576 unsigned long cpu; 1577 int err; 1578 struct kmem_cache *sc; 1579 int i; 1580 1581 if (newsize <= conf->pool_size) 1582 return 0; /* never bother to shrink */ 1583 1584 err = md_allow_write(conf->mddev); 1585 if (err) 1586 return err; 1587 1588 /* Step 1 */ 1589 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1590 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1591 0, 0, NULL); 1592 if (!sc) 1593 return -ENOMEM; 1594 1595 for (i = conf->max_nr_stripes; i; i--) { 1596 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1597 if (!nsh) 1598 break; 1599 1600 nsh->raid_conf = conf; 1601 spin_lock_init(&nsh->stripe_lock); 1602 1603 list_add(&nsh->lru, &newstripes); 1604 } 1605 if (i) { 1606 /* didn't get enough, give up */ 1607 while (!list_empty(&newstripes)) { 1608 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1609 list_del(&nsh->lru); 1610 kmem_cache_free(sc, nsh); 1611 } 1612 kmem_cache_destroy(sc); 1613 return -ENOMEM; 1614 } 1615 /* Step 2 - Must use GFP_NOIO now. 1616 * OK, we have enough stripes, start collecting inactive 1617 * stripes and copying them over 1618 */ 1619 list_for_each_entry(nsh, &newstripes, lru) { 1620 spin_lock_irq(&conf->device_lock); 1621 wait_event_lock_irq(conf->wait_for_stripe, 1622 !list_empty(&conf->inactive_list), 1623 conf->device_lock); 1624 osh = get_free_stripe(conf); 1625 spin_unlock_irq(&conf->device_lock); 1626 atomic_set(&nsh->count, 1); 1627 for(i=0; i<conf->pool_size; i++) 1628 nsh->dev[i].page = osh->dev[i].page; 1629 for( ; i<newsize; i++) 1630 nsh->dev[i].page = NULL; 1631 kmem_cache_free(conf->slab_cache, osh); 1632 } 1633 kmem_cache_destroy(conf->slab_cache); 1634 1635 /* Step 3. 1636 * At this point, we are holding all the stripes so the array 1637 * is completely stalled, so now is a good time to resize 1638 * conf->disks and the scribble region 1639 */ 1640 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1641 if (ndisks) { 1642 for (i=0; i<conf->raid_disks; i++) 1643 ndisks[i] = conf->disks[i]; 1644 kfree(conf->disks); 1645 conf->disks = ndisks; 1646 } else 1647 err = -ENOMEM; 1648 1649 get_online_cpus(); 1650 conf->scribble_len = scribble_len(newsize); 1651 for_each_present_cpu(cpu) { 1652 struct raid5_percpu *percpu; 1653 void *scribble; 1654 1655 percpu = per_cpu_ptr(conf->percpu, cpu); 1656 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1657 1658 if (scribble) { 1659 kfree(percpu->scribble); 1660 percpu->scribble = scribble; 1661 } else { 1662 err = -ENOMEM; 1663 break; 1664 } 1665 } 1666 put_online_cpus(); 1667 1668 /* Step 4, return new stripes to service */ 1669 while(!list_empty(&newstripes)) { 1670 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1671 list_del_init(&nsh->lru); 1672 1673 for (i=conf->raid_disks; i < newsize; i++) 1674 if (nsh->dev[i].page == NULL) { 1675 struct page *p = alloc_page(GFP_NOIO); 1676 nsh->dev[i].page = p; 1677 if (!p) 1678 err = -ENOMEM; 1679 } 1680 release_stripe(nsh); 1681 } 1682 /* critical section pass, GFP_NOIO no longer needed */ 1683 1684 conf->slab_cache = sc; 1685 conf->active_name = 1-conf->active_name; 1686 conf->pool_size = newsize; 1687 return err; 1688 } 1689 1690 static int drop_one_stripe(struct r5conf *conf) 1691 { 1692 struct stripe_head *sh; 1693 1694 spin_lock_irq(&conf->device_lock); 1695 sh = get_free_stripe(conf); 1696 spin_unlock_irq(&conf->device_lock); 1697 if (!sh) 1698 return 0; 1699 BUG_ON(atomic_read(&sh->count)); 1700 shrink_buffers(sh); 1701 kmem_cache_free(conf->slab_cache, sh); 1702 atomic_dec(&conf->active_stripes); 1703 return 1; 1704 } 1705 1706 static void shrink_stripes(struct r5conf *conf) 1707 { 1708 while (drop_one_stripe(conf)) 1709 ; 1710 1711 if (conf->slab_cache) 1712 kmem_cache_destroy(conf->slab_cache); 1713 conf->slab_cache = NULL; 1714 } 1715 1716 static void raid5_end_read_request(struct bio * bi, int error) 1717 { 1718 struct stripe_head *sh = bi->bi_private; 1719 struct r5conf *conf = sh->raid_conf; 1720 int disks = sh->disks, i; 1721 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1722 char b[BDEVNAME_SIZE]; 1723 struct md_rdev *rdev = NULL; 1724 sector_t s; 1725 1726 for (i=0 ; i<disks; i++) 1727 if (bi == &sh->dev[i].req) 1728 break; 1729 1730 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1731 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1732 uptodate); 1733 if (i == disks) { 1734 BUG(); 1735 return; 1736 } 1737 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1738 /* If replacement finished while this request was outstanding, 1739 * 'replacement' might be NULL already. 1740 * In that case it moved down to 'rdev'. 1741 * rdev is not removed until all requests are finished. 1742 */ 1743 rdev = conf->disks[i].replacement; 1744 if (!rdev) 1745 rdev = conf->disks[i].rdev; 1746 1747 if (use_new_offset(conf, sh)) 1748 s = sh->sector + rdev->new_data_offset; 1749 else 1750 s = sh->sector + rdev->data_offset; 1751 if (uptodate) { 1752 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1753 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1754 /* Note that this cannot happen on a 1755 * replacement device. We just fail those on 1756 * any error 1757 */ 1758 printk_ratelimited( 1759 KERN_INFO 1760 "md/raid:%s: read error corrected" 1761 " (%lu sectors at %llu on %s)\n", 1762 mdname(conf->mddev), STRIPE_SECTORS, 1763 (unsigned long long)s, 1764 bdevname(rdev->bdev, b)); 1765 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1766 clear_bit(R5_ReadError, &sh->dev[i].flags); 1767 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1768 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1769 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1770 1771 if (atomic_read(&rdev->read_errors)) 1772 atomic_set(&rdev->read_errors, 0); 1773 } else { 1774 const char *bdn = bdevname(rdev->bdev, b); 1775 int retry = 0; 1776 int set_bad = 0; 1777 1778 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1779 atomic_inc(&rdev->read_errors); 1780 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1781 printk_ratelimited( 1782 KERN_WARNING 1783 "md/raid:%s: read error on replacement device " 1784 "(sector %llu on %s).\n", 1785 mdname(conf->mddev), 1786 (unsigned long long)s, 1787 bdn); 1788 else if (conf->mddev->degraded >= conf->max_degraded) { 1789 set_bad = 1; 1790 printk_ratelimited( 1791 KERN_WARNING 1792 "md/raid:%s: read error not correctable " 1793 "(sector %llu on %s).\n", 1794 mdname(conf->mddev), 1795 (unsigned long long)s, 1796 bdn); 1797 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1798 /* Oh, no!!! */ 1799 set_bad = 1; 1800 printk_ratelimited( 1801 KERN_WARNING 1802 "md/raid:%s: read error NOT corrected!! " 1803 "(sector %llu on %s).\n", 1804 mdname(conf->mddev), 1805 (unsigned long long)s, 1806 bdn); 1807 } else if (atomic_read(&rdev->read_errors) 1808 > conf->max_nr_stripes) 1809 printk(KERN_WARNING 1810 "md/raid:%s: Too many read errors, failing device %s.\n", 1811 mdname(conf->mddev), bdn); 1812 else 1813 retry = 1; 1814 if (retry) 1815 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1816 set_bit(R5_ReadError, &sh->dev[i].flags); 1817 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1818 } else 1819 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1820 else { 1821 clear_bit(R5_ReadError, &sh->dev[i].flags); 1822 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1823 if (!(set_bad 1824 && test_bit(In_sync, &rdev->flags) 1825 && rdev_set_badblocks( 1826 rdev, sh->sector, STRIPE_SECTORS, 0))) 1827 md_error(conf->mddev, rdev); 1828 } 1829 } 1830 rdev_dec_pending(rdev, conf->mddev); 1831 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1832 set_bit(STRIPE_HANDLE, &sh->state); 1833 release_stripe(sh); 1834 } 1835 1836 static void raid5_end_write_request(struct bio *bi, int error) 1837 { 1838 struct stripe_head *sh = bi->bi_private; 1839 struct r5conf *conf = sh->raid_conf; 1840 int disks = sh->disks, i; 1841 struct md_rdev *uninitialized_var(rdev); 1842 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1843 sector_t first_bad; 1844 int bad_sectors; 1845 int replacement = 0; 1846 1847 for (i = 0 ; i < disks; i++) { 1848 if (bi == &sh->dev[i].req) { 1849 rdev = conf->disks[i].rdev; 1850 break; 1851 } 1852 if (bi == &sh->dev[i].rreq) { 1853 rdev = conf->disks[i].replacement; 1854 if (rdev) 1855 replacement = 1; 1856 else 1857 /* rdev was removed and 'replacement' 1858 * replaced it. rdev is not removed 1859 * until all requests are finished. 1860 */ 1861 rdev = conf->disks[i].rdev; 1862 break; 1863 } 1864 } 1865 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1866 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1867 uptodate); 1868 if (i == disks) { 1869 BUG(); 1870 return; 1871 } 1872 1873 if (replacement) { 1874 if (!uptodate) 1875 md_error(conf->mddev, rdev); 1876 else if (is_badblock(rdev, sh->sector, 1877 STRIPE_SECTORS, 1878 &first_bad, &bad_sectors)) 1879 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1880 } else { 1881 if (!uptodate) { 1882 set_bit(WriteErrorSeen, &rdev->flags); 1883 set_bit(R5_WriteError, &sh->dev[i].flags); 1884 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1885 set_bit(MD_RECOVERY_NEEDED, 1886 &rdev->mddev->recovery); 1887 } else if (is_badblock(rdev, sh->sector, 1888 STRIPE_SECTORS, 1889 &first_bad, &bad_sectors)) { 1890 set_bit(R5_MadeGood, &sh->dev[i].flags); 1891 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 1892 /* That was a successful write so make 1893 * sure it looks like we already did 1894 * a re-write. 1895 */ 1896 set_bit(R5_ReWrite, &sh->dev[i].flags); 1897 } 1898 } 1899 rdev_dec_pending(rdev, conf->mddev); 1900 1901 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1902 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1903 set_bit(STRIPE_HANDLE, &sh->state); 1904 release_stripe(sh); 1905 } 1906 1907 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1908 1909 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1910 { 1911 struct r5dev *dev = &sh->dev[i]; 1912 1913 bio_init(&dev->req); 1914 dev->req.bi_io_vec = &dev->vec; 1915 dev->req.bi_vcnt++; 1916 dev->req.bi_max_vecs++; 1917 dev->req.bi_private = sh; 1918 dev->vec.bv_page = dev->page; 1919 1920 bio_init(&dev->rreq); 1921 dev->rreq.bi_io_vec = &dev->rvec; 1922 dev->rreq.bi_vcnt++; 1923 dev->rreq.bi_max_vecs++; 1924 dev->rreq.bi_private = sh; 1925 dev->rvec.bv_page = dev->page; 1926 1927 dev->flags = 0; 1928 dev->sector = compute_blocknr(sh, i, previous); 1929 } 1930 1931 static void error(struct mddev *mddev, struct md_rdev *rdev) 1932 { 1933 char b[BDEVNAME_SIZE]; 1934 struct r5conf *conf = mddev->private; 1935 unsigned long flags; 1936 pr_debug("raid456: error called\n"); 1937 1938 spin_lock_irqsave(&conf->device_lock, flags); 1939 clear_bit(In_sync, &rdev->flags); 1940 mddev->degraded = calc_degraded(conf); 1941 spin_unlock_irqrestore(&conf->device_lock, flags); 1942 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1943 1944 set_bit(Blocked, &rdev->flags); 1945 set_bit(Faulty, &rdev->flags); 1946 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1947 printk(KERN_ALERT 1948 "md/raid:%s: Disk failure on %s, disabling device.\n" 1949 "md/raid:%s: Operation continuing on %d devices.\n", 1950 mdname(mddev), 1951 bdevname(rdev->bdev, b), 1952 mdname(mddev), 1953 conf->raid_disks - mddev->degraded); 1954 } 1955 1956 /* 1957 * Input: a 'big' sector number, 1958 * Output: index of the data and parity disk, and the sector # in them. 1959 */ 1960 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1961 int previous, int *dd_idx, 1962 struct stripe_head *sh) 1963 { 1964 sector_t stripe, stripe2; 1965 sector_t chunk_number; 1966 unsigned int chunk_offset; 1967 int pd_idx, qd_idx; 1968 int ddf_layout = 0; 1969 sector_t new_sector; 1970 int algorithm = previous ? conf->prev_algo 1971 : conf->algorithm; 1972 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1973 : conf->chunk_sectors; 1974 int raid_disks = previous ? conf->previous_raid_disks 1975 : conf->raid_disks; 1976 int data_disks = raid_disks - conf->max_degraded; 1977 1978 /* First compute the information on this sector */ 1979 1980 /* 1981 * Compute the chunk number and the sector offset inside the chunk 1982 */ 1983 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1984 chunk_number = r_sector; 1985 1986 /* 1987 * Compute the stripe number 1988 */ 1989 stripe = chunk_number; 1990 *dd_idx = sector_div(stripe, data_disks); 1991 stripe2 = stripe; 1992 /* 1993 * Select the parity disk based on the user selected algorithm. 1994 */ 1995 pd_idx = qd_idx = -1; 1996 switch(conf->level) { 1997 case 4: 1998 pd_idx = data_disks; 1999 break; 2000 case 5: 2001 switch (algorithm) { 2002 case ALGORITHM_LEFT_ASYMMETRIC: 2003 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2004 if (*dd_idx >= pd_idx) 2005 (*dd_idx)++; 2006 break; 2007 case ALGORITHM_RIGHT_ASYMMETRIC: 2008 pd_idx = sector_div(stripe2, raid_disks); 2009 if (*dd_idx >= pd_idx) 2010 (*dd_idx)++; 2011 break; 2012 case ALGORITHM_LEFT_SYMMETRIC: 2013 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2014 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2015 break; 2016 case ALGORITHM_RIGHT_SYMMETRIC: 2017 pd_idx = sector_div(stripe2, raid_disks); 2018 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2019 break; 2020 case ALGORITHM_PARITY_0: 2021 pd_idx = 0; 2022 (*dd_idx)++; 2023 break; 2024 case ALGORITHM_PARITY_N: 2025 pd_idx = data_disks; 2026 break; 2027 default: 2028 BUG(); 2029 } 2030 break; 2031 case 6: 2032 2033 switch (algorithm) { 2034 case ALGORITHM_LEFT_ASYMMETRIC: 2035 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2036 qd_idx = pd_idx + 1; 2037 if (pd_idx == raid_disks-1) { 2038 (*dd_idx)++; /* Q D D D P */ 2039 qd_idx = 0; 2040 } else if (*dd_idx >= pd_idx) 2041 (*dd_idx) += 2; /* D D P Q D */ 2042 break; 2043 case ALGORITHM_RIGHT_ASYMMETRIC: 2044 pd_idx = sector_div(stripe2, raid_disks); 2045 qd_idx = pd_idx + 1; 2046 if (pd_idx == raid_disks-1) { 2047 (*dd_idx)++; /* Q D D D P */ 2048 qd_idx = 0; 2049 } else if (*dd_idx >= pd_idx) 2050 (*dd_idx) += 2; /* D D P Q D */ 2051 break; 2052 case ALGORITHM_LEFT_SYMMETRIC: 2053 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2054 qd_idx = (pd_idx + 1) % raid_disks; 2055 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2056 break; 2057 case ALGORITHM_RIGHT_SYMMETRIC: 2058 pd_idx = sector_div(stripe2, raid_disks); 2059 qd_idx = (pd_idx + 1) % raid_disks; 2060 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2061 break; 2062 2063 case ALGORITHM_PARITY_0: 2064 pd_idx = 0; 2065 qd_idx = 1; 2066 (*dd_idx) += 2; 2067 break; 2068 case ALGORITHM_PARITY_N: 2069 pd_idx = data_disks; 2070 qd_idx = data_disks + 1; 2071 break; 2072 2073 case ALGORITHM_ROTATING_ZERO_RESTART: 2074 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2075 * of blocks for computing Q is different. 2076 */ 2077 pd_idx = sector_div(stripe2, raid_disks); 2078 qd_idx = pd_idx + 1; 2079 if (pd_idx == raid_disks-1) { 2080 (*dd_idx)++; /* Q D D D P */ 2081 qd_idx = 0; 2082 } else if (*dd_idx >= pd_idx) 2083 (*dd_idx) += 2; /* D D P Q D */ 2084 ddf_layout = 1; 2085 break; 2086 2087 case ALGORITHM_ROTATING_N_RESTART: 2088 /* Same a left_asymmetric, by first stripe is 2089 * D D D P Q rather than 2090 * Q D D D P 2091 */ 2092 stripe2 += 1; 2093 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2094 qd_idx = pd_idx + 1; 2095 if (pd_idx == raid_disks-1) { 2096 (*dd_idx)++; /* Q D D D P */ 2097 qd_idx = 0; 2098 } else if (*dd_idx >= pd_idx) 2099 (*dd_idx) += 2; /* D D P Q D */ 2100 ddf_layout = 1; 2101 break; 2102 2103 case ALGORITHM_ROTATING_N_CONTINUE: 2104 /* Same as left_symmetric but Q is before P */ 2105 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2106 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2107 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2108 ddf_layout = 1; 2109 break; 2110 2111 case ALGORITHM_LEFT_ASYMMETRIC_6: 2112 /* RAID5 left_asymmetric, with Q on last device */ 2113 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2114 if (*dd_idx >= pd_idx) 2115 (*dd_idx)++; 2116 qd_idx = raid_disks - 1; 2117 break; 2118 2119 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2120 pd_idx = sector_div(stripe2, raid_disks-1); 2121 if (*dd_idx >= pd_idx) 2122 (*dd_idx)++; 2123 qd_idx = raid_disks - 1; 2124 break; 2125 2126 case ALGORITHM_LEFT_SYMMETRIC_6: 2127 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2128 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2129 qd_idx = raid_disks - 1; 2130 break; 2131 2132 case ALGORITHM_RIGHT_SYMMETRIC_6: 2133 pd_idx = sector_div(stripe2, raid_disks-1); 2134 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2135 qd_idx = raid_disks - 1; 2136 break; 2137 2138 case ALGORITHM_PARITY_0_6: 2139 pd_idx = 0; 2140 (*dd_idx)++; 2141 qd_idx = raid_disks - 1; 2142 break; 2143 2144 default: 2145 BUG(); 2146 } 2147 break; 2148 } 2149 2150 if (sh) { 2151 sh->pd_idx = pd_idx; 2152 sh->qd_idx = qd_idx; 2153 sh->ddf_layout = ddf_layout; 2154 } 2155 /* 2156 * Finally, compute the new sector number 2157 */ 2158 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2159 return new_sector; 2160 } 2161 2162 2163 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2164 { 2165 struct r5conf *conf = sh->raid_conf; 2166 int raid_disks = sh->disks; 2167 int data_disks = raid_disks - conf->max_degraded; 2168 sector_t new_sector = sh->sector, check; 2169 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2170 : conf->chunk_sectors; 2171 int algorithm = previous ? conf->prev_algo 2172 : conf->algorithm; 2173 sector_t stripe; 2174 int chunk_offset; 2175 sector_t chunk_number; 2176 int dummy1, dd_idx = i; 2177 sector_t r_sector; 2178 struct stripe_head sh2; 2179 2180 2181 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2182 stripe = new_sector; 2183 2184 if (i == sh->pd_idx) 2185 return 0; 2186 switch(conf->level) { 2187 case 4: break; 2188 case 5: 2189 switch (algorithm) { 2190 case ALGORITHM_LEFT_ASYMMETRIC: 2191 case ALGORITHM_RIGHT_ASYMMETRIC: 2192 if (i > sh->pd_idx) 2193 i--; 2194 break; 2195 case ALGORITHM_LEFT_SYMMETRIC: 2196 case ALGORITHM_RIGHT_SYMMETRIC: 2197 if (i < sh->pd_idx) 2198 i += raid_disks; 2199 i -= (sh->pd_idx + 1); 2200 break; 2201 case ALGORITHM_PARITY_0: 2202 i -= 1; 2203 break; 2204 case ALGORITHM_PARITY_N: 2205 break; 2206 default: 2207 BUG(); 2208 } 2209 break; 2210 case 6: 2211 if (i == sh->qd_idx) 2212 return 0; /* It is the Q disk */ 2213 switch (algorithm) { 2214 case ALGORITHM_LEFT_ASYMMETRIC: 2215 case ALGORITHM_RIGHT_ASYMMETRIC: 2216 case ALGORITHM_ROTATING_ZERO_RESTART: 2217 case ALGORITHM_ROTATING_N_RESTART: 2218 if (sh->pd_idx == raid_disks-1) 2219 i--; /* Q D D D P */ 2220 else if (i > sh->pd_idx) 2221 i -= 2; /* D D P Q D */ 2222 break; 2223 case ALGORITHM_LEFT_SYMMETRIC: 2224 case ALGORITHM_RIGHT_SYMMETRIC: 2225 if (sh->pd_idx == raid_disks-1) 2226 i--; /* Q D D D P */ 2227 else { 2228 /* D D P Q D */ 2229 if (i < sh->pd_idx) 2230 i += raid_disks; 2231 i -= (sh->pd_idx + 2); 2232 } 2233 break; 2234 case ALGORITHM_PARITY_0: 2235 i -= 2; 2236 break; 2237 case ALGORITHM_PARITY_N: 2238 break; 2239 case ALGORITHM_ROTATING_N_CONTINUE: 2240 /* Like left_symmetric, but P is before Q */ 2241 if (sh->pd_idx == 0) 2242 i--; /* P D D D Q */ 2243 else { 2244 /* D D Q P D */ 2245 if (i < sh->pd_idx) 2246 i += raid_disks; 2247 i -= (sh->pd_idx + 1); 2248 } 2249 break; 2250 case ALGORITHM_LEFT_ASYMMETRIC_6: 2251 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2252 if (i > sh->pd_idx) 2253 i--; 2254 break; 2255 case ALGORITHM_LEFT_SYMMETRIC_6: 2256 case ALGORITHM_RIGHT_SYMMETRIC_6: 2257 if (i < sh->pd_idx) 2258 i += data_disks + 1; 2259 i -= (sh->pd_idx + 1); 2260 break; 2261 case ALGORITHM_PARITY_0_6: 2262 i -= 1; 2263 break; 2264 default: 2265 BUG(); 2266 } 2267 break; 2268 } 2269 2270 chunk_number = stripe * data_disks + i; 2271 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2272 2273 check = raid5_compute_sector(conf, r_sector, 2274 previous, &dummy1, &sh2); 2275 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2276 || sh2.qd_idx != sh->qd_idx) { 2277 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2278 mdname(conf->mddev)); 2279 return 0; 2280 } 2281 return r_sector; 2282 } 2283 2284 2285 static void 2286 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2287 int rcw, int expand) 2288 { 2289 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2290 struct r5conf *conf = sh->raid_conf; 2291 int level = conf->level; 2292 2293 if (rcw) { 2294 2295 for (i = disks; i--; ) { 2296 struct r5dev *dev = &sh->dev[i]; 2297 2298 if (dev->towrite) { 2299 set_bit(R5_LOCKED, &dev->flags); 2300 set_bit(R5_Wantdrain, &dev->flags); 2301 if (!expand) 2302 clear_bit(R5_UPTODATE, &dev->flags); 2303 s->locked++; 2304 } 2305 } 2306 /* if we are not expanding this is a proper write request, and 2307 * there will be bios with new data to be drained into the 2308 * stripe cache 2309 */ 2310 if (!expand) { 2311 if (!s->locked) 2312 /* False alarm, nothing to do */ 2313 return; 2314 sh->reconstruct_state = reconstruct_state_drain_run; 2315 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2316 } else 2317 sh->reconstruct_state = reconstruct_state_run; 2318 2319 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2320 2321 if (s->locked + conf->max_degraded == disks) 2322 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2323 atomic_inc(&conf->pending_full_writes); 2324 } else { 2325 BUG_ON(level == 6); 2326 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2327 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2328 2329 for (i = disks; i--; ) { 2330 struct r5dev *dev = &sh->dev[i]; 2331 if (i == pd_idx) 2332 continue; 2333 2334 if (dev->towrite && 2335 (test_bit(R5_UPTODATE, &dev->flags) || 2336 test_bit(R5_Wantcompute, &dev->flags))) { 2337 set_bit(R5_Wantdrain, &dev->flags); 2338 set_bit(R5_LOCKED, &dev->flags); 2339 clear_bit(R5_UPTODATE, &dev->flags); 2340 s->locked++; 2341 } 2342 } 2343 if (!s->locked) 2344 /* False alarm - nothing to do */ 2345 return; 2346 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2347 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2348 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2349 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2350 } 2351 2352 /* keep the parity disk(s) locked while asynchronous operations 2353 * are in flight 2354 */ 2355 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2356 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2357 s->locked++; 2358 2359 if (level == 6) { 2360 int qd_idx = sh->qd_idx; 2361 struct r5dev *dev = &sh->dev[qd_idx]; 2362 2363 set_bit(R5_LOCKED, &dev->flags); 2364 clear_bit(R5_UPTODATE, &dev->flags); 2365 s->locked++; 2366 } 2367 2368 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2369 __func__, (unsigned long long)sh->sector, 2370 s->locked, s->ops_request); 2371 } 2372 2373 /* 2374 * Each stripe/dev can have one or more bion attached. 2375 * toread/towrite point to the first in a chain. 2376 * The bi_next chain must be in order. 2377 */ 2378 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2379 { 2380 struct bio **bip; 2381 struct r5conf *conf = sh->raid_conf; 2382 int firstwrite=0; 2383 2384 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2385 (unsigned long long)bi->bi_sector, 2386 (unsigned long long)sh->sector); 2387 2388 /* 2389 * If several bio share a stripe. The bio bi_phys_segments acts as a 2390 * reference count to avoid race. The reference count should already be 2391 * increased before this function is called (for example, in 2392 * make_request()), so other bio sharing this stripe will not free the 2393 * stripe. If a stripe is owned by one stripe, the stripe lock will 2394 * protect it. 2395 */ 2396 spin_lock_irq(&sh->stripe_lock); 2397 if (forwrite) { 2398 bip = &sh->dev[dd_idx].towrite; 2399 if (*bip == NULL) 2400 firstwrite = 1; 2401 } else 2402 bip = &sh->dev[dd_idx].toread; 2403 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2404 if (bio_end_sector(*bip) > bi->bi_sector) 2405 goto overlap; 2406 bip = & (*bip)->bi_next; 2407 } 2408 if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2409 goto overlap; 2410 2411 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2412 if (*bip) 2413 bi->bi_next = *bip; 2414 *bip = bi; 2415 raid5_inc_bi_active_stripes(bi); 2416 2417 if (forwrite) { 2418 /* check if page is covered */ 2419 sector_t sector = sh->dev[dd_idx].sector; 2420 for (bi=sh->dev[dd_idx].towrite; 2421 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2422 bi && bi->bi_sector <= sector; 2423 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2424 if (bio_end_sector(bi) >= sector) 2425 sector = bio_end_sector(bi); 2426 } 2427 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2428 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2429 } 2430 2431 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2432 (unsigned long long)(*bip)->bi_sector, 2433 (unsigned long long)sh->sector, dd_idx); 2434 spin_unlock_irq(&sh->stripe_lock); 2435 2436 if (conf->mddev->bitmap && firstwrite) { 2437 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2438 STRIPE_SECTORS, 0); 2439 sh->bm_seq = conf->seq_flush+1; 2440 set_bit(STRIPE_BIT_DELAY, &sh->state); 2441 } 2442 return 1; 2443 2444 overlap: 2445 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2446 spin_unlock_irq(&sh->stripe_lock); 2447 return 0; 2448 } 2449 2450 static void end_reshape(struct r5conf *conf); 2451 2452 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2453 struct stripe_head *sh) 2454 { 2455 int sectors_per_chunk = 2456 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2457 int dd_idx; 2458 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2459 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 2460 2461 raid5_compute_sector(conf, 2462 stripe * (disks - conf->max_degraded) 2463 *sectors_per_chunk + chunk_offset, 2464 previous, 2465 &dd_idx, sh); 2466 } 2467 2468 static void 2469 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2470 struct stripe_head_state *s, int disks, 2471 struct bio **return_bi) 2472 { 2473 int i; 2474 for (i = disks; i--; ) { 2475 struct bio *bi; 2476 int bitmap_end = 0; 2477 2478 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2479 struct md_rdev *rdev; 2480 rcu_read_lock(); 2481 rdev = rcu_dereference(conf->disks[i].rdev); 2482 if (rdev && test_bit(In_sync, &rdev->flags)) 2483 atomic_inc(&rdev->nr_pending); 2484 else 2485 rdev = NULL; 2486 rcu_read_unlock(); 2487 if (rdev) { 2488 if (!rdev_set_badblocks( 2489 rdev, 2490 sh->sector, 2491 STRIPE_SECTORS, 0)) 2492 md_error(conf->mddev, rdev); 2493 rdev_dec_pending(rdev, conf->mddev); 2494 } 2495 } 2496 spin_lock_irq(&sh->stripe_lock); 2497 /* fail all writes first */ 2498 bi = sh->dev[i].towrite; 2499 sh->dev[i].towrite = NULL; 2500 spin_unlock_irq(&sh->stripe_lock); 2501 if (bi) 2502 bitmap_end = 1; 2503 2504 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2505 wake_up(&conf->wait_for_overlap); 2506 2507 while (bi && bi->bi_sector < 2508 sh->dev[i].sector + STRIPE_SECTORS) { 2509 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2510 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2511 if (!raid5_dec_bi_active_stripes(bi)) { 2512 md_write_end(conf->mddev); 2513 bi->bi_next = *return_bi; 2514 *return_bi = bi; 2515 } 2516 bi = nextbi; 2517 } 2518 if (bitmap_end) 2519 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2520 STRIPE_SECTORS, 0, 0); 2521 bitmap_end = 0; 2522 /* and fail all 'written' */ 2523 bi = sh->dev[i].written; 2524 sh->dev[i].written = NULL; 2525 if (bi) bitmap_end = 1; 2526 while (bi && bi->bi_sector < 2527 sh->dev[i].sector + STRIPE_SECTORS) { 2528 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2529 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2530 if (!raid5_dec_bi_active_stripes(bi)) { 2531 md_write_end(conf->mddev); 2532 bi->bi_next = *return_bi; 2533 *return_bi = bi; 2534 } 2535 bi = bi2; 2536 } 2537 2538 /* fail any reads if this device is non-operational and 2539 * the data has not reached the cache yet. 2540 */ 2541 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2542 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2543 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2544 spin_lock_irq(&sh->stripe_lock); 2545 bi = sh->dev[i].toread; 2546 sh->dev[i].toread = NULL; 2547 spin_unlock_irq(&sh->stripe_lock); 2548 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2549 wake_up(&conf->wait_for_overlap); 2550 while (bi && bi->bi_sector < 2551 sh->dev[i].sector + STRIPE_SECTORS) { 2552 struct bio *nextbi = 2553 r5_next_bio(bi, sh->dev[i].sector); 2554 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2555 if (!raid5_dec_bi_active_stripes(bi)) { 2556 bi->bi_next = *return_bi; 2557 *return_bi = bi; 2558 } 2559 bi = nextbi; 2560 } 2561 } 2562 if (bitmap_end) 2563 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2564 STRIPE_SECTORS, 0, 0); 2565 /* If we were in the middle of a write the parity block might 2566 * still be locked - so just clear all R5_LOCKED flags 2567 */ 2568 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2569 } 2570 2571 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2572 if (atomic_dec_and_test(&conf->pending_full_writes)) 2573 md_wakeup_thread(conf->mddev->thread); 2574 } 2575 2576 static void 2577 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2578 struct stripe_head_state *s) 2579 { 2580 int abort = 0; 2581 int i; 2582 2583 clear_bit(STRIPE_SYNCING, &sh->state); 2584 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 2585 wake_up(&conf->wait_for_overlap); 2586 s->syncing = 0; 2587 s->replacing = 0; 2588 /* There is nothing more to do for sync/check/repair. 2589 * Don't even need to abort as that is handled elsewhere 2590 * if needed, and not always wanted e.g. if there is a known 2591 * bad block here. 2592 * For recover/replace we need to record a bad block on all 2593 * non-sync devices, or abort the recovery 2594 */ 2595 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2596 /* During recovery devices cannot be removed, so 2597 * locking and refcounting of rdevs is not needed 2598 */ 2599 for (i = 0; i < conf->raid_disks; i++) { 2600 struct md_rdev *rdev = conf->disks[i].rdev; 2601 if (rdev 2602 && !test_bit(Faulty, &rdev->flags) 2603 && !test_bit(In_sync, &rdev->flags) 2604 && !rdev_set_badblocks(rdev, sh->sector, 2605 STRIPE_SECTORS, 0)) 2606 abort = 1; 2607 rdev = conf->disks[i].replacement; 2608 if (rdev 2609 && !test_bit(Faulty, &rdev->flags) 2610 && !test_bit(In_sync, &rdev->flags) 2611 && !rdev_set_badblocks(rdev, sh->sector, 2612 STRIPE_SECTORS, 0)) 2613 abort = 1; 2614 } 2615 if (abort) 2616 conf->recovery_disabled = 2617 conf->mddev->recovery_disabled; 2618 } 2619 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2620 } 2621 2622 static int want_replace(struct stripe_head *sh, int disk_idx) 2623 { 2624 struct md_rdev *rdev; 2625 int rv = 0; 2626 /* Doing recovery so rcu locking not required */ 2627 rdev = sh->raid_conf->disks[disk_idx].replacement; 2628 if (rdev 2629 && !test_bit(Faulty, &rdev->flags) 2630 && !test_bit(In_sync, &rdev->flags) 2631 && (rdev->recovery_offset <= sh->sector 2632 || rdev->mddev->recovery_cp <= sh->sector)) 2633 rv = 1; 2634 2635 return rv; 2636 } 2637 2638 /* fetch_block - checks the given member device to see if its data needs 2639 * to be read or computed to satisfy a request. 2640 * 2641 * Returns 1 when no more member devices need to be checked, otherwise returns 2642 * 0 to tell the loop in handle_stripe_fill to continue 2643 */ 2644 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2645 int disk_idx, int disks) 2646 { 2647 struct r5dev *dev = &sh->dev[disk_idx]; 2648 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2649 &sh->dev[s->failed_num[1]] }; 2650 2651 /* is the data in this block needed, and can we get it? */ 2652 if (!test_bit(R5_LOCKED, &dev->flags) && 2653 !test_bit(R5_UPTODATE, &dev->flags) && 2654 (dev->toread || 2655 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2656 s->syncing || s->expanding || 2657 (s->replacing && want_replace(sh, disk_idx)) || 2658 (s->failed >= 1 && fdev[0]->toread) || 2659 (s->failed >= 2 && fdev[1]->toread) || 2660 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2661 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2662 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2663 /* we would like to get this block, possibly by computing it, 2664 * otherwise read it if the backing disk is insync 2665 */ 2666 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2667 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2668 if ((s->uptodate == disks - 1) && 2669 (s->failed && (disk_idx == s->failed_num[0] || 2670 disk_idx == s->failed_num[1]))) { 2671 /* have disk failed, and we're requested to fetch it; 2672 * do compute it 2673 */ 2674 pr_debug("Computing stripe %llu block %d\n", 2675 (unsigned long long)sh->sector, disk_idx); 2676 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2677 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2678 set_bit(R5_Wantcompute, &dev->flags); 2679 sh->ops.target = disk_idx; 2680 sh->ops.target2 = -1; /* no 2nd target */ 2681 s->req_compute = 1; 2682 /* Careful: from this point on 'uptodate' is in the eye 2683 * of raid_run_ops which services 'compute' operations 2684 * before writes. R5_Wantcompute flags a block that will 2685 * be R5_UPTODATE by the time it is needed for a 2686 * subsequent operation. 2687 */ 2688 s->uptodate++; 2689 return 1; 2690 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2691 /* Computing 2-failure is *very* expensive; only 2692 * do it if failed >= 2 2693 */ 2694 int other; 2695 for (other = disks; other--; ) { 2696 if (other == disk_idx) 2697 continue; 2698 if (!test_bit(R5_UPTODATE, 2699 &sh->dev[other].flags)) 2700 break; 2701 } 2702 BUG_ON(other < 0); 2703 pr_debug("Computing stripe %llu blocks %d,%d\n", 2704 (unsigned long long)sh->sector, 2705 disk_idx, other); 2706 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2707 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2708 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2709 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2710 sh->ops.target = disk_idx; 2711 sh->ops.target2 = other; 2712 s->uptodate += 2; 2713 s->req_compute = 1; 2714 return 1; 2715 } else if (test_bit(R5_Insync, &dev->flags)) { 2716 set_bit(R5_LOCKED, &dev->flags); 2717 set_bit(R5_Wantread, &dev->flags); 2718 s->locked++; 2719 pr_debug("Reading block %d (sync=%d)\n", 2720 disk_idx, s->syncing); 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 /** 2728 * handle_stripe_fill - read or compute data to satisfy pending requests. 2729 */ 2730 static void handle_stripe_fill(struct stripe_head *sh, 2731 struct stripe_head_state *s, 2732 int disks) 2733 { 2734 int i; 2735 2736 /* look for blocks to read/compute, skip this if a compute 2737 * is already in flight, or if the stripe contents are in the 2738 * midst of changing due to a write 2739 */ 2740 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2741 !sh->reconstruct_state) 2742 for (i = disks; i--; ) 2743 if (fetch_block(sh, s, i, disks)) 2744 break; 2745 set_bit(STRIPE_HANDLE, &sh->state); 2746 } 2747 2748 2749 /* handle_stripe_clean_event 2750 * any written block on an uptodate or failed drive can be returned. 2751 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2752 * never LOCKED, so we don't need to test 'failed' directly. 2753 */ 2754 static void handle_stripe_clean_event(struct r5conf *conf, 2755 struct stripe_head *sh, int disks, struct bio **return_bi) 2756 { 2757 int i; 2758 struct r5dev *dev; 2759 int discard_pending = 0; 2760 2761 for (i = disks; i--; ) 2762 if (sh->dev[i].written) { 2763 dev = &sh->dev[i]; 2764 if (!test_bit(R5_LOCKED, &dev->flags) && 2765 (test_bit(R5_UPTODATE, &dev->flags) || 2766 test_bit(R5_Discard, &dev->flags))) { 2767 /* We can return any write requests */ 2768 struct bio *wbi, *wbi2; 2769 pr_debug("Return write for disc %d\n", i); 2770 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2771 clear_bit(R5_UPTODATE, &dev->flags); 2772 wbi = dev->written; 2773 dev->written = NULL; 2774 while (wbi && wbi->bi_sector < 2775 dev->sector + STRIPE_SECTORS) { 2776 wbi2 = r5_next_bio(wbi, dev->sector); 2777 if (!raid5_dec_bi_active_stripes(wbi)) { 2778 md_write_end(conf->mddev); 2779 wbi->bi_next = *return_bi; 2780 *return_bi = wbi; 2781 } 2782 wbi = wbi2; 2783 } 2784 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2785 STRIPE_SECTORS, 2786 !test_bit(STRIPE_DEGRADED, &sh->state), 2787 0); 2788 } else if (test_bit(R5_Discard, &dev->flags)) 2789 discard_pending = 1; 2790 } 2791 if (!discard_pending && 2792 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 2793 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2794 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2795 if (sh->qd_idx >= 0) { 2796 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2797 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 2798 } 2799 /* now that discard is done we can proceed with any sync */ 2800 clear_bit(STRIPE_DISCARD, &sh->state); 2801 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 2802 set_bit(STRIPE_HANDLE, &sh->state); 2803 2804 } 2805 2806 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2807 if (atomic_dec_and_test(&conf->pending_full_writes)) 2808 md_wakeup_thread(conf->mddev->thread); 2809 } 2810 2811 static void handle_stripe_dirtying(struct r5conf *conf, 2812 struct stripe_head *sh, 2813 struct stripe_head_state *s, 2814 int disks) 2815 { 2816 int rmw = 0, rcw = 0, i; 2817 sector_t recovery_cp = conf->mddev->recovery_cp; 2818 2819 /* RAID6 requires 'rcw' in current implementation. 2820 * Otherwise, check whether resync is now happening or should start. 2821 * If yes, then the array is dirty (after unclean shutdown or 2822 * initial creation), so parity in some stripes might be inconsistent. 2823 * In this case, we need to always do reconstruct-write, to ensure 2824 * that in case of drive failure or read-error correction, we 2825 * generate correct data from the parity. 2826 */ 2827 if (conf->max_degraded == 2 || 2828 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2829 /* Calculate the real rcw later - for now make it 2830 * look like rcw is cheaper 2831 */ 2832 rcw = 1; rmw = 2; 2833 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2834 conf->max_degraded, (unsigned long long)recovery_cp, 2835 (unsigned long long)sh->sector); 2836 } else for (i = disks; i--; ) { 2837 /* would I have to read this buffer for read_modify_write */ 2838 struct r5dev *dev = &sh->dev[i]; 2839 if ((dev->towrite || i == sh->pd_idx) && 2840 !test_bit(R5_LOCKED, &dev->flags) && 2841 !(test_bit(R5_UPTODATE, &dev->flags) || 2842 test_bit(R5_Wantcompute, &dev->flags))) { 2843 if (test_bit(R5_Insync, &dev->flags)) 2844 rmw++; 2845 else 2846 rmw += 2*disks; /* cannot read it */ 2847 } 2848 /* Would I have to read this buffer for reconstruct_write */ 2849 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2850 !test_bit(R5_LOCKED, &dev->flags) && 2851 !(test_bit(R5_UPTODATE, &dev->flags) || 2852 test_bit(R5_Wantcompute, &dev->flags))) { 2853 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2854 else 2855 rcw += 2*disks; 2856 } 2857 } 2858 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2859 (unsigned long long)sh->sector, rmw, rcw); 2860 set_bit(STRIPE_HANDLE, &sh->state); 2861 if (rmw < rcw && rmw > 0) { 2862 /* prefer read-modify-write, but need to get some data */ 2863 if (conf->mddev->queue) 2864 blk_add_trace_msg(conf->mddev->queue, 2865 "raid5 rmw %llu %d", 2866 (unsigned long long)sh->sector, rmw); 2867 for (i = disks; i--; ) { 2868 struct r5dev *dev = &sh->dev[i]; 2869 if ((dev->towrite || i == sh->pd_idx) && 2870 !test_bit(R5_LOCKED, &dev->flags) && 2871 !(test_bit(R5_UPTODATE, &dev->flags) || 2872 test_bit(R5_Wantcompute, &dev->flags)) && 2873 test_bit(R5_Insync, &dev->flags)) { 2874 if ( 2875 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2876 pr_debug("Read_old block " 2877 "%d for r-m-w\n", i); 2878 set_bit(R5_LOCKED, &dev->flags); 2879 set_bit(R5_Wantread, &dev->flags); 2880 s->locked++; 2881 } else { 2882 set_bit(STRIPE_DELAYED, &sh->state); 2883 set_bit(STRIPE_HANDLE, &sh->state); 2884 } 2885 } 2886 } 2887 } 2888 if (rcw <= rmw && rcw > 0) { 2889 /* want reconstruct write, but need to get some data */ 2890 int qread =0; 2891 rcw = 0; 2892 for (i = disks; i--; ) { 2893 struct r5dev *dev = &sh->dev[i]; 2894 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2895 i != sh->pd_idx && i != sh->qd_idx && 2896 !test_bit(R5_LOCKED, &dev->flags) && 2897 !(test_bit(R5_UPTODATE, &dev->flags) || 2898 test_bit(R5_Wantcompute, &dev->flags))) { 2899 rcw++; 2900 if (!test_bit(R5_Insync, &dev->flags)) 2901 continue; /* it's a failed drive */ 2902 if ( 2903 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2904 pr_debug("Read_old block " 2905 "%d for Reconstruct\n", i); 2906 set_bit(R5_LOCKED, &dev->flags); 2907 set_bit(R5_Wantread, &dev->flags); 2908 s->locked++; 2909 qread++; 2910 } else { 2911 set_bit(STRIPE_DELAYED, &sh->state); 2912 set_bit(STRIPE_HANDLE, &sh->state); 2913 } 2914 } 2915 } 2916 if (rcw && conf->mddev->queue) 2917 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 2918 (unsigned long long)sh->sector, 2919 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 2920 } 2921 /* now if nothing is locked, and if we have enough data, 2922 * we can start a write request 2923 */ 2924 /* since handle_stripe can be called at any time we need to handle the 2925 * case where a compute block operation has been submitted and then a 2926 * subsequent call wants to start a write request. raid_run_ops only 2927 * handles the case where compute block and reconstruct are requested 2928 * simultaneously. If this is not the case then new writes need to be 2929 * held off until the compute completes. 2930 */ 2931 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2932 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2933 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2934 schedule_reconstruction(sh, s, rcw == 0, 0); 2935 } 2936 2937 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2938 struct stripe_head_state *s, int disks) 2939 { 2940 struct r5dev *dev = NULL; 2941 2942 set_bit(STRIPE_HANDLE, &sh->state); 2943 2944 switch (sh->check_state) { 2945 case check_state_idle: 2946 /* start a new check operation if there are no failures */ 2947 if (s->failed == 0) { 2948 BUG_ON(s->uptodate != disks); 2949 sh->check_state = check_state_run; 2950 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2951 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2952 s->uptodate--; 2953 break; 2954 } 2955 dev = &sh->dev[s->failed_num[0]]; 2956 /* fall through */ 2957 case check_state_compute_result: 2958 sh->check_state = check_state_idle; 2959 if (!dev) 2960 dev = &sh->dev[sh->pd_idx]; 2961 2962 /* check that a write has not made the stripe insync */ 2963 if (test_bit(STRIPE_INSYNC, &sh->state)) 2964 break; 2965 2966 /* either failed parity check, or recovery is happening */ 2967 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2968 BUG_ON(s->uptodate != disks); 2969 2970 set_bit(R5_LOCKED, &dev->flags); 2971 s->locked++; 2972 set_bit(R5_Wantwrite, &dev->flags); 2973 2974 clear_bit(STRIPE_DEGRADED, &sh->state); 2975 set_bit(STRIPE_INSYNC, &sh->state); 2976 break; 2977 case check_state_run: 2978 break; /* we will be called again upon completion */ 2979 case check_state_check_result: 2980 sh->check_state = check_state_idle; 2981 2982 /* if a failure occurred during the check operation, leave 2983 * STRIPE_INSYNC not set and let the stripe be handled again 2984 */ 2985 if (s->failed) 2986 break; 2987 2988 /* handle a successful check operation, if parity is correct 2989 * we are done. Otherwise update the mismatch count and repair 2990 * parity if !MD_RECOVERY_CHECK 2991 */ 2992 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2993 /* parity is correct (on disc, 2994 * not in buffer any more) 2995 */ 2996 set_bit(STRIPE_INSYNC, &sh->state); 2997 else { 2998 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2999 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3000 /* don't try to repair!! */ 3001 set_bit(STRIPE_INSYNC, &sh->state); 3002 else { 3003 sh->check_state = check_state_compute_run; 3004 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3005 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3006 set_bit(R5_Wantcompute, 3007 &sh->dev[sh->pd_idx].flags); 3008 sh->ops.target = sh->pd_idx; 3009 sh->ops.target2 = -1; 3010 s->uptodate++; 3011 } 3012 } 3013 break; 3014 case check_state_compute_run: 3015 break; 3016 default: 3017 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3018 __func__, sh->check_state, 3019 (unsigned long long) sh->sector); 3020 BUG(); 3021 } 3022 } 3023 3024 3025 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3026 struct stripe_head_state *s, 3027 int disks) 3028 { 3029 int pd_idx = sh->pd_idx; 3030 int qd_idx = sh->qd_idx; 3031 struct r5dev *dev; 3032 3033 set_bit(STRIPE_HANDLE, &sh->state); 3034 3035 BUG_ON(s->failed > 2); 3036 3037 /* Want to check and possibly repair P and Q. 3038 * However there could be one 'failed' device, in which 3039 * case we can only check one of them, possibly using the 3040 * other to generate missing data 3041 */ 3042 3043 switch (sh->check_state) { 3044 case check_state_idle: 3045 /* start a new check operation if there are < 2 failures */ 3046 if (s->failed == s->q_failed) { 3047 /* The only possible failed device holds Q, so it 3048 * makes sense to check P (If anything else were failed, 3049 * we would have used P to recreate it). 3050 */ 3051 sh->check_state = check_state_run; 3052 } 3053 if (!s->q_failed && s->failed < 2) { 3054 /* Q is not failed, and we didn't use it to generate 3055 * anything, so it makes sense to check it 3056 */ 3057 if (sh->check_state == check_state_run) 3058 sh->check_state = check_state_run_pq; 3059 else 3060 sh->check_state = check_state_run_q; 3061 } 3062 3063 /* discard potentially stale zero_sum_result */ 3064 sh->ops.zero_sum_result = 0; 3065 3066 if (sh->check_state == check_state_run) { 3067 /* async_xor_zero_sum destroys the contents of P */ 3068 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3069 s->uptodate--; 3070 } 3071 if (sh->check_state >= check_state_run && 3072 sh->check_state <= check_state_run_pq) { 3073 /* async_syndrome_zero_sum preserves P and Q, so 3074 * no need to mark them !uptodate here 3075 */ 3076 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3077 break; 3078 } 3079 3080 /* we have 2-disk failure */ 3081 BUG_ON(s->failed != 2); 3082 /* fall through */ 3083 case check_state_compute_result: 3084 sh->check_state = check_state_idle; 3085 3086 /* check that a write has not made the stripe insync */ 3087 if (test_bit(STRIPE_INSYNC, &sh->state)) 3088 break; 3089 3090 /* now write out any block on a failed drive, 3091 * or P or Q if they were recomputed 3092 */ 3093 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3094 if (s->failed == 2) { 3095 dev = &sh->dev[s->failed_num[1]]; 3096 s->locked++; 3097 set_bit(R5_LOCKED, &dev->flags); 3098 set_bit(R5_Wantwrite, &dev->flags); 3099 } 3100 if (s->failed >= 1) { 3101 dev = &sh->dev[s->failed_num[0]]; 3102 s->locked++; 3103 set_bit(R5_LOCKED, &dev->flags); 3104 set_bit(R5_Wantwrite, &dev->flags); 3105 } 3106 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3107 dev = &sh->dev[pd_idx]; 3108 s->locked++; 3109 set_bit(R5_LOCKED, &dev->flags); 3110 set_bit(R5_Wantwrite, &dev->flags); 3111 } 3112 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3113 dev = &sh->dev[qd_idx]; 3114 s->locked++; 3115 set_bit(R5_LOCKED, &dev->flags); 3116 set_bit(R5_Wantwrite, &dev->flags); 3117 } 3118 clear_bit(STRIPE_DEGRADED, &sh->state); 3119 3120 set_bit(STRIPE_INSYNC, &sh->state); 3121 break; 3122 case check_state_run: 3123 case check_state_run_q: 3124 case check_state_run_pq: 3125 break; /* we will be called again upon completion */ 3126 case check_state_check_result: 3127 sh->check_state = check_state_idle; 3128 3129 /* handle a successful check operation, if parity is correct 3130 * we are done. Otherwise update the mismatch count and repair 3131 * parity if !MD_RECOVERY_CHECK 3132 */ 3133 if (sh->ops.zero_sum_result == 0) { 3134 /* both parities are correct */ 3135 if (!s->failed) 3136 set_bit(STRIPE_INSYNC, &sh->state); 3137 else { 3138 /* in contrast to the raid5 case we can validate 3139 * parity, but still have a failure to write 3140 * back 3141 */ 3142 sh->check_state = check_state_compute_result; 3143 /* Returning at this point means that we may go 3144 * off and bring p and/or q uptodate again so 3145 * we make sure to check zero_sum_result again 3146 * to verify if p or q need writeback 3147 */ 3148 } 3149 } else { 3150 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3151 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3152 /* don't try to repair!! */ 3153 set_bit(STRIPE_INSYNC, &sh->state); 3154 else { 3155 int *target = &sh->ops.target; 3156 3157 sh->ops.target = -1; 3158 sh->ops.target2 = -1; 3159 sh->check_state = check_state_compute_run; 3160 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3161 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3162 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3163 set_bit(R5_Wantcompute, 3164 &sh->dev[pd_idx].flags); 3165 *target = pd_idx; 3166 target = &sh->ops.target2; 3167 s->uptodate++; 3168 } 3169 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3170 set_bit(R5_Wantcompute, 3171 &sh->dev[qd_idx].flags); 3172 *target = qd_idx; 3173 s->uptodate++; 3174 } 3175 } 3176 } 3177 break; 3178 case check_state_compute_run: 3179 break; 3180 default: 3181 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3182 __func__, sh->check_state, 3183 (unsigned long long) sh->sector); 3184 BUG(); 3185 } 3186 } 3187 3188 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3189 { 3190 int i; 3191 3192 /* We have read all the blocks in this stripe and now we need to 3193 * copy some of them into a target stripe for expand. 3194 */ 3195 struct dma_async_tx_descriptor *tx = NULL; 3196 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3197 for (i = 0; i < sh->disks; i++) 3198 if (i != sh->pd_idx && i != sh->qd_idx) { 3199 int dd_idx, j; 3200 struct stripe_head *sh2; 3201 struct async_submit_ctl submit; 3202 3203 sector_t bn = compute_blocknr(sh, i, 1); 3204 sector_t s = raid5_compute_sector(conf, bn, 0, 3205 &dd_idx, NULL); 3206 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3207 if (sh2 == NULL) 3208 /* so far only the early blocks of this stripe 3209 * have been requested. When later blocks 3210 * get requested, we will try again 3211 */ 3212 continue; 3213 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3214 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3215 /* must have already done this block */ 3216 release_stripe(sh2); 3217 continue; 3218 } 3219 3220 /* place all the copies on one channel */ 3221 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3222 tx = async_memcpy(sh2->dev[dd_idx].page, 3223 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3224 &submit); 3225 3226 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3227 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3228 for (j = 0; j < conf->raid_disks; j++) 3229 if (j != sh2->pd_idx && 3230 j != sh2->qd_idx && 3231 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3232 break; 3233 if (j == conf->raid_disks) { 3234 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3235 set_bit(STRIPE_HANDLE, &sh2->state); 3236 } 3237 release_stripe(sh2); 3238 3239 } 3240 /* done submitting copies, wait for them to complete */ 3241 async_tx_quiesce(&tx); 3242 } 3243 3244 /* 3245 * handle_stripe - do things to a stripe. 3246 * 3247 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3248 * state of various bits to see what needs to be done. 3249 * Possible results: 3250 * return some read requests which now have data 3251 * return some write requests which are safely on storage 3252 * schedule a read on some buffers 3253 * schedule a write of some buffers 3254 * return confirmation of parity correctness 3255 * 3256 */ 3257 3258 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3259 { 3260 struct r5conf *conf = sh->raid_conf; 3261 int disks = sh->disks; 3262 struct r5dev *dev; 3263 int i; 3264 int do_recovery = 0; 3265 3266 memset(s, 0, sizeof(*s)); 3267 3268 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3269 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3270 s->failed_num[0] = -1; 3271 s->failed_num[1] = -1; 3272 3273 /* Now to look around and see what can be done */ 3274 rcu_read_lock(); 3275 for (i=disks; i--; ) { 3276 struct md_rdev *rdev; 3277 sector_t first_bad; 3278 int bad_sectors; 3279 int is_bad = 0; 3280 3281 dev = &sh->dev[i]; 3282 3283 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3284 i, dev->flags, 3285 dev->toread, dev->towrite, dev->written); 3286 /* maybe we can reply to a read 3287 * 3288 * new wantfill requests are only permitted while 3289 * ops_complete_biofill is guaranteed to be inactive 3290 */ 3291 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3292 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3293 set_bit(R5_Wantfill, &dev->flags); 3294 3295 /* now count some things */ 3296 if (test_bit(R5_LOCKED, &dev->flags)) 3297 s->locked++; 3298 if (test_bit(R5_UPTODATE, &dev->flags)) 3299 s->uptodate++; 3300 if (test_bit(R5_Wantcompute, &dev->flags)) { 3301 s->compute++; 3302 BUG_ON(s->compute > 2); 3303 } 3304 3305 if (test_bit(R5_Wantfill, &dev->flags)) 3306 s->to_fill++; 3307 else if (dev->toread) 3308 s->to_read++; 3309 if (dev->towrite) { 3310 s->to_write++; 3311 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3312 s->non_overwrite++; 3313 } 3314 if (dev->written) 3315 s->written++; 3316 /* Prefer to use the replacement for reads, but only 3317 * if it is recovered enough and has no bad blocks. 3318 */ 3319 rdev = rcu_dereference(conf->disks[i].replacement); 3320 if (rdev && !test_bit(Faulty, &rdev->flags) && 3321 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3322 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3323 &first_bad, &bad_sectors)) 3324 set_bit(R5_ReadRepl, &dev->flags); 3325 else { 3326 if (rdev) 3327 set_bit(R5_NeedReplace, &dev->flags); 3328 rdev = rcu_dereference(conf->disks[i].rdev); 3329 clear_bit(R5_ReadRepl, &dev->flags); 3330 } 3331 if (rdev && test_bit(Faulty, &rdev->flags)) 3332 rdev = NULL; 3333 if (rdev) { 3334 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3335 &first_bad, &bad_sectors); 3336 if (s->blocked_rdev == NULL 3337 && (test_bit(Blocked, &rdev->flags) 3338 || is_bad < 0)) { 3339 if (is_bad < 0) 3340 set_bit(BlockedBadBlocks, 3341 &rdev->flags); 3342 s->blocked_rdev = rdev; 3343 atomic_inc(&rdev->nr_pending); 3344 } 3345 } 3346 clear_bit(R5_Insync, &dev->flags); 3347 if (!rdev) 3348 /* Not in-sync */; 3349 else if (is_bad) { 3350 /* also not in-sync */ 3351 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3352 test_bit(R5_UPTODATE, &dev->flags)) { 3353 /* treat as in-sync, but with a read error 3354 * which we can now try to correct 3355 */ 3356 set_bit(R5_Insync, &dev->flags); 3357 set_bit(R5_ReadError, &dev->flags); 3358 } 3359 } else if (test_bit(In_sync, &rdev->flags)) 3360 set_bit(R5_Insync, &dev->flags); 3361 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3362 /* in sync if before recovery_offset */ 3363 set_bit(R5_Insync, &dev->flags); 3364 else if (test_bit(R5_UPTODATE, &dev->flags) && 3365 test_bit(R5_Expanded, &dev->flags)) 3366 /* If we've reshaped into here, we assume it is Insync. 3367 * We will shortly update recovery_offset to make 3368 * it official. 3369 */ 3370 set_bit(R5_Insync, &dev->flags); 3371 3372 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3373 /* This flag does not apply to '.replacement' 3374 * only to .rdev, so make sure to check that*/ 3375 struct md_rdev *rdev2 = rcu_dereference( 3376 conf->disks[i].rdev); 3377 if (rdev2 == rdev) 3378 clear_bit(R5_Insync, &dev->flags); 3379 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3380 s->handle_bad_blocks = 1; 3381 atomic_inc(&rdev2->nr_pending); 3382 } else 3383 clear_bit(R5_WriteError, &dev->flags); 3384 } 3385 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3386 /* This flag does not apply to '.replacement' 3387 * only to .rdev, so make sure to check that*/ 3388 struct md_rdev *rdev2 = rcu_dereference( 3389 conf->disks[i].rdev); 3390 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3391 s->handle_bad_blocks = 1; 3392 atomic_inc(&rdev2->nr_pending); 3393 } else 3394 clear_bit(R5_MadeGood, &dev->flags); 3395 } 3396 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3397 struct md_rdev *rdev2 = rcu_dereference( 3398 conf->disks[i].replacement); 3399 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3400 s->handle_bad_blocks = 1; 3401 atomic_inc(&rdev2->nr_pending); 3402 } else 3403 clear_bit(R5_MadeGoodRepl, &dev->flags); 3404 } 3405 if (!test_bit(R5_Insync, &dev->flags)) { 3406 /* The ReadError flag will just be confusing now */ 3407 clear_bit(R5_ReadError, &dev->flags); 3408 clear_bit(R5_ReWrite, &dev->flags); 3409 } 3410 if (test_bit(R5_ReadError, &dev->flags)) 3411 clear_bit(R5_Insync, &dev->flags); 3412 if (!test_bit(R5_Insync, &dev->flags)) { 3413 if (s->failed < 2) 3414 s->failed_num[s->failed] = i; 3415 s->failed++; 3416 if (rdev && !test_bit(Faulty, &rdev->flags)) 3417 do_recovery = 1; 3418 } 3419 } 3420 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3421 /* If there is a failed device being replaced, 3422 * we must be recovering. 3423 * else if we are after recovery_cp, we must be syncing 3424 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3425 * else we can only be replacing 3426 * sync and recovery both need to read all devices, and so 3427 * use the same flag. 3428 */ 3429 if (do_recovery || 3430 sh->sector >= conf->mddev->recovery_cp || 3431 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3432 s->syncing = 1; 3433 else 3434 s->replacing = 1; 3435 } 3436 rcu_read_unlock(); 3437 } 3438 3439 static void handle_stripe(struct stripe_head *sh) 3440 { 3441 struct stripe_head_state s; 3442 struct r5conf *conf = sh->raid_conf; 3443 int i; 3444 int prexor; 3445 int disks = sh->disks; 3446 struct r5dev *pdev, *qdev; 3447 3448 clear_bit(STRIPE_HANDLE, &sh->state); 3449 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3450 /* already being handled, ensure it gets handled 3451 * again when current action finishes */ 3452 set_bit(STRIPE_HANDLE, &sh->state); 3453 return; 3454 } 3455 3456 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3457 spin_lock(&sh->stripe_lock); 3458 /* Cannot process 'sync' concurrently with 'discard' */ 3459 if (!test_bit(STRIPE_DISCARD, &sh->state) && 3460 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3461 set_bit(STRIPE_SYNCING, &sh->state); 3462 clear_bit(STRIPE_INSYNC, &sh->state); 3463 } 3464 spin_unlock(&sh->stripe_lock); 3465 } 3466 clear_bit(STRIPE_DELAYED, &sh->state); 3467 3468 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3469 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3470 (unsigned long long)sh->sector, sh->state, 3471 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3472 sh->check_state, sh->reconstruct_state); 3473 3474 analyse_stripe(sh, &s); 3475 3476 if (s.handle_bad_blocks) { 3477 set_bit(STRIPE_HANDLE, &sh->state); 3478 goto finish; 3479 } 3480 3481 if (unlikely(s.blocked_rdev)) { 3482 if (s.syncing || s.expanding || s.expanded || 3483 s.replacing || s.to_write || s.written) { 3484 set_bit(STRIPE_HANDLE, &sh->state); 3485 goto finish; 3486 } 3487 /* There is nothing for the blocked_rdev to block */ 3488 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3489 s.blocked_rdev = NULL; 3490 } 3491 3492 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3493 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3494 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3495 } 3496 3497 pr_debug("locked=%d uptodate=%d to_read=%d" 3498 " to_write=%d failed=%d failed_num=%d,%d\n", 3499 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3500 s.failed_num[0], s.failed_num[1]); 3501 /* check if the array has lost more than max_degraded devices and, 3502 * if so, some requests might need to be failed. 3503 */ 3504 if (s.failed > conf->max_degraded) { 3505 sh->check_state = 0; 3506 sh->reconstruct_state = 0; 3507 if (s.to_read+s.to_write+s.written) 3508 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3509 if (s.syncing + s.replacing) 3510 handle_failed_sync(conf, sh, &s); 3511 } 3512 3513 /* Now we check to see if any write operations have recently 3514 * completed 3515 */ 3516 prexor = 0; 3517 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3518 prexor = 1; 3519 if (sh->reconstruct_state == reconstruct_state_drain_result || 3520 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3521 sh->reconstruct_state = reconstruct_state_idle; 3522 3523 /* All the 'written' buffers and the parity block are ready to 3524 * be written back to disk 3525 */ 3526 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3527 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3528 BUG_ON(sh->qd_idx >= 0 && 3529 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3530 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3531 for (i = disks; i--; ) { 3532 struct r5dev *dev = &sh->dev[i]; 3533 if (test_bit(R5_LOCKED, &dev->flags) && 3534 (i == sh->pd_idx || i == sh->qd_idx || 3535 dev->written)) { 3536 pr_debug("Writing block %d\n", i); 3537 set_bit(R5_Wantwrite, &dev->flags); 3538 if (prexor) 3539 continue; 3540 if (!test_bit(R5_Insync, &dev->flags) || 3541 ((i == sh->pd_idx || i == sh->qd_idx) && 3542 s.failed == 0)) 3543 set_bit(STRIPE_INSYNC, &sh->state); 3544 } 3545 } 3546 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3547 s.dec_preread_active = 1; 3548 } 3549 3550 /* 3551 * might be able to return some write requests if the parity blocks 3552 * are safe, or on a failed drive 3553 */ 3554 pdev = &sh->dev[sh->pd_idx]; 3555 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3556 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3557 qdev = &sh->dev[sh->qd_idx]; 3558 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3559 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3560 || conf->level < 6; 3561 3562 if (s.written && 3563 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3564 && !test_bit(R5_LOCKED, &pdev->flags) 3565 && (test_bit(R5_UPTODATE, &pdev->flags) || 3566 test_bit(R5_Discard, &pdev->flags))))) && 3567 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3568 && !test_bit(R5_LOCKED, &qdev->flags) 3569 && (test_bit(R5_UPTODATE, &qdev->flags) || 3570 test_bit(R5_Discard, &qdev->flags)))))) 3571 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3572 3573 /* Now we might consider reading some blocks, either to check/generate 3574 * parity, or to satisfy requests 3575 * or to load a block that is being partially written. 3576 */ 3577 if (s.to_read || s.non_overwrite 3578 || (conf->level == 6 && s.to_write && s.failed) 3579 || (s.syncing && (s.uptodate + s.compute < disks)) 3580 || s.replacing 3581 || s.expanding) 3582 handle_stripe_fill(sh, &s, disks); 3583 3584 /* Now to consider new write requests and what else, if anything 3585 * should be read. We do not handle new writes when: 3586 * 1/ A 'write' operation (copy+xor) is already in flight. 3587 * 2/ A 'check' operation is in flight, as it may clobber the parity 3588 * block. 3589 */ 3590 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3591 handle_stripe_dirtying(conf, sh, &s, disks); 3592 3593 /* maybe we need to check and possibly fix the parity for this stripe 3594 * Any reads will already have been scheduled, so we just see if enough 3595 * data is available. The parity check is held off while parity 3596 * dependent operations are in flight. 3597 */ 3598 if (sh->check_state || 3599 (s.syncing && s.locked == 0 && 3600 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3601 !test_bit(STRIPE_INSYNC, &sh->state))) { 3602 if (conf->level == 6) 3603 handle_parity_checks6(conf, sh, &s, disks); 3604 else 3605 handle_parity_checks5(conf, sh, &s, disks); 3606 } 3607 3608 if (s.replacing && s.locked == 0 3609 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3610 /* Write out to replacement devices where possible */ 3611 for (i = 0; i < conf->raid_disks; i++) 3612 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3613 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3614 set_bit(R5_WantReplace, &sh->dev[i].flags); 3615 set_bit(R5_LOCKED, &sh->dev[i].flags); 3616 s.locked++; 3617 } 3618 set_bit(STRIPE_INSYNC, &sh->state); 3619 } 3620 if ((s.syncing || s.replacing) && s.locked == 0 && 3621 test_bit(STRIPE_INSYNC, &sh->state)) { 3622 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3623 clear_bit(STRIPE_SYNCING, &sh->state); 3624 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3625 wake_up(&conf->wait_for_overlap); 3626 } 3627 3628 /* If the failed drives are just a ReadError, then we might need 3629 * to progress the repair/check process 3630 */ 3631 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3632 for (i = 0; i < s.failed; i++) { 3633 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3634 if (test_bit(R5_ReadError, &dev->flags) 3635 && !test_bit(R5_LOCKED, &dev->flags) 3636 && test_bit(R5_UPTODATE, &dev->flags) 3637 ) { 3638 if (!test_bit(R5_ReWrite, &dev->flags)) { 3639 set_bit(R5_Wantwrite, &dev->flags); 3640 set_bit(R5_ReWrite, &dev->flags); 3641 set_bit(R5_LOCKED, &dev->flags); 3642 s.locked++; 3643 } else { 3644 /* let's read it back */ 3645 set_bit(R5_Wantread, &dev->flags); 3646 set_bit(R5_LOCKED, &dev->flags); 3647 s.locked++; 3648 } 3649 } 3650 } 3651 3652 3653 /* Finish reconstruct operations initiated by the expansion process */ 3654 if (sh->reconstruct_state == reconstruct_state_result) { 3655 struct stripe_head *sh_src 3656 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3657 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3658 /* sh cannot be written until sh_src has been read. 3659 * so arrange for sh to be delayed a little 3660 */ 3661 set_bit(STRIPE_DELAYED, &sh->state); 3662 set_bit(STRIPE_HANDLE, &sh->state); 3663 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3664 &sh_src->state)) 3665 atomic_inc(&conf->preread_active_stripes); 3666 release_stripe(sh_src); 3667 goto finish; 3668 } 3669 if (sh_src) 3670 release_stripe(sh_src); 3671 3672 sh->reconstruct_state = reconstruct_state_idle; 3673 clear_bit(STRIPE_EXPANDING, &sh->state); 3674 for (i = conf->raid_disks; i--; ) { 3675 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3676 set_bit(R5_LOCKED, &sh->dev[i].flags); 3677 s.locked++; 3678 } 3679 } 3680 3681 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3682 !sh->reconstruct_state) { 3683 /* Need to write out all blocks after computing parity */ 3684 sh->disks = conf->raid_disks; 3685 stripe_set_idx(sh->sector, conf, 0, sh); 3686 schedule_reconstruction(sh, &s, 1, 1); 3687 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3688 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3689 atomic_dec(&conf->reshape_stripes); 3690 wake_up(&conf->wait_for_overlap); 3691 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3692 } 3693 3694 if (s.expanding && s.locked == 0 && 3695 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3696 handle_stripe_expansion(conf, sh); 3697 3698 finish: 3699 /* wait for this device to become unblocked */ 3700 if (unlikely(s.blocked_rdev)) { 3701 if (conf->mddev->external) 3702 md_wait_for_blocked_rdev(s.blocked_rdev, 3703 conf->mddev); 3704 else 3705 /* Internal metadata will immediately 3706 * be written by raid5d, so we don't 3707 * need to wait here. 3708 */ 3709 rdev_dec_pending(s.blocked_rdev, 3710 conf->mddev); 3711 } 3712 3713 if (s.handle_bad_blocks) 3714 for (i = disks; i--; ) { 3715 struct md_rdev *rdev; 3716 struct r5dev *dev = &sh->dev[i]; 3717 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3718 /* We own a safe reference to the rdev */ 3719 rdev = conf->disks[i].rdev; 3720 if (!rdev_set_badblocks(rdev, sh->sector, 3721 STRIPE_SECTORS, 0)) 3722 md_error(conf->mddev, rdev); 3723 rdev_dec_pending(rdev, conf->mddev); 3724 } 3725 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3726 rdev = conf->disks[i].rdev; 3727 rdev_clear_badblocks(rdev, sh->sector, 3728 STRIPE_SECTORS, 0); 3729 rdev_dec_pending(rdev, conf->mddev); 3730 } 3731 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3732 rdev = conf->disks[i].replacement; 3733 if (!rdev) 3734 /* rdev have been moved down */ 3735 rdev = conf->disks[i].rdev; 3736 rdev_clear_badblocks(rdev, sh->sector, 3737 STRIPE_SECTORS, 0); 3738 rdev_dec_pending(rdev, conf->mddev); 3739 } 3740 } 3741 3742 if (s.ops_request) 3743 raid_run_ops(sh, s.ops_request); 3744 3745 ops_run_io(sh, &s); 3746 3747 if (s.dec_preread_active) { 3748 /* We delay this until after ops_run_io so that if make_request 3749 * is waiting on a flush, it won't continue until the writes 3750 * have actually been submitted. 3751 */ 3752 atomic_dec(&conf->preread_active_stripes); 3753 if (atomic_read(&conf->preread_active_stripes) < 3754 IO_THRESHOLD) 3755 md_wakeup_thread(conf->mddev->thread); 3756 } 3757 3758 return_io(s.return_bi); 3759 3760 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3761 } 3762 3763 static void raid5_activate_delayed(struct r5conf *conf) 3764 { 3765 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3766 while (!list_empty(&conf->delayed_list)) { 3767 struct list_head *l = conf->delayed_list.next; 3768 struct stripe_head *sh; 3769 sh = list_entry(l, struct stripe_head, lru); 3770 list_del_init(l); 3771 clear_bit(STRIPE_DELAYED, &sh->state); 3772 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3773 atomic_inc(&conf->preread_active_stripes); 3774 list_add_tail(&sh->lru, &conf->hold_list); 3775 } 3776 } 3777 } 3778 3779 static void activate_bit_delay(struct r5conf *conf) 3780 { 3781 /* device_lock is held */ 3782 struct list_head head; 3783 list_add(&head, &conf->bitmap_list); 3784 list_del_init(&conf->bitmap_list); 3785 while (!list_empty(&head)) { 3786 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3787 list_del_init(&sh->lru); 3788 atomic_inc(&sh->count); 3789 __release_stripe(conf, sh); 3790 } 3791 } 3792 3793 int md_raid5_congested(struct mddev *mddev, int bits) 3794 { 3795 struct r5conf *conf = mddev->private; 3796 3797 /* No difference between reads and writes. Just check 3798 * how busy the stripe_cache is 3799 */ 3800 3801 if (conf->inactive_blocked) 3802 return 1; 3803 if (conf->quiesce) 3804 return 1; 3805 if (list_empty_careful(&conf->inactive_list)) 3806 return 1; 3807 3808 return 0; 3809 } 3810 EXPORT_SYMBOL_GPL(md_raid5_congested); 3811 3812 static int raid5_congested(void *data, int bits) 3813 { 3814 struct mddev *mddev = data; 3815 3816 return mddev_congested(mddev, bits) || 3817 md_raid5_congested(mddev, bits); 3818 } 3819 3820 /* We want read requests to align with chunks where possible, 3821 * but write requests don't need to. 3822 */ 3823 static int raid5_mergeable_bvec(struct request_queue *q, 3824 struct bvec_merge_data *bvm, 3825 struct bio_vec *biovec) 3826 { 3827 struct mddev *mddev = q->queuedata; 3828 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3829 int max; 3830 unsigned int chunk_sectors = mddev->chunk_sectors; 3831 unsigned int bio_sectors = bvm->bi_size >> 9; 3832 3833 if ((bvm->bi_rw & 1) == WRITE) 3834 return biovec->bv_len; /* always allow writes to be mergeable */ 3835 3836 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3837 chunk_sectors = mddev->new_chunk_sectors; 3838 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3839 if (max < 0) max = 0; 3840 if (max <= biovec->bv_len && bio_sectors == 0) 3841 return biovec->bv_len; 3842 else 3843 return max; 3844 } 3845 3846 3847 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3848 { 3849 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3850 unsigned int chunk_sectors = mddev->chunk_sectors; 3851 unsigned int bio_sectors = bio_sectors(bio); 3852 3853 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3854 chunk_sectors = mddev->new_chunk_sectors; 3855 return chunk_sectors >= 3856 ((sector & (chunk_sectors - 1)) + bio_sectors); 3857 } 3858 3859 /* 3860 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3861 * later sampled by raid5d. 3862 */ 3863 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3864 { 3865 unsigned long flags; 3866 3867 spin_lock_irqsave(&conf->device_lock, flags); 3868 3869 bi->bi_next = conf->retry_read_aligned_list; 3870 conf->retry_read_aligned_list = bi; 3871 3872 spin_unlock_irqrestore(&conf->device_lock, flags); 3873 md_wakeup_thread(conf->mddev->thread); 3874 } 3875 3876 3877 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3878 { 3879 struct bio *bi; 3880 3881 bi = conf->retry_read_aligned; 3882 if (bi) { 3883 conf->retry_read_aligned = NULL; 3884 return bi; 3885 } 3886 bi = conf->retry_read_aligned_list; 3887 if(bi) { 3888 conf->retry_read_aligned_list = bi->bi_next; 3889 bi->bi_next = NULL; 3890 /* 3891 * this sets the active strip count to 1 and the processed 3892 * strip count to zero (upper 8 bits) 3893 */ 3894 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3895 } 3896 3897 return bi; 3898 } 3899 3900 3901 /* 3902 * The "raid5_align_endio" should check if the read succeeded and if it 3903 * did, call bio_endio on the original bio (having bio_put the new bio 3904 * first). 3905 * If the read failed.. 3906 */ 3907 static void raid5_align_endio(struct bio *bi, int error) 3908 { 3909 struct bio* raid_bi = bi->bi_private; 3910 struct mddev *mddev; 3911 struct r5conf *conf; 3912 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3913 struct md_rdev *rdev; 3914 3915 bio_put(bi); 3916 3917 rdev = (void*)raid_bi->bi_next; 3918 raid_bi->bi_next = NULL; 3919 mddev = rdev->mddev; 3920 conf = mddev->private; 3921 3922 rdev_dec_pending(rdev, conf->mddev); 3923 3924 if (!error && uptodate) { 3925 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 3926 raid_bi, 0); 3927 bio_endio(raid_bi, 0); 3928 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3929 wake_up(&conf->wait_for_stripe); 3930 return; 3931 } 3932 3933 3934 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3935 3936 add_bio_to_retry(raid_bi, conf); 3937 } 3938 3939 static int bio_fits_rdev(struct bio *bi) 3940 { 3941 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3942 3943 if (bio_sectors(bi) > queue_max_sectors(q)) 3944 return 0; 3945 blk_recount_segments(q, bi); 3946 if (bi->bi_phys_segments > queue_max_segments(q)) 3947 return 0; 3948 3949 if (q->merge_bvec_fn) 3950 /* it's too hard to apply the merge_bvec_fn at this stage, 3951 * just just give up 3952 */ 3953 return 0; 3954 3955 return 1; 3956 } 3957 3958 3959 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3960 { 3961 struct r5conf *conf = mddev->private; 3962 int dd_idx; 3963 struct bio* align_bi; 3964 struct md_rdev *rdev; 3965 sector_t end_sector; 3966 3967 if (!in_chunk_boundary(mddev, raid_bio)) { 3968 pr_debug("chunk_aligned_read : non aligned\n"); 3969 return 0; 3970 } 3971 /* 3972 * use bio_clone_mddev to make a copy of the bio 3973 */ 3974 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3975 if (!align_bi) 3976 return 0; 3977 /* 3978 * set bi_end_io to a new function, and set bi_private to the 3979 * original bio. 3980 */ 3981 align_bi->bi_end_io = raid5_align_endio; 3982 align_bi->bi_private = raid_bio; 3983 /* 3984 * compute position 3985 */ 3986 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3987 0, 3988 &dd_idx, NULL); 3989 3990 end_sector = bio_end_sector(align_bi); 3991 rcu_read_lock(); 3992 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3993 if (!rdev || test_bit(Faulty, &rdev->flags) || 3994 rdev->recovery_offset < end_sector) { 3995 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3996 if (rdev && 3997 (test_bit(Faulty, &rdev->flags) || 3998 !(test_bit(In_sync, &rdev->flags) || 3999 rdev->recovery_offset >= end_sector))) 4000 rdev = NULL; 4001 } 4002 if (rdev) { 4003 sector_t first_bad; 4004 int bad_sectors; 4005 4006 atomic_inc(&rdev->nr_pending); 4007 rcu_read_unlock(); 4008 raid_bio->bi_next = (void*)rdev; 4009 align_bi->bi_bdev = rdev->bdev; 4010 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4011 4012 if (!bio_fits_rdev(align_bi) || 4013 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4014 &first_bad, &bad_sectors)) { 4015 /* too big in some way, or has a known bad block */ 4016 bio_put(align_bi); 4017 rdev_dec_pending(rdev, mddev); 4018 return 0; 4019 } 4020 4021 /* No reshape active, so we can trust rdev->data_offset */ 4022 align_bi->bi_sector += rdev->data_offset; 4023 4024 spin_lock_irq(&conf->device_lock); 4025 wait_event_lock_irq(conf->wait_for_stripe, 4026 conf->quiesce == 0, 4027 conf->device_lock); 4028 atomic_inc(&conf->active_aligned_reads); 4029 spin_unlock_irq(&conf->device_lock); 4030 4031 if (mddev->gendisk) 4032 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4033 align_bi, disk_devt(mddev->gendisk), 4034 raid_bio->bi_sector); 4035 generic_make_request(align_bi); 4036 return 1; 4037 } else { 4038 rcu_read_unlock(); 4039 bio_put(align_bi); 4040 return 0; 4041 } 4042 } 4043 4044 /* __get_priority_stripe - get the next stripe to process 4045 * 4046 * Full stripe writes are allowed to pass preread active stripes up until 4047 * the bypass_threshold is exceeded. In general the bypass_count 4048 * increments when the handle_list is handled before the hold_list; however, it 4049 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4050 * stripe with in flight i/o. The bypass_count will be reset when the 4051 * head of the hold_list has changed, i.e. the head was promoted to the 4052 * handle_list. 4053 */ 4054 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4055 { 4056 struct stripe_head *sh; 4057 4058 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4059 __func__, 4060 list_empty(&conf->handle_list) ? "empty" : "busy", 4061 list_empty(&conf->hold_list) ? "empty" : "busy", 4062 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4063 4064 if (!list_empty(&conf->handle_list)) { 4065 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4066 4067 if (list_empty(&conf->hold_list)) 4068 conf->bypass_count = 0; 4069 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4070 if (conf->hold_list.next == conf->last_hold) 4071 conf->bypass_count++; 4072 else { 4073 conf->last_hold = conf->hold_list.next; 4074 conf->bypass_count -= conf->bypass_threshold; 4075 if (conf->bypass_count < 0) 4076 conf->bypass_count = 0; 4077 } 4078 } 4079 } else if (!list_empty(&conf->hold_list) && 4080 ((conf->bypass_threshold && 4081 conf->bypass_count > conf->bypass_threshold) || 4082 atomic_read(&conf->pending_full_writes) == 0)) { 4083 sh = list_entry(conf->hold_list.next, 4084 typeof(*sh), lru); 4085 conf->bypass_count -= conf->bypass_threshold; 4086 if (conf->bypass_count < 0) 4087 conf->bypass_count = 0; 4088 } else 4089 return NULL; 4090 4091 list_del_init(&sh->lru); 4092 atomic_inc(&sh->count); 4093 BUG_ON(atomic_read(&sh->count) != 1); 4094 return sh; 4095 } 4096 4097 struct raid5_plug_cb { 4098 struct blk_plug_cb cb; 4099 struct list_head list; 4100 }; 4101 4102 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4103 { 4104 struct raid5_plug_cb *cb = container_of( 4105 blk_cb, struct raid5_plug_cb, cb); 4106 struct stripe_head *sh; 4107 struct mddev *mddev = cb->cb.data; 4108 struct r5conf *conf = mddev->private; 4109 int cnt = 0; 4110 4111 if (cb->list.next && !list_empty(&cb->list)) { 4112 spin_lock_irq(&conf->device_lock); 4113 while (!list_empty(&cb->list)) { 4114 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4115 list_del_init(&sh->lru); 4116 /* 4117 * avoid race release_stripe_plug() sees 4118 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4119 * is still in our list 4120 */ 4121 smp_mb__before_clear_bit(); 4122 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4123 __release_stripe(conf, sh); 4124 cnt++; 4125 } 4126 spin_unlock_irq(&conf->device_lock); 4127 } 4128 if (mddev->queue) 4129 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4130 kfree(cb); 4131 } 4132 4133 static void release_stripe_plug(struct mddev *mddev, 4134 struct stripe_head *sh) 4135 { 4136 struct blk_plug_cb *blk_cb = blk_check_plugged( 4137 raid5_unplug, mddev, 4138 sizeof(struct raid5_plug_cb)); 4139 struct raid5_plug_cb *cb; 4140 4141 if (!blk_cb) { 4142 release_stripe(sh); 4143 return; 4144 } 4145 4146 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4147 4148 if (cb->list.next == NULL) 4149 INIT_LIST_HEAD(&cb->list); 4150 4151 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4152 list_add_tail(&sh->lru, &cb->list); 4153 else 4154 release_stripe(sh); 4155 } 4156 4157 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4158 { 4159 struct r5conf *conf = mddev->private; 4160 sector_t logical_sector, last_sector; 4161 struct stripe_head *sh; 4162 int remaining; 4163 int stripe_sectors; 4164 4165 if (mddev->reshape_position != MaxSector) 4166 /* Skip discard while reshape is happening */ 4167 return; 4168 4169 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4170 last_sector = bi->bi_sector + (bi->bi_size>>9); 4171 4172 bi->bi_next = NULL; 4173 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4174 4175 stripe_sectors = conf->chunk_sectors * 4176 (conf->raid_disks - conf->max_degraded); 4177 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4178 stripe_sectors); 4179 sector_div(last_sector, stripe_sectors); 4180 4181 logical_sector *= conf->chunk_sectors; 4182 last_sector *= conf->chunk_sectors; 4183 4184 for (; logical_sector < last_sector; 4185 logical_sector += STRIPE_SECTORS) { 4186 DEFINE_WAIT(w); 4187 int d; 4188 again: 4189 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4190 prepare_to_wait(&conf->wait_for_overlap, &w, 4191 TASK_UNINTERRUPTIBLE); 4192 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4193 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4194 release_stripe(sh); 4195 schedule(); 4196 goto again; 4197 } 4198 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4199 spin_lock_irq(&sh->stripe_lock); 4200 for (d = 0; d < conf->raid_disks; d++) { 4201 if (d == sh->pd_idx || d == sh->qd_idx) 4202 continue; 4203 if (sh->dev[d].towrite || sh->dev[d].toread) { 4204 set_bit(R5_Overlap, &sh->dev[d].flags); 4205 spin_unlock_irq(&sh->stripe_lock); 4206 release_stripe(sh); 4207 schedule(); 4208 goto again; 4209 } 4210 } 4211 set_bit(STRIPE_DISCARD, &sh->state); 4212 finish_wait(&conf->wait_for_overlap, &w); 4213 for (d = 0; d < conf->raid_disks; d++) { 4214 if (d == sh->pd_idx || d == sh->qd_idx) 4215 continue; 4216 sh->dev[d].towrite = bi; 4217 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4218 raid5_inc_bi_active_stripes(bi); 4219 } 4220 spin_unlock_irq(&sh->stripe_lock); 4221 if (conf->mddev->bitmap) { 4222 for (d = 0; 4223 d < conf->raid_disks - conf->max_degraded; 4224 d++) 4225 bitmap_startwrite(mddev->bitmap, 4226 sh->sector, 4227 STRIPE_SECTORS, 4228 0); 4229 sh->bm_seq = conf->seq_flush + 1; 4230 set_bit(STRIPE_BIT_DELAY, &sh->state); 4231 } 4232 4233 set_bit(STRIPE_HANDLE, &sh->state); 4234 clear_bit(STRIPE_DELAYED, &sh->state); 4235 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4236 atomic_inc(&conf->preread_active_stripes); 4237 release_stripe_plug(mddev, sh); 4238 } 4239 4240 remaining = raid5_dec_bi_active_stripes(bi); 4241 if (remaining == 0) { 4242 md_write_end(mddev); 4243 bio_endio(bi, 0); 4244 } 4245 } 4246 4247 static void make_request(struct mddev *mddev, struct bio * bi) 4248 { 4249 struct r5conf *conf = mddev->private; 4250 int dd_idx; 4251 sector_t new_sector; 4252 sector_t logical_sector, last_sector; 4253 struct stripe_head *sh; 4254 const int rw = bio_data_dir(bi); 4255 int remaining; 4256 4257 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4258 md_flush_request(mddev, bi); 4259 return; 4260 } 4261 4262 md_write_start(mddev, bi); 4263 4264 if (rw == READ && 4265 mddev->reshape_position == MaxSector && 4266 chunk_aligned_read(mddev,bi)) 4267 return; 4268 4269 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4270 make_discard_request(mddev, bi); 4271 return; 4272 } 4273 4274 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4275 last_sector = bio_end_sector(bi); 4276 bi->bi_next = NULL; 4277 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4278 4279 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4280 DEFINE_WAIT(w); 4281 int previous; 4282 4283 retry: 4284 previous = 0; 4285 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4286 if (unlikely(conf->reshape_progress != MaxSector)) { 4287 /* spinlock is needed as reshape_progress may be 4288 * 64bit on a 32bit platform, and so it might be 4289 * possible to see a half-updated value 4290 * Of course reshape_progress could change after 4291 * the lock is dropped, so once we get a reference 4292 * to the stripe that we think it is, we will have 4293 * to check again. 4294 */ 4295 spin_lock_irq(&conf->device_lock); 4296 if (mddev->reshape_backwards 4297 ? logical_sector < conf->reshape_progress 4298 : logical_sector >= conf->reshape_progress) { 4299 previous = 1; 4300 } else { 4301 if (mddev->reshape_backwards 4302 ? logical_sector < conf->reshape_safe 4303 : logical_sector >= conf->reshape_safe) { 4304 spin_unlock_irq(&conf->device_lock); 4305 schedule(); 4306 goto retry; 4307 } 4308 } 4309 spin_unlock_irq(&conf->device_lock); 4310 } 4311 4312 new_sector = raid5_compute_sector(conf, logical_sector, 4313 previous, 4314 &dd_idx, NULL); 4315 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4316 (unsigned long long)new_sector, 4317 (unsigned long long)logical_sector); 4318 4319 sh = get_active_stripe(conf, new_sector, previous, 4320 (bi->bi_rw&RWA_MASK), 0); 4321 if (sh) { 4322 if (unlikely(previous)) { 4323 /* expansion might have moved on while waiting for a 4324 * stripe, so we must do the range check again. 4325 * Expansion could still move past after this 4326 * test, but as we are holding a reference to 4327 * 'sh', we know that if that happens, 4328 * STRIPE_EXPANDING will get set and the expansion 4329 * won't proceed until we finish with the stripe. 4330 */ 4331 int must_retry = 0; 4332 spin_lock_irq(&conf->device_lock); 4333 if (mddev->reshape_backwards 4334 ? logical_sector >= conf->reshape_progress 4335 : logical_sector < conf->reshape_progress) 4336 /* mismatch, need to try again */ 4337 must_retry = 1; 4338 spin_unlock_irq(&conf->device_lock); 4339 if (must_retry) { 4340 release_stripe(sh); 4341 schedule(); 4342 goto retry; 4343 } 4344 } 4345 4346 if (rw == WRITE && 4347 logical_sector >= mddev->suspend_lo && 4348 logical_sector < mddev->suspend_hi) { 4349 release_stripe(sh); 4350 /* As the suspend_* range is controlled by 4351 * userspace, we want an interruptible 4352 * wait. 4353 */ 4354 flush_signals(current); 4355 prepare_to_wait(&conf->wait_for_overlap, 4356 &w, TASK_INTERRUPTIBLE); 4357 if (logical_sector >= mddev->suspend_lo && 4358 logical_sector < mddev->suspend_hi) 4359 schedule(); 4360 goto retry; 4361 } 4362 4363 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4364 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4365 /* Stripe is busy expanding or 4366 * add failed due to overlap. Flush everything 4367 * and wait a while 4368 */ 4369 md_wakeup_thread(mddev->thread); 4370 release_stripe(sh); 4371 schedule(); 4372 goto retry; 4373 } 4374 finish_wait(&conf->wait_for_overlap, &w); 4375 set_bit(STRIPE_HANDLE, &sh->state); 4376 clear_bit(STRIPE_DELAYED, &sh->state); 4377 if ((bi->bi_rw & REQ_SYNC) && 4378 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4379 atomic_inc(&conf->preread_active_stripes); 4380 release_stripe_plug(mddev, sh); 4381 } else { 4382 /* cannot get stripe for read-ahead, just give-up */ 4383 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4384 finish_wait(&conf->wait_for_overlap, &w); 4385 break; 4386 } 4387 } 4388 4389 remaining = raid5_dec_bi_active_stripes(bi); 4390 if (remaining == 0) { 4391 4392 if ( rw == WRITE ) 4393 md_write_end(mddev); 4394 4395 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4396 bi, 0); 4397 bio_endio(bi, 0); 4398 } 4399 } 4400 4401 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4402 4403 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4404 { 4405 /* reshaping is quite different to recovery/resync so it is 4406 * handled quite separately ... here. 4407 * 4408 * On each call to sync_request, we gather one chunk worth of 4409 * destination stripes and flag them as expanding. 4410 * Then we find all the source stripes and request reads. 4411 * As the reads complete, handle_stripe will copy the data 4412 * into the destination stripe and release that stripe. 4413 */ 4414 struct r5conf *conf = mddev->private; 4415 struct stripe_head *sh; 4416 sector_t first_sector, last_sector; 4417 int raid_disks = conf->previous_raid_disks; 4418 int data_disks = raid_disks - conf->max_degraded; 4419 int new_data_disks = conf->raid_disks - conf->max_degraded; 4420 int i; 4421 int dd_idx; 4422 sector_t writepos, readpos, safepos; 4423 sector_t stripe_addr; 4424 int reshape_sectors; 4425 struct list_head stripes; 4426 4427 if (sector_nr == 0) { 4428 /* If restarting in the middle, skip the initial sectors */ 4429 if (mddev->reshape_backwards && 4430 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4431 sector_nr = raid5_size(mddev, 0, 0) 4432 - conf->reshape_progress; 4433 } else if (!mddev->reshape_backwards && 4434 conf->reshape_progress > 0) 4435 sector_nr = conf->reshape_progress; 4436 sector_div(sector_nr, new_data_disks); 4437 if (sector_nr) { 4438 mddev->curr_resync_completed = sector_nr; 4439 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4440 *skipped = 1; 4441 return sector_nr; 4442 } 4443 } 4444 4445 /* We need to process a full chunk at a time. 4446 * If old and new chunk sizes differ, we need to process the 4447 * largest of these 4448 */ 4449 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4450 reshape_sectors = mddev->new_chunk_sectors; 4451 else 4452 reshape_sectors = mddev->chunk_sectors; 4453 4454 /* We update the metadata at least every 10 seconds, or when 4455 * the data about to be copied would over-write the source of 4456 * the data at the front of the range. i.e. one new_stripe 4457 * along from reshape_progress new_maps to after where 4458 * reshape_safe old_maps to 4459 */ 4460 writepos = conf->reshape_progress; 4461 sector_div(writepos, new_data_disks); 4462 readpos = conf->reshape_progress; 4463 sector_div(readpos, data_disks); 4464 safepos = conf->reshape_safe; 4465 sector_div(safepos, data_disks); 4466 if (mddev->reshape_backwards) { 4467 writepos -= min_t(sector_t, reshape_sectors, writepos); 4468 readpos += reshape_sectors; 4469 safepos += reshape_sectors; 4470 } else { 4471 writepos += reshape_sectors; 4472 readpos -= min_t(sector_t, reshape_sectors, readpos); 4473 safepos -= min_t(sector_t, reshape_sectors, safepos); 4474 } 4475 4476 /* Having calculated the 'writepos' possibly use it 4477 * to set 'stripe_addr' which is where we will write to. 4478 */ 4479 if (mddev->reshape_backwards) { 4480 BUG_ON(conf->reshape_progress == 0); 4481 stripe_addr = writepos; 4482 BUG_ON((mddev->dev_sectors & 4483 ~((sector_t)reshape_sectors - 1)) 4484 - reshape_sectors - stripe_addr 4485 != sector_nr); 4486 } else { 4487 BUG_ON(writepos != sector_nr + reshape_sectors); 4488 stripe_addr = sector_nr; 4489 } 4490 4491 /* 'writepos' is the most advanced device address we might write. 4492 * 'readpos' is the least advanced device address we might read. 4493 * 'safepos' is the least address recorded in the metadata as having 4494 * been reshaped. 4495 * If there is a min_offset_diff, these are adjusted either by 4496 * increasing the safepos/readpos if diff is negative, or 4497 * increasing writepos if diff is positive. 4498 * If 'readpos' is then behind 'writepos', there is no way that we can 4499 * ensure safety in the face of a crash - that must be done by userspace 4500 * making a backup of the data. So in that case there is no particular 4501 * rush to update metadata. 4502 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4503 * update the metadata to advance 'safepos' to match 'readpos' so that 4504 * we can be safe in the event of a crash. 4505 * So we insist on updating metadata if safepos is behind writepos and 4506 * readpos is beyond writepos. 4507 * In any case, update the metadata every 10 seconds. 4508 * Maybe that number should be configurable, but I'm not sure it is 4509 * worth it.... maybe it could be a multiple of safemode_delay??? 4510 */ 4511 if (conf->min_offset_diff < 0) { 4512 safepos += -conf->min_offset_diff; 4513 readpos += -conf->min_offset_diff; 4514 } else 4515 writepos += conf->min_offset_diff; 4516 4517 if ((mddev->reshape_backwards 4518 ? (safepos > writepos && readpos < writepos) 4519 : (safepos < writepos && readpos > writepos)) || 4520 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4521 /* Cannot proceed until we've updated the superblock... */ 4522 wait_event(conf->wait_for_overlap, 4523 atomic_read(&conf->reshape_stripes)==0); 4524 mddev->reshape_position = conf->reshape_progress; 4525 mddev->curr_resync_completed = sector_nr; 4526 conf->reshape_checkpoint = jiffies; 4527 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4528 md_wakeup_thread(mddev->thread); 4529 wait_event(mddev->sb_wait, mddev->flags == 0 || 4530 kthread_should_stop()); 4531 spin_lock_irq(&conf->device_lock); 4532 conf->reshape_safe = mddev->reshape_position; 4533 spin_unlock_irq(&conf->device_lock); 4534 wake_up(&conf->wait_for_overlap); 4535 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4536 } 4537 4538 INIT_LIST_HEAD(&stripes); 4539 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4540 int j; 4541 int skipped_disk = 0; 4542 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4543 set_bit(STRIPE_EXPANDING, &sh->state); 4544 atomic_inc(&conf->reshape_stripes); 4545 /* If any of this stripe is beyond the end of the old 4546 * array, then we need to zero those blocks 4547 */ 4548 for (j=sh->disks; j--;) { 4549 sector_t s; 4550 if (j == sh->pd_idx) 4551 continue; 4552 if (conf->level == 6 && 4553 j == sh->qd_idx) 4554 continue; 4555 s = compute_blocknr(sh, j, 0); 4556 if (s < raid5_size(mddev, 0, 0)) { 4557 skipped_disk = 1; 4558 continue; 4559 } 4560 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4561 set_bit(R5_Expanded, &sh->dev[j].flags); 4562 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4563 } 4564 if (!skipped_disk) { 4565 set_bit(STRIPE_EXPAND_READY, &sh->state); 4566 set_bit(STRIPE_HANDLE, &sh->state); 4567 } 4568 list_add(&sh->lru, &stripes); 4569 } 4570 spin_lock_irq(&conf->device_lock); 4571 if (mddev->reshape_backwards) 4572 conf->reshape_progress -= reshape_sectors * new_data_disks; 4573 else 4574 conf->reshape_progress += reshape_sectors * new_data_disks; 4575 spin_unlock_irq(&conf->device_lock); 4576 /* Ok, those stripe are ready. We can start scheduling 4577 * reads on the source stripes. 4578 * The source stripes are determined by mapping the first and last 4579 * block on the destination stripes. 4580 */ 4581 first_sector = 4582 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4583 1, &dd_idx, NULL); 4584 last_sector = 4585 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4586 * new_data_disks - 1), 4587 1, &dd_idx, NULL); 4588 if (last_sector >= mddev->dev_sectors) 4589 last_sector = mddev->dev_sectors - 1; 4590 while (first_sector <= last_sector) { 4591 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4592 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4593 set_bit(STRIPE_HANDLE, &sh->state); 4594 release_stripe(sh); 4595 first_sector += STRIPE_SECTORS; 4596 } 4597 /* Now that the sources are clearly marked, we can release 4598 * the destination stripes 4599 */ 4600 while (!list_empty(&stripes)) { 4601 sh = list_entry(stripes.next, struct stripe_head, lru); 4602 list_del_init(&sh->lru); 4603 release_stripe(sh); 4604 } 4605 /* If this takes us to the resync_max point where we have to pause, 4606 * then we need to write out the superblock. 4607 */ 4608 sector_nr += reshape_sectors; 4609 if ((sector_nr - mddev->curr_resync_completed) * 2 4610 >= mddev->resync_max - mddev->curr_resync_completed) { 4611 /* Cannot proceed until we've updated the superblock... */ 4612 wait_event(conf->wait_for_overlap, 4613 atomic_read(&conf->reshape_stripes) == 0); 4614 mddev->reshape_position = conf->reshape_progress; 4615 mddev->curr_resync_completed = sector_nr; 4616 conf->reshape_checkpoint = jiffies; 4617 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4618 md_wakeup_thread(mddev->thread); 4619 wait_event(mddev->sb_wait, 4620 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4621 || kthread_should_stop()); 4622 spin_lock_irq(&conf->device_lock); 4623 conf->reshape_safe = mddev->reshape_position; 4624 spin_unlock_irq(&conf->device_lock); 4625 wake_up(&conf->wait_for_overlap); 4626 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4627 } 4628 return reshape_sectors; 4629 } 4630 4631 /* FIXME go_faster isn't used */ 4632 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4633 { 4634 struct r5conf *conf = mddev->private; 4635 struct stripe_head *sh; 4636 sector_t max_sector = mddev->dev_sectors; 4637 sector_t sync_blocks; 4638 int still_degraded = 0; 4639 int i; 4640 4641 if (sector_nr >= max_sector) { 4642 /* just being told to finish up .. nothing much to do */ 4643 4644 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4645 end_reshape(conf); 4646 return 0; 4647 } 4648 4649 if (mddev->curr_resync < max_sector) /* aborted */ 4650 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4651 &sync_blocks, 1); 4652 else /* completed sync */ 4653 conf->fullsync = 0; 4654 bitmap_close_sync(mddev->bitmap); 4655 4656 return 0; 4657 } 4658 4659 /* Allow raid5_quiesce to complete */ 4660 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4661 4662 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4663 return reshape_request(mddev, sector_nr, skipped); 4664 4665 /* No need to check resync_max as we never do more than one 4666 * stripe, and as resync_max will always be on a chunk boundary, 4667 * if the check in md_do_sync didn't fire, there is no chance 4668 * of overstepping resync_max here 4669 */ 4670 4671 /* if there is too many failed drives and we are trying 4672 * to resync, then assert that we are finished, because there is 4673 * nothing we can do. 4674 */ 4675 if (mddev->degraded >= conf->max_degraded && 4676 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4677 sector_t rv = mddev->dev_sectors - sector_nr; 4678 *skipped = 1; 4679 return rv; 4680 } 4681 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4682 !conf->fullsync && 4683 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4684 sync_blocks >= STRIPE_SECTORS) { 4685 /* we can skip this block, and probably more */ 4686 sync_blocks /= STRIPE_SECTORS; 4687 *skipped = 1; 4688 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4689 } 4690 4691 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4692 4693 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4694 if (sh == NULL) { 4695 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4696 /* make sure we don't swamp the stripe cache if someone else 4697 * is trying to get access 4698 */ 4699 schedule_timeout_uninterruptible(1); 4700 } 4701 /* Need to check if array will still be degraded after recovery/resync 4702 * We don't need to check the 'failed' flag as when that gets set, 4703 * recovery aborts. 4704 */ 4705 for (i = 0; i < conf->raid_disks; i++) 4706 if (conf->disks[i].rdev == NULL) 4707 still_degraded = 1; 4708 4709 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4710 4711 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4712 4713 handle_stripe(sh); 4714 release_stripe(sh); 4715 4716 return STRIPE_SECTORS; 4717 } 4718 4719 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4720 { 4721 /* We may not be able to submit a whole bio at once as there 4722 * may not be enough stripe_heads available. 4723 * We cannot pre-allocate enough stripe_heads as we may need 4724 * more than exist in the cache (if we allow ever large chunks). 4725 * So we do one stripe head at a time and record in 4726 * ->bi_hw_segments how many have been done. 4727 * 4728 * We *know* that this entire raid_bio is in one chunk, so 4729 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4730 */ 4731 struct stripe_head *sh; 4732 int dd_idx; 4733 sector_t sector, logical_sector, last_sector; 4734 int scnt = 0; 4735 int remaining; 4736 int handled = 0; 4737 4738 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4739 sector = raid5_compute_sector(conf, logical_sector, 4740 0, &dd_idx, NULL); 4741 last_sector = bio_end_sector(raid_bio); 4742 4743 for (; logical_sector < last_sector; 4744 logical_sector += STRIPE_SECTORS, 4745 sector += STRIPE_SECTORS, 4746 scnt++) { 4747 4748 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4749 /* already done this stripe */ 4750 continue; 4751 4752 sh = get_active_stripe(conf, sector, 0, 1, 0); 4753 4754 if (!sh) { 4755 /* failed to get a stripe - must wait */ 4756 raid5_set_bi_processed_stripes(raid_bio, scnt); 4757 conf->retry_read_aligned = raid_bio; 4758 return handled; 4759 } 4760 4761 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4762 release_stripe(sh); 4763 raid5_set_bi_processed_stripes(raid_bio, scnt); 4764 conf->retry_read_aligned = raid_bio; 4765 return handled; 4766 } 4767 4768 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4769 handle_stripe(sh); 4770 release_stripe(sh); 4771 handled++; 4772 } 4773 remaining = raid5_dec_bi_active_stripes(raid_bio); 4774 if (remaining == 0) { 4775 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4776 raid_bio, 0); 4777 bio_endio(raid_bio, 0); 4778 } 4779 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4780 wake_up(&conf->wait_for_stripe); 4781 return handled; 4782 } 4783 4784 #define MAX_STRIPE_BATCH 8 4785 static int handle_active_stripes(struct r5conf *conf) 4786 { 4787 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4788 int i, batch_size = 0; 4789 4790 while (batch_size < MAX_STRIPE_BATCH && 4791 (sh = __get_priority_stripe(conf)) != NULL) 4792 batch[batch_size++] = sh; 4793 4794 if (batch_size == 0) 4795 return batch_size; 4796 spin_unlock_irq(&conf->device_lock); 4797 4798 for (i = 0; i < batch_size; i++) 4799 handle_stripe(batch[i]); 4800 4801 cond_resched(); 4802 4803 spin_lock_irq(&conf->device_lock); 4804 for (i = 0; i < batch_size; i++) 4805 __release_stripe(conf, batch[i]); 4806 return batch_size; 4807 } 4808 4809 /* 4810 * This is our raid5 kernel thread. 4811 * 4812 * We scan the hash table for stripes which can be handled now. 4813 * During the scan, completed stripes are saved for us by the interrupt 4814 * handler, so that they will not have to wait for our next wakeup. 4815 */ 4816 static void raid5d(struct md_thread *thread) 4817 { 4818 struct mddev *mddev = thread->mddev; 4819 struct r5conf *conf = mddev->private; 4820 int handled; 4821 struct blk_plug plug; 4822 4823 pr_debug("+++ raid5d active\n"); 4824 4825 md_check_recovery(mddev); 4826 4827 blk_start_plug(&plug); 4828 handled = 0; 4829 spin_lock_irq(&conf->device_lock); 4830 while (1) { 4831 struct bio *bio; 4832 int batch_size; 4833 4834 if ( 4835 !list_empty(&conf->bitmap_list)) { 4836 /* Now is a good time to flush some bitmap updates */ 4837 conf->seq_flush++; 4838 spin_unlock_irq(&conf->device_lock); 4839 bitmap_unplug(mddev->bitmap); 4840 spin_lock_irq(&conf->device_lock); 4841 conf->seq_write = conf->seq_flush; 4842 activate_bit_delay(conf); 4843 } 4844 raid5_activate_delayed(conf); 4845 4846 while ((bio = remove_bio_from_retry(conf))) { 4847 int ok; 4848 spin_unlock_irq(&conf->device_lock); 4849 ok = retry_aligned_read(conf, bio); 4850 spin_lock_irq(&conf->device_lock); 4851 if (!ok) 4852 break; 4853 handled++; 4854 } 4855 4856 batch_size = handle_active_stripes(conf); 4857 if (!batch_size) 4858 break; 4859 handled += batch_size; 4860 4861 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4862 spin_unlock_irq(&conf->device_lock); 4863 md_check_recovery(mddev); 4864 spin_lock_irq(&conf->device_lock); 4865 } 4866 } 4867 pr_debug("%d stripes handled\n", handled); 4868 4869 spin_unlock_irq(&conf->device_lock); 4870 4871 async_tx_issue_pending_all(); 4872 blk_finish_plug(&plug); 4873 4874 pr_debug("--- raid5d inactive\n"); 4875 } 4876 4877 static ssize_t 4878 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4879 { 4880 struct r5conf *conf = mddev->private; 4881 if (conf) 4882 return sprintf(page, "%d\n", conf->max_nr_stripes); 4883 else 4884 return 0; 4885 } 4886 4887 int 4888 raid5_set_cache_size(struct mddev *mddev, int size) 4889 { 4890 struct r5conf *conf = mddev->private; 4891 int err; 4892 4893 if (size <= 16 || size > 32768) 4894 return -EINVAL; 4895 while (size < conf->max_nr_stripes) { 4896 if (drop_one_stripe(conf)) 4897 conf->max_nr_stripes--; 4898 else 4899 break; 4900 } 4901 err = md_allow_write(mddev); 4902 if (err) 4903 return err; 4904 while (size > conf->max_nr_stripes) { 4905 if (grow_one_stripe(conf)) 4906 conf->max_nr_stripes++; 4907 else break; 4908 } 4909 return 0; 4910 } 4911 EXPORT_SYMBOL(raid5_set_cache_size); 4912 4913 static ssize_t 4914 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4915 { 4916 struct r5conf *conf = mddev->private; 4917 unsigned long new; 4918 int err; 4919 4920 if (len >= PAGE_SIZE) 4921 return -EINVAL; 4922 if (!conf) 4923 return -ENODEV; 4924 4925 if (strict_strtoul(page, 10, &new)) 4926 return -EINVAL; 4927 err = raid5_set_cache_size(mddev, new); 4928 if (err) 4929 return err; 4930 return len; 4931 } 4932 4933 static struct md_sysfs_entry 4934 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4935 raid5_show_stripe_cache_size, 4936 raid5_store_stripe_cache_size); 4937 4938 static ssize_t 4939 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4940 { 4941 struct r5conf *conf = mddev->private; 4942 if (conf) 4943 return sprintf(page, "%d\n", conf->bypass_threshold); 4944 else 4945 return 0; 4946 } 4947 4948 static ssize_t 4949 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4950 { 4951 struct r5conf *conf = mddev->private; 4952 unsigned long new; 4953 if (len >= PAGE_SIZE) 4954 return -EINVAL; 4955 if (!conf) 4956 return -ENODEV; 4957 4958 if (strict_strtoul(page, 10, &new)) 4959 return -EINVAL; 4960 if (new > conf->max_nr_stripes) 4961 return -EINVAL; 4962 conf->bypass_threshold = new; 4963 return len; 4964 } 4965 4966 static struct md_sysfs_entry 4967 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4968 S_IRUGO | S_IWUSR, 4969 raid5_show_preread_threshold, 4970 raid5_store_preread_threshold); 4971 4972 static ssize_t 4973 stripe_cache_active_show(struct mddev *mddev, char *page) 4974 { 4975 struct r5conf *conf = mddev->private; 4976 if (conf) 4977 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4978 else 4979 return 0; 4980 } 4981 4982 static struct md_sysfs_entry 4983 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4984 4985 static struct attribute *raid5_attrs[] = { 4986 &raid5_stripecache_size.attr, 4987 &raid5_stripecache_active.attr, 4988 &raid5_preread_bypass_threshold.attr, 4989 NULL, 4990 }; 4991 static struct attribute_group raid5_attrs_group = { 4992 .name = NULL, 4993 .attrs = raid5_attrs, 4994 }; 4995 4996 static sector_t 4997 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4998 { 4999 struct r5conf *conf = mddev->private; 5000 5001 if (!sectors) 5002 sectors = mddev->dev_sectors; 5003 if (!raid_disks) 5004 /* size is defined by the smallest of previous and new size */ 5005 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 5006 5007 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5008 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 5009 return sectors * (raid_disks - conf->max_degraded); 5010 } 5011 5012 static void raid5_free_percpu(struct r5conf *conf) 5013 { 5014 struct raid5_percpu *percpu; 5015 unsigned long cpu; 5016 5017 if (!conf->percpu) 5018 return; 5019 5020 get_online_cpus(); 5021 for_each_possible_cpu(cpu) { 5022 percpu = per_cpu_ptr(conf->percpu, cpu); 5023 safe_put_page(percpu->spare_page); 5024 kfree(percpu->scribble); 5025 } 5026 #ifdef CONFIG_HOTPLUG_CPU 5027 unregister_cpu_notifier(&conf->cpu_notify); 5028 #endif 5029 put_online_cpus(); 5030 5031 free_percpu(conf->percpu); 5032 } 5033 5034 static void free_conf(struct r5conf *conf) 5035 { 5036 shrink_stripes(conf); 5037 raid5_free_percpu(conf); 5038 kfree(conf->disks); 5039 kfree(conf->stripe_hashtbl); 5040 kfree(conf); 5041 } 5042 5043 #ifdef CONFIG_HOTPLUG_CPU 5044 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5045 void *hcpu) 5046 { 5047 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5048 long cpu = (long)hcpu; 5049 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5050 5051 switch (action) { 5052 case CPU_UP_PREPARE: 5053 case CPU_UP_PREPARE_FROZEN: 5054 if (conf->level == 6 && !percpu->spare_page) 5055 percpu->spare_page = alloc_page(GFP_KERNEL); 5056 if (!percpu->scribble) 5057 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5058 5059 if (!percpu->scribble || 5060 (conf->level == 6 && !percpu->spare_page)) { 5061 safe_put_page(percpu->spare_page); 5062 kfree(percpu->scribble); 5063 pr_err("%s: failed memory allocation for cpu%ld\n", 5064 __func__, cpu); 5065 return notifier_from_errno(-ENOMEM); 5066 } 5067 break; 5068 case CPU_DEAD: 5069 case CPU_DEAD_FROZEN: 5070 safe_put_page(percpu->spare_page); 5071 kfree(percpu->scribble); 5072 percpu->spare_page = NULL; 5073 percpu->scribble = NULL; 5074 break; 5075 default: 5076 break; 5077 } 5078 return NOTIFY_OK; 5079 } 5080 #endif 5081 5082 static int raid5_alloc_percpu(struct r5conf *conf) 5083 { 5084 unsigned long cpu; 5085 struct page *spare_page; 5086 struct raid5_percpu __percpu *allcpus; 5087 void *scribble; 5088 int err; 5089 5090 allcpus = alloc_percpu(struct raid5_percpu); 5091 if (!allcpus) 5092 return -ENOMEM; 5093 conf->percpu = allcpus; 5094 5095 get_online_cpus(); 5096 err = 0; 5097 for_each_present_cpu(cpu) { 5098 if (conf->level == 6) { 5099 spare_page = alloc_page(GFP_KERNEL); 5100 if (!spare_page) { 5101 err = -ENOMEM; 5102 break; 5103 } 5104 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 5105 } 5106 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5107 if (!scribble) { 5108 err = -ENOMEM; 5109 break; 5110 } 5111 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 5112 } 5113 #ifdef CONFIG_HOTPLUG_CPU 5114 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5115 conf->cpu_notify.priority = 0; 5116 if (err == 0) 5117 err = register_cpu_notifier(&conf->cpu_notify); 5118 #endif 5119 put_online_cpus(); 5120 5121 return err; 5122 } 5123 5124 static struct r5conf *setup_conf(struct mddev *mddev) 5125 { 5126 struct r5conf *conf; 5127 int raid_disk, memory, max_disks; 5128 struct md_rdev *rdev; 5129 struct disk_info *disk; 5130 char pers_name[6]; 5131 5132 if (mddev->new_level != 5 5133 && mddev->new_level != 4 5134 && mddev->new_level != 6) { 5135 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5136 mdname(mddev), mddev->new_level); 5137 return ERR_PTR(-EIO); 5138 } 5139 if ((mddev->new_level == 5 5140 && !algorithm_valid_raid5(mddev->new_layout)) || 5141 (mddev->new_level == 6 5142 && !algorithm_valid_raid6(mddev->new_layout))) { 5143 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5144 mdname(mddev), mddev->new_layout); 5145 return ERR_PTR(-EIO); 5146 } 5147 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5148 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5149 mdname(mddev), mddev->raid_disks); 5150 return ERR_PTR(-EINVAL); 5151 } 5152 5153 if (!mddev->new_chunk_sectors || 5154 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5155 !is_power_of_2(mddev->new_chunk_sectors)) { 5156 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5157 mdname(mddev), mddev->new_chunk_sectors << 9); 5158 return ERR_PTR(-EINVAL); 5159 } 5160 5161 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5162 if (conf == NULL) 5163 goto abort; 5164 spin_lock_init(&conf->device_lock); 5165 init_waitqueue_head(&conf->wait_for_stripe); 5166 init_waitqueue_head(&conf->wait_for_overlap); 5167 INIT_LIST_HEAD(&conf->handle_list); 5168 INIT_LIST_HEAD(&conf->hold_list); 5169 INIT_LIST_HEAD(&conf->delayed_list); 5170 INIT_LIST_HEAD(&conf->bitmap_list); 5171 INIT_LIST_HEAD(&conf->inactive_list); 5172 atomic_set(&conf->active_stripes, 0); 5173 atomic_set(&conf->preread_active_stripes, 0); 5174 atomic_set(&conf->active_aligned_reads, 0); 5175 conf->bypass_threshold = BYPASS_THRESHOLD; 5176 conf->recovery_disabled = mddev->recovery_disabled - 1; 5177 5178 conf->raid_disks = mddev->raid_disks; 5179 if (mddev->reshape_position == MaxSector) 5180 conf->previous_raid_disks = mddev->raid_disks; 5181 else 5182 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5183 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5184 conf->scribble_len = scribble_len(max_disks); 5185 5186 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5187 GFP_KERNEL); 5188 if (!conf->disks) 5189 goto abort; 5190 5191 conf->mddev = mddev; 5192 5193 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5194 goto abort; 5195 5196 conf->level = mddev->new_level; 5197 if (raid5_alloc_percpu(conf) != 0) 5198 goto abort; 5199 5200 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5201 5202 rdev_for_each(rdev, mddev) { 5203 raid_disk = rdev->raid_disk; 5204 if (raid_disk >= max_disks 5205 || raid_disk < 0) 5206 continue; 5207 disk = conf->disks + raid_disk; 5208 5209 if (test_bit(Replacement, &rdev->flags)) { 5210 if (disk->replacement) 5211 goto abort; 5212 disk->replacement = rdev; 5213 } else { 5214 if (disk->rdev) 5215 goto abort; 5216 disk->rdev = rdev; 5217 } 5218 5219 if (test_bit(In_sync, &rdev->flags)) { 5220 char b[BDEVNAME_SIZE]; 5221 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5222 " disk %d\n", 5223 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5224 } else if (rdev->saved_raid_disk != raid_disk) 5225 /* Cannot rely on bitmap to complete recovery */ 5226 conf->fullsync = 1; 5227 } 5228 5229 conf->chunk_sectors = mddev->new_chunk_sectors; 5230 conf->level = mddev->new_level; 5231 if (conf->level == 6) 5232 conf->max_degraded = 2; 5233 else 5234 conf->max_degraded = 1; 5235 conf->algorithm = mddev->new_layout; 5236 conf->max_nr_stripes = NR_STRIPES; 5237 conf->reshape_progress = mddev->reshape_position; 5238 if (conf->reshape_progress != MaxSector) { 5239 conf->prev_chunk_sectors = mddev->chunk_sectors; 5240 conf->prev_algo = mddev->layout; 5241 } 5242 5243 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5244 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5245 if (grow_stripes(conf, conf->max_nr_stripes)) { 5246 printk(KERN_ERR 5247 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5248 mdname(mddev), memory); 5249 goto abort; 5250 } else 5251 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5252 mdname(mddev), memory); 5253 5254 sprintf(pers_name, "raid%d", mddev->new_level); 5255 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5256 if (!conf->thread) { 5257 printk(KERN_ERR 5258 "md/raid:%s: couldn't allocate thread.\n", 5259 mdname(mddev)); 5260 goto abort; 5261 } 5262 5263 return conf; 5264 5265 abort: 5266 if (conf) { 5267 free_conf(conf); 5268 return ERR_PTR(-EIO); 5269 } else 5270 return ERR_PTR(-ENOMEM); 5271 } 5272 5273 5274 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5275 { 5276 switch (algo) { 5277 case ALGORITHM_PARITY_0: 5278 if (raid_disk < max_degraded) 5279 return 1; 5280 break; 5281 case ALGORITHM_PARITY_N: 5282 if (raid_disk >= raid_disks - max_degraded) 5283 return 1; 5284 break; 5285 case ALGORITHM_PARITY_0_6: 5286 if (raid_disk == 0 || 5287 raid_disk == raid_disks - 1) 5288 return 1; 5289 break; 5290 case ALGORITHM_LEFT_ASYMMETRIC_6: 5291 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5292 case ALGORITHM_LEFT_SYMMETRIC_6: 5293 case ALGORITHM_RIGHT_SYMMETRIC_6: 5294 if (raid_disk == raid_disks - 1) 5295 return 1; 5296 } 5297 return 0; 5298 } 5299 5300 static int run(struct mddev *mddev) 5301 { 5302 struct r5conf *conf; 5303 int working_disks = 0; 5304 int dirty_parity_disks = 0; 5305 struct md_rdev *rdev; 5306 sector_t reshape_offset = 0; 5307 int i; 5308 long long min_offset_diff = 0; 5309 int first = 1; 5310 5311 if (mddev->recovery_cp != MaxSector) 5312 printk(KERN_NOTICE "md/raid:%s: not clean" 5313 " -- starting background reconstruction\n", 5314 mdname(mddev)); 5315 5316 rdev_for_each(rdev, mddev) { 5317 long long diff; 5318 if (rdev->raid_disk < 0) 5319 continue; 5320 diff = (rdev->new_data_offset - rdev->data_offset); 5321 if (first) { 5322 min_offset_diff = diff; 5323 first = 0; 5324 } else if (mddev->reshape_backwards && 5325 diff < min_offset_diff) 5326 min_offset_diff = diff; 5327 else if (!mddev->reshape_backwards && 5328 diff > min_offset_diff) 5329 min_offset_diff = diff; 5330 } 5331 5332 if (mddev->reshape_position != MaxSector) { 5333 /* Check that we can continue the reshape. 5334 * Difficulties arise if the stripe we would write to 5335 * next is at or after the stripe we would read from next. 5336 * For a reshape that changes the number of devices, this 5337 * is only possible for a very short time, and mdadm makes 5338 * sure that time appears to have past before assembling 5339 * the array. So we fail if that time hasn't passed. 5340 * For a reshape that keeps the number of devices the same 5341 * mdadm must be monitoring the reshape can keeping the 5342 * critical areas read-only and backed up. It will start 5343 * the array in read-only mode, so we check for that. 5344 */ 5345 sector_t here_new, here_old; 5346 int old_disks; 5347 int max_degraded = (mddev->level == 6 ? 2 : 1); 5348 5349 if (mddev->new_level != mddev->level) { 5350 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5351 "required - aborting.\n", 5352 mdname(mddev)); 5353 return -EINVAL; 5354 } 5355 old_disks = mddev->raid_disks - mddev->delta_disks; 5356 /* reshape_position must be on a new-stripe boundary, and one 5357 * further up in new geometry must map after here in old 5358 * geometry. 5359 */ 5360 here_new = mddev->reshape_position; 5361 if (sector_div(here_new, mddev->new_chunk_sectors * 5362 (mddev->raid_disks - max_degraded))) { 5363 printk(KERN_ERR "md/raid:%s: reshape_position not " 5364 "on a stripe boundary\n", mdname(mddev)); 5365 return -EINVAL; 5366 } 5367 reshape_offset = here_new * mddev->new_chunk_sectors; 5368 /* here_new is the stripe we will write to */ 5369 here_old = mddev->reshape_position; 5370 sector_div(here_old, mddev->chunk_sectors * 5371 (old_disks-max_degraded)); 5372 /* here_old is the first stripe that we might need to read 5373 * from */ 5374 if (mddev->delta_disks == 0) { 5375 if ((here_new * mddev->new_chunk_sectors != 5376 here_old * mddev->chunk_sectors)) { 5377 printk(KERN_ERR "md/raid:%s: reshape position is" 5378 " confused - aborting\n", mdname(mddev)); 5379 return -EINVAL; 5380 } 5381 /* We cannot be sure it is safe to start an in-place 5382 * reshape. It is only safe if user-space is monitoring 5383 * and taking constant backups. 5384 * mdadm always starts a situation like this in 5385 * readonly mode so it can take control before 5386 * allowing any writes. So just check for that. 5387 */ 5388 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5389 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5390 /* not really in-place - so OK */; 5391 else if (mddev->ro == 0) { 5392 printk(KERN_ERR "md/raid:%s: in-place reshape " 5393 "must be started in read-only mode " 5394 "- aborting\n", 5395 mdname(mddev)); 5396 return -EINVAL; 5397 } 5398 } else if (mddev->reshape_backwards 5399 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 5400 here_old * mddev->chunk_sectors) 5401 : (here_new * mddev->new_chunk_sectors >= 5402 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5403 /* Reading from the same stripe as writing to - bad */ 5404 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5405 "auto-recovery - aborting.\n", 5406 mdname(mddev)); 5407 return -EINVAL; 5408 } 5409 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5410 mdname(mddev)); 5411 /* OK, we should be able to continue; */ 5412 } else { 5413 BUG_ON(mddev->level != mddev->new_level); 5414 BUG_ON(mddev->layout != mddev->new_layout); 5415 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5416 BUG_ON(mddev->delta_disks != 0); 5417 } 5418 5419 if (mddev->private == NULL) 5420 conf = setup_conf(mddev); 5421 else 5422 conf = mddev->private; 5423 5424 if (IS_ERR(conf)) 5425 return PTR_ERR(conf); 5426 5427 conf->min_offset_diff = min_offset_diff; 5428 mddev->thread = conf->thread; 5429 conf->thread = NULL; 5430 mddev->private = conf; 5431 5432 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5433 i++) { 5434 rdev = conf->disks[i].rdev; 5435 if (!rdev && conf->disks[i].replacement) { 5436 /* The replacement is all we have yet */ 5437 rdev = conf->disks[i].replacement; 5438 conf->disks[i].replacement = NULL; 5439 clear_bit(Replacement, &rdev->flags); 5440 conf->disks[i].rdev = rdev; 5441 } 5442 if (!rdev) 5443 continue; 5444 if (conf->disks[i].replacement && 5445 conf->reshape_progress != MaxSector) { 5446 /* replacements and reshape simply do not mix. */ 5447 printk(KERN_ERR "md: cannot handle concurrent " 5448 "replacement and reshape.\n"); 5449 goto abort; 5450 } 5451 if (test_bit(In_sync, &rdev->flags)) { 5452 working_disks++; 5453 continue; 5454 } 5455 /* This disc is not fully in-sync. However if it 5456 * just stored parity (beyond the recovery_offset), 5457 * when we don't need to be concerned about the 5458 * array being dirty. 5459 * When reshape goes 'backwards', we never have 5460 * partially completed devices, so we only need 5461 * to worry about reshape going forwards. 5462 */ 5463 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5464 if (mddev->major_version == 0 && 5465 mddev->minor_version > 90) 5466 rdev->recovery_offset = reshape_offset; 5467 5468 if (rdev->recovery_offset < reshape_offset) { 5469 /* We need to check old and new layout */ 5470 if (!only_parity(rdev->raid_disk, 5471 conf->algorithm, 5472 conf->raid_disks, 5473 conf->max_degraded)) 5474 continue; 5475 } 5476 if (!only_parity(rdev->raid_disk, 5477 conf->prev_algo, 5478 conf->previous_raid_disks, 5479 conf->max_degraded)) 5480 continue; 5481 dirty_parity_disks++; 5482 } 5483 5484 /* 5485 * 0 for a fully functional array, 1 or 2 for a degraded array. 5486 */ 5487 mddev->degraded = calc_degraded(conf); 5488 5489 if (has_failed(conf)) { 5490 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5491 " (%d/%d failed)\n", 5492 mdname(mddev), mddev->degraded, conf->raid_disks); 5493 goto abort; 5494 } 5495 5496 /* device size must be a multiple of chunk size */ 5497 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5498 mddev->resync_max_sectors = mddev->dev_sectors; 5499 5500 if (mddev->degraded > dirty_parity_disks && 5501 mddev->recovery_cp != MaxSector) { 5502 if (mddev->ok_start_degraded) 5503 printk(KERN_WARNING 5504 "md/raid:%s: starting dirty degraded array" 5505 " - data corruption possible.\n", 5506 mdname(mddev)); 5507 else { 5508 printk(KERN_ERR 5509 "md/raid:%s: cannot start dirty degraded array.\n", 5510 mdname(mddev)); 5511 goto abort; 5512 } 5513 } 5514 5515 if (mddev->degraded == 0) 5516 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5517 " devices, algorithm %d\n", mdname(mddev), conf->level, 5518 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5519 mddev->new_layout); 5520 else 5521 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5522 " out of %d devices, algorithm %d\n", 5523 mdname(mddev), conf->level, 5524 mddev->raid_disks - mddev->degraded, 5525 mddev->raid_disks, mddev->new_layout); 5526 5527 print_raid5_conf(conf); 5528 5529 if (conf->reshape_progress != MaxSector) { 5530 conf->reshape_safe = conf->reshape_progress; 5531 atomic_set(&conf->reshape_stripes, 0); 5532 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5533 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5534 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5535 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5536 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5537 "reshape"); 5538 } 5539 5540 5541 /* Ok, everything is just fine now */ 5542 if (mddev->to_remove == &raid5_attrs_group) 5543 mddev->to_remove = NULL; 5544 else if (mddev->kobj.sd && 5545 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5546 printk(KERN_WARNING 5547 "raid5: failed to create sysfs attributes for %s\n", 5548 mdname(mddev)); 5549 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5550 5551 if (mddev->queue) { 5552 int chunk_size; 5553 bool discard_supported = true; 5554 /* read-ahead size must cover two whole stripes, which 5555 * is 2 * (datadisks) * chunksize where 'n' is the 5556 * number of raid devices 5557 */ 5558 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5559 int stripe = data_disks * 5560 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5561 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5562 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5563 5564 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5565 5566 mddev->queue->backing_dev_info.congested_data = mddev; 5567 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5568 5569 chunk_size = mddev->chunk_sectors << 9; 5570 blk_queue_io_min(mddev->queue, chunk_size); 5571 blk_queue_io_opt(mddev->queue, chunk_size * 5572 (conf->raid_disks - conf->max_degraded)); 5573 /* 5574 * We can only discard a whole stripe. It doesn't make sense to 5575 * discard data disk but write parity disk 5576 */ 5577 stripe = stripe * PAGE_SIZE; 5578 /* Round up to power of 2, as discard handling 5579 * currently assumes that */ 5580 while ((stripe-1) & stripe) 5581 stripe = (stripe | (stripe-1)) + 1; 5582 mddev->queue->limits.discard_alignment = stripe; 5583 mddev->queue->limits.discard_granularity = stripe; 5584 /* 5585 * unaligned part of discard request will be ignored, so can't 5586 * guarantee discard_zerors_data 5587 */ 5588 mddev->queue->limits.discard_zeroes_data = 0; 5589 5590 rdev_for_each(rdev, mddev) { 5591 disk_stack_limits(mddev->gendisk, rdev->bdev, 5592 rdev->data_offset << 9); 5593 disk_stack_limits(mddev->gendisk, rdev->bdev, 5594 rdev->new_data_offset << 9); 5595 /* 5596 * discard_zeroes_data is required, otherwise data 5597 * could be lost. Consider a scenario: discard a stripe 5598 * (the stripe could be inconsistent if 5599 * discard_zeroes_data is 0); write one disk of the 5600 * stripe (the stripe could be inconsistent again 5601 * depending on which disks are used to calculate 5602 * parity); the disk is broken; The stripe data of this 5603 * disk is lost. 5604 */ 5605 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5606 !bdev_get_queue(rdev->bdev)-> 5607 limits.discard_zeroes_data) 5608 discard_supported = false; 5609 } 5610 5611 if (discard_supported && 5612 mddev->queue->limits.max_discard_sectors >= stripe && 5613 mddev->queue->limits.discard_granularity >= stripe) 5614 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5615 mddev->queue); 5616 else 5617 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5618 mddev->queue); 5619 } 5620 5621 return 0; 5622 abort: 5623 md_unregister_thread(&mddev->thread); 5624 print_raid5_conf(conf); 5625 free_conf(conf); 5626 mddev->private = NULL; 5627 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5628 return -EIO; 5629 } 5630 5631 static int stop(struct mddev *mddev) 5632 { 5633 struct r5conf *conf = mddev->private; 5634 5635 md_unregister_thread(&mddev->thread); 5636 if (mddev->queue) 5637 mddev->queue->backing_dev_info.congested_fn = NULL; 5638 free_conf(conf); 5639 mddev->private = NULL; 5640 mddev->to_remove = &raid5_attrs_group; 5641 return 0; 5642 } 5643 5644 static void status(struct seq_file *seq, struct mddev *mddev) 5645 { 5646 struct r5conf *conf = mddev->private; 5647 int i; 5648 5649 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5650 mddev->chunk_sectors / 2, mddev->layout); 5651 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5652 for (i = 0; i < conf->raid_disks; i++) 5653 seq_printf (seq, "%s", 5654 conf->disks[i].rdev && 5655 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5656 seq_printf (seq, "]"); 5657 } 5658 5659 static void print_raid5_conf (struct r5conf *conf) 5660 { 5661 int i; 5662 struct disk_info *tmp; 5663 5664 printk(KERN_DEBUG "RAID conf printout:\n"); 5665 if (!conf) { 5666 printk("(conf==NULL)\n"); 5667 return; 5668 } 5669 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5670 conf->raid_disks, 5671 conf->raid_disks - conf->mddev->degraded); 5672 5673 for (i = 0; i < conf->raid_disks; i++) { 5674 char b[BDEVNAME_SIZE]; 5675 tmp = conf->disks + i; 5676 if (tmp->rdev) 5677 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5678 i, !test_bit(Faulty, &tmp->rdev->flags), 5679 bdevname(tmp->rdev->bdev, b)); 5680 } 5681 } 5682 5683 static int raid5_spare_active(struct mddev *mddev) 5684 { 5685 int i; 5686 struct r5conf *conf = mddev->private; 5687 struct disk_info *tmp; 5688 int count = 0; 5689 unsigned long flags; 5690 5691 for (i = 0; i < conf->raid_disks; i++) { 5692 tmp = conf->disks + i; 5693 if (tmp->replacement 5694 && tmp->replacement->recovery_offset == MaxSector 5695 && !test_bit(Faulty, &tmp->replacement->flags) 5696 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5697 /* Replacement has just become active. */ 5698 if (!tmp->rdev 5699 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5700 count++; 5701 if (tmp->rdev) { 5702 /* Replaced device not technically faulty, 5703 * but we need to be sure it gets removed 5704 * and never re-added. 5705 */ 5706 set_bit(Faulty, &tmp->rdev->flags); 5707 sysfs_notify_dirent_safe( 5708 tmp->rdev->sysfs_state); 5709 } 5710 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5711 } else if (tmp->rdev 5712 && tmp->rdev->recovery_offset == MaxSector 5713 && !test_bit(Faulty, &tmp->rdev->flags) 5714 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5715 count++; 5716 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5717 } 5718 } 5719 spin_lock_irqsave(&conf->device_lock, flags); 5720 mddev->degraded = calc_degraded(conf); 5721 spin_unlock_irqrestore(&conf->device_lock, flags); 5722 print_raid5_conf(conf); 5723 return count; 5724 } 5725 5726 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5727 { 5728 struct r5conf *conf = mddev->private; 5729 int err = 0; 5730 int number = rdev->raid_disk; 5731 struct md_rdev **rdevp; 5732 struct disk_info *p = conf->disks + number; 5733 5734 print_raid5_conf(conf); 5735 if (rdev == p->rdev) 5736 rdevp = &p->rdev; 5737 else if (rdev == p->replacement) 5738 rdevp = &p->replacement; 5739 else 5740 return 0; 5741 5742 if (number >= conf->raid_disks && 5743 conf->reshape_progress == MaxSector) 5744 clear_bit(In_sync, &rdev->flags); 5745 5746 if (test_bit(In_sync, &rdev->flags) || 5747 atomic_read(&rdev->nr_pending)) { 5748 err = -EBUSY; 5749 goto abort; 5750 } 5751 /* Only remove non-faulty devices if recovery 5752 * isn't possible. 5753 */ 5754 if (!test_bit(Faulty, &rdev->flags) && 5755 mddev->recovery_disabled != conf->recovery_disabled && 5756 !has_failed(conf) && 5757 (!p->replacement || p->replacement == rdev) && 5758 number < conf->raid_disks) { 5759 err = -EBUSY; 5760 goto abort; 5761 } 5762 *rdevp = NULL; 5763 synchronize_rcu(); 5764 if (atomic_read(&rdev->nr_pending)) { 5765 /* lost the race, try later */ 5766 err = -EBUSY; 5767 *rdevp = rdev; 5768 } else if (p->replacement) { 5769 /* We must have just cleared 'rdev' */ 5770 p->rdev = p->replacement; 5771 clear_bit(Replacement, &p->replacement->flags); 5772 smp_mb(); /* Make sure other CPUs may see both as identical 5773 * but will never see neither - if they are careful 5774 */ 5775 p->replacement = NULL; 5776 clear_bit(WantReplacement, &rdev->flags); 5777 } else 5778 /* We might have just removed the Replacement as faulty- 5779 * clear the bit just in case 5780 */ 5781 clear_bit(WantReplacement, &rdev->flags); 5782 abort: 5783 5784 print_raid5_conf(conf); 5785 return err; 5786 } 5787 5788 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5789 { 5790 struct r5conf *conf = mddev->private; 5791 int err = -EEXIST; 5792 int disk; 5793 struct disk_info *p; 5794 int first = 0; 5795 int last = conf->raid_disks - 1; 5796 5797 if (mddev->recovery_disabled == conf->recovery_disabled) 5798 return -EBUSY; 5799 5800 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 5801 /* no point adding a device */ 5802 return -EINVAL; 5803 5804 if (rdev->raid_disk >= 0) 5805 first = last = rdev->raid_disk; 5806 5807 /* 5808 * find the disk ... but prefer rdev->saved_raid_disk 5809 * if possible. 5810 */ 5811 if (rdev->saved_raid_disk >= 0 && 5812 rdev->saved_raid_disk >= first && 5813 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5814 first = rdev->saved_raid_disk; 5815 5816 for (disk = first; disk <= last; disk++) { 5817 p = conf->disks + disk; 5818 if (p->rdev == NULL) { 5819 clear_bit(In_sync, &rdev->flags); 5820 rdev->raid_disk = disk; 5821 err = 0; 5822 if (rdev->saved_raid_disk != disk) 5823 conf->fullsync = 1; 5824 rcu_assign_pointer(p->rdev, rdev); 5825 goto out; 5826 } 5827 } 5828 for (disk = first; disk <= last; disk++) { 5829 p = conf->disks + disk; 5830 if (test_bit(WantReplacement, &p->rdev->flags) && 5831 p->replacement == NULL) { 5832 clear_bit(In_sync, &rdev->flags); 5833 set_bit(Replacement, &rdev->flags); 5834 rdev->raid_disk = disk; 5835 err = 0; 5836 conf->fullsync = 1; 5837 rcu_assign_pointer(p->replacement, rdev); 5838 break; 5839 } 5840 } 5841 out: 5842 print_raid5_conf(conf); 5843 return err; 5844 } 5845 5846 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5847 { 5848 /* no resync is happening, and there is enough space 5849 * on all devices, so we can resize. 5850 * We need to make sure resync covers any new space. 5851 * If the array is shrinking we should possibly wait until 5852 * any io in the removed space completes, but it hardly seems 5853 * worth it. 5854 */ 5855 sector_t newsize; 5856 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5857 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5858 if (mddev->external_size && 5859 mddev->array_sectors > newsize) 5860 return -EINVAL; 5861 if (mddev->bitmap) { 5862 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5863 if (ret) 5864 return ret; 5865 } 5866 md_set_array_sectors(mddev, newsize); 5867 set_capacity(mddev->gendisk, mddev->array_sectors); 5868 revalidate_disk(mddev->gendisk); 5869 if (sectors > mddev->dev_sectors && 5870 mddev->recovery_cp > mddev->dev_sectors) { 5871 mddev->recovery_cp = mddev->dev_sectors; 5872 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5873 } 5874 mddev->dev_sectors = sectors; 5875 mddev->resync_max_sectors = sectors; 5876 return 0; 5877 } 5878 5879 static int check_stripe_cache(struct mddev *mddev) 5880 { 5881 /* Can only proceed if there are plenty of stripe_heads. 5882 * We need a minimum of one full stripe,, and for sensible progress 5883 * it is best to have about 4 times that. 5884 * If we require 4 times, then the default 256 4K stripe_heads will 5885 * allow for chunk sizes up to 256K, which is probably OK. 5886 * If the chunk size is greater, user-space should request more 5887 * stripe_heads first. 5888 */ 5889 struct r5conf *conf = mddev->private; 5890 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5891 > conf->max_nr_stripes || 5892 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5893 > conf->max_nr_stripes) { 5894 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5895 mdname(mddev), 5896 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5897 / STRIPE_SIZE)*4); 5898 return 0; 5899 } 5900 return 1; 5901 } 5902 5903 static int check_reshape(struct mddev *mddev) 5904 { 5905 struct r5conf *conf = mddev->private; 5906 5907 if (mddev->delta_disks == 0 && 5908 mddev->new_layout == mddev->layout && 5909 mddev->new_chunk_sectors == mddev->chunk_sectors) 5910 return 0; /* nothing to do */ 5911 if (has_failed(conf)) 5912 return -EINVAL; 5913 if (mddev->delta_disks < 0) { 5914 /* We might be able to shrink, but the devices must 5915 * be made bigger first. 5916 * For raid6, 4 is the minimum size. 5917 * Otherwise 2 is the minimum 5918 */ 5919 int min = 2; 5920 if (mddev->level == 6) 5921 min = 4; 5922 if (mddev->raid_disks + mddev->delta_disks < min) 5923 return -EINVAL; 5924 } 5925 5926 if (!check_stripe_cache(mddev)) 5927 return -ENOSPC; 5928 5929 return resize_stripes(conf, (conf->previous_raid_disks 5930 + mddev->delta_disks)); 5931 } 5932 5933 static int raid5_start_reshape(struct mddev *mddev) 5934 { 5935 struct r5conf *conf = mddev->private; 5936 struct md_rdev *rdev; 5937 int spares = 0; 5938 unsigned long flags; 5939 5940 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5941 return -EBUSY; 5942 5943 if (!check_stripe_cache(mddev)) 5944 return -ENOSPC; 5945 5946 if (has_failed(conf)) 5947 return -EINVAL; 5948 5949 rdev_for_each(rdev, mddev) { 5950 if (!test_bit(In_sync, &rdev->flags) 5951 && !test_bit(Faulty, &rdev->flags)) 5952 spares++; 5953 } 5954 5955 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5956 /* Not enough devices even to make a degraded array 5957 * of that size 5958 */ 5959 return -EINVAL; 5960 5961 /* Refuse to reduce size of the array. Any reductions in 5962 * array size must be through explicit setting of array_size 5963 * attribute. 5964 */ 5965 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5966 < mddev->array_sectors) { 5967 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5968 "before number of disks\n", mdname(mddev)); 5969 return -EINVAL; 5970 } 5971 5972 atomic_set(&conf->reshape_stripes, 0); 5973 spin_lock_irq(&conf->device_lock); 5974 conf->previous_raid_disks = conf->raid_disks; 5975 conf->raid_disks += mddev->delta_disks; 5976 conf->prev_chunk_sectors = conf->chunk_sectors; 5977 conf->chunk_sectors = mddev->new_chunk_sectors; 5978 conf->prev_algo = conf->algorithm; 5979 conf->algorithm = mddev->new_layout; 5980 conf->generation++; 5981 /* Code that selects data_offset needs to see the generation update 5982 * if reshape_progress has been set - so a memory barrier needed. 5983 */ 5984 smp_mb(); 5985 if (mddev->reshape_backwards) 5986 conf->reshape_progress = raid5_size(mddev, 0, 0); 5987 else 5988 conf->reshape_progress = 0; 5989 conf->reshape_safe = conf->reshape_progress; 5990 spin_unlock_irq(&conf->device_lock); 5991 5992 /* Add some new drives, as many as will fit. 5993 * We know there are enough to make the newly sized array work. 5994 * Don't add devices if we are reducing the number of 5995 * devices in the array. This is because it is not possible 5996 * to correctly record the "partially reconstructed" state of 5997 * such devices during the reshape and confusion could result. 5998 */ 5999 if (mddev->delta_disks >= 0) { 6000 rdev_for_each(rdev, mddev) 6001 if (rdev->raid_disk < 0 && 6002 !test_bit(Faulty, &rdev->flags)) { 6003 if (raid5_add_disk(mddev, rdev) == 0) { 6004 if (rdev->raid_disk 6005 >= conf->previous_raid_disks) 6006 set_bit(In_sync, &rdev->flags); 6007 else 6008 rdev->recovery_offset = 0; 6009 6010 if (sysfs_link_rdev(mddev, rdev)) 6011 /* Failure here is OK */; 6012 } 6013 } else if (rdev->raid_disk >= conf->previous_raid_disks 6014 && !test_bit(Faulty, &rdev->flags)) { 6015 /* This is a spare that was manually added */ 6016 set_bit(In_sync, &rdev->flags); 6017 } 6018 6019 /* When a reshape changes the number of devices, 6020 * ->degraded is measured against the larger of the 6021 * pre and post number of devices. 6022 */ 6023 spin_lock_irqsave(&conf->device_lock, flags); 6024 mddev->degraded = calc_degraded(conf); 6025 spin_unlock_irqrestore(&conf->device_lock, flags); 6026 } 6027 mddev->raid_disks = conf->raid_disks; 6028 mddev->reshape_position = conf->reshape_progress; 6029 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6030 6031 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6032 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6033 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6034 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6035 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6036 "reshape"); 6037 if (!mddev->sync_thread) { 6038 mddev->recovery = 0; 6039 spin_lock_irq(&conf->device_lock); 6040 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6041 rdev_for_each(rdev, mddev) 6042 rdev->new_data_offset = rdev->data_offset; 6043 smp_wmb(); 6044 conf->reshape_progress = MaxSector; 6045 mddev->reshape_position = MaxSector; 6046 spin_unlock_irq(&conf->device_lock); 6047 return -EAGAIN; 6048 } 6049 conf->reshape_checkpoint = jiffies; 6050 md_wakeup_thread(mddev->sync_thread); 6051 md_new_event(mddev); 6052 return 0; 6053 } 6054 6055 /* This is called from the reshape thread and should make any 6056 * changes needed in 'conf' 6057 */ 6058 static void end_reshape(struct r5conf *conf) 6059 { 6060 6061 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 6062 struct md_rdev *rdev; 6063 6064 spin_lock_irq(&conf->device_lock); 6065 conf->previous_raid_disks = conf->raid_disks; 6066 rdev_for_each(rdev, conf->mddev) 6067 rdev->data_offset = rdev->new_data_offset; 6068 smp_wmb(); 6069 conf->reshape_progress = MaxSector; 6070 spin_unlock_irq(&conf->device_lock); 6071 wake_up(&conf->wait_for_overlap); 6072 6073 /* read-ahead size must cover two whole stripes, which is 6074 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 6075 */ 6076 if (conf->mddev->queue) { 6077 int data_disks = conf->raid_disks - conf->max_degraded; 6078 int stripe = data_disks * ((conf->chunk_sectors << 9) 6079 / PAGE_SIZE); 6080 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6081 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6082 } 6083 } 6084 } 6085 6086 /* This is called from the raid5d thread with mddev_lock held. 6087 * It makes config changes to the device. 6088 */ 6089 static void raid5_finish_reshape(struct mddev *mddev) 6090 { 6091 struct r5conf *conf = mddev->private; 6092 6093 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6094 6095 if (mddev->delta_disks > 0) { 6096 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6097 set_capacity(mddev->gendisk, mddev->array_sectors); 6098 revalidate_disk(mddev->gendisk); 6099 } else { 6100 int d; 6101 spin_lock_irq(&conf->device_lock); 6102 mddev->degraded = calc_degraded(conf); 6103 spin_unlock_irq(&conf->device_lock); 6104 for (d = conf->raid_disks ; 6105 d < conf->raid_disks - mddev->delta_disks; 6106 d++) { 6107 struct md_rdev *rdev = conf->disks[d].rdev; 6108 if (rdev) 6109 clear_bit(In_sync, &rdev->flags); 6110 rdev = conf->disks[d].replacement; 6111 if (rdev) 6112 clear_bit(In_sync, &rdev->flags); 6113 } 6114 } 6115 mddev->layout = conf->algorithm; 6116 mddev->chunk_sectors = conf->chunk_sectors; 6117 mddev->reshape_position = MaxSector; 6118 mddev->delta_disks = 0; 6119 mddev->reshape_backwards = 0; 6120 } 6121 } 6122 6123 static void raid5_quiesce(struct mddev *mddev, int state) 6124 { 6125 struct r5conf *conf = mddev->private; 6126 6127 switch(state) { 6128 case 2: /* resume for a suspend */ 6129 wake_up(&conf->wait_for_overlap); 6130 break; 6131 6132 case 1: /* stop all writes */ 6133 spin_lock_irq(&conf->device_lock); 6134 /* '2' tells resync/reshape to pause so that all 6135 * active stripes can drain 6136 */ 6137 conf->quiesce = 2; 6138 wait_event_lock_irq(conf->wait_for_stripe, 6139 atomic_read(&conf->active_stripes) == 0 && 6140 atomic_read(&conf->active_aligned_reads) == 0, 6141 conf->device_lock); 6142 conf->quiesce = 1; 6143 spin_unlock_irq(&conf->device_lock); 6144 /* allow reshape to continue */ 6145 wake_up(&conf->wait_for_overlap); 6146 break; 6147 6148 case 0: /* re-enable writes */ 6149 spin_lock_irq(&conf->device_lock); 6150 conf->quiesce = 0; 6151 wake_up(&conf->wait_for_stripe); 6152 wake_up(&conf->wait_for_overlap); 6153 spin_unlock_irq(&conf->device_lock); 6154 break; 6155 } 6156 } 6157 6158 6159 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 6160 { 6161 struct r0conf *raid0_conf = mddev->private; 6162 sector_t sectors; 6163 6164 /* for raid0 takeover only one zone is supported */ 6165 if (raid0_conf->nr_strip_zones > 1) { 6166 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 6167 mdname(mddev)); 6168 return ERR_PTR(-EINVAL); 6169 } 6170 6171 sectors = raid0_conf->strip_zone[0].zone_end; 6172 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 6173 mddev->dev_sectors = sectors; 6174 mddev->new_level = level; 6175 mddev->new_layout = ALGORITHM_PARITY_N; 6176 mddev->new_chunk_sectors = mddev->chunk_sectors; 6177 mddev->raid_disks += 1; 6178 mddev->delta_disks = 1; 6179 /* make sure it will be not marked as dirty */ 6180 mddev->recovery_cp = MaxSector; 6181 6182 return setup_conf(mddev); 6183 } 6184 6185 6186 static void *raid5_takeover_raid1(struct mddev *mddev) 6187 { 6188 int chunksect; 6189 6190 if (mddev->raid_disks != 2 || 6191 mddev->degraded > 1) 6192 return ERR_PTR(-EINVAL); 6193 6194 /* Should check if there are write-behind devices? */ 6195 6196 chunksect = 64*2; /* 64K by default */ 6197 6198 /* The array must be an exact multiple of chunksize */ 6199 while (chunksect && (mddev->array_sectors & (chunksect-1))) 6200 chunksect >>= 1; 6201 6202 if ((chunksect<<9) < STRIPE_SIZE) 6203 /* array size does not allow a suitable chunk size */ 6204 return ERR_PTR(-EINVAL); 6205 6206 mddev->new_level = 5; 6207 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 6208 mddev->new_chunk_sectors = chunksect; 6209 6210 return setup_conf(mddev); 6211 } 6212 6213 static void *raid5_takeover_raid6(struct mddev *mddev) 6214 { 6215 int new_layout; 6216 6217 switch (mddev->layout) { 6218 case ALGORITHM_LEFT_ASYMMETRIC_6: 6219 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 6220 break; 6221 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6222 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 6223 break; 6224 case ALGORITHM_LEFT_SYMMETRIC_6: 6225 new_layout = ALGORITHM_LEFT_SYMMETRIC; 6226 break; 6227 case ALGORITHM_RIGHT_SYMMETRIC_6: 6228 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 6229 break; 6230 case ALGORITHM_PARITY_0_6: 6231 new_layout = ALGORITHM_PARITY_0; 6232 break; 6233 case ALGORITHM_PARITY_N: 6234 new_layout = ALGORITHM_PARITY_N; 6235 break; 6236 default: 6237 return ERR_PTR(-EINVAL); 6238 } 6239 mddev->new_level = 5; 6240 mddev->new_layout = new_layout; 6241 mddev->delta_disks = -1; 6242 mddev->raid_disks -= 1; 6243 return setup_conf(mddev); 6244 } 6245 6246 6247 static int raid5_check_reshape(struct mddev *mddev) 6248 { 6249 /* For a 2-drive array, the layout and chunk size can be changed 6250 * immediately as not restriping is needed. 6251 * For larger arrays we record the new value - after validation 6252 * to be used by a reshape pass. 6253 */ 6254 struct r5conf *conf = mddev->private; 6255 int new_chunk = mddev->new_chunk_sectors; 6256 6257 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 6258 return -EINVAL; 6259 if (new_chunk > 0) { 6260 if (!is_power_of_2(new_chunk)) 6261 return -EINVAL; 6262 if (new_chunk < (PAGE_SIZE>>9)) 6263 return -EINVAL; 6264 if (mddev->array_sectors & (new_chunk-1)) 6265 /* not factor of array size */ 6266 return -EINVAL; 6267 } 6268 6269 /* They look valid */ 6270 6271 if (mddev->raid_disks == 2) { 6272 /* can make the change immediately */ 6273 if (mddev->new_layout >= 0) { 6274 conf->algorithm = mddev->new_layout; 6275 mddev->layout = mddev->new_layout; 6276 } 6277 if (new_chunk > 0) { 6278 conf->chunk_sectors = new_chunk ; 6279 mddev->chunk_sectors = new_chunk; 6280 } 6281 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6282 md_wakeup_thread(mddev->thread); 6283 } 6284 return check_reshape(mddev); 6285 } 6286 6287 static int raid6_check_reshape(struct mddev *mddev) 6288 { 6289 int new_chunk = mddev->new_chunk_sectors; 6290 6291 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 6292 return -EINVAL; 6293 if (new_chunk > 0) { 6294 if (!is_power_of_2(new_chunk)) 6295 return -EINVAL; 6296 if (new_chunk < (PAGE_SIZE >> 9)) 6297 return -EINVAL; 6298 if (mddev->array_sectors & (new_chunk-1)) 6299 /* not factor of array size */ 6300 return -EINVAL; 6301 } 6302 6303 /* They look valid */ 6304 return check_reshape(mddev); 6305 } 6306 6307 static void *raid5_takeover(struct mddev *mddev) 6308 { 6309 /* raid5 can take over: 6310 * raid0 - if there is only one strip zone - make it a raid4 layout 6311 * raid1 - if there are two drives. We need to know the chunk size 6312 * raid4 - trivial - just use a raid4 layout. 6313 * raid6 - Providing it is a *_6 layout 6314 */ 6315 if (mddev->level == 0) 6316 return raid45_takeover_raid0(mddev, 5); 6317 if (mddev->level == 1) 6318 return raid5_takeover_raid1(mddev); 6319 if (mddev->level == 4) { 6320 mddev->new_layout = ALGORITHM_PARITY_N; 6321 mddev->new_level = 5; 6322 return setup_conf(mddev); 6323 } 6324 if (mddev->level == 6) 6325 return raid5_takeover_raid6(mddev); 6326 6327 return ERR_PTR(-EINVAL); 6328 } 6329 6330 static void *raid4_takeover(struct mddev *mddev) 6331 { 6332 /* raid4 can take over: 6333 * raid0 - if there is only one strip zone 6334 * raid5 - if layout is right 6335 */ 6336 if (mddev->level == 0) 6337 return raid45_takeover_raid0(mddev, 4); 6338 if (mddev->level == 5 && 6339 mddev->layout == ALGORITHM_PARITY_N) { 6340 mddev->new_layout = 0; 6341 mddev->new_level = 4; 6342 return setup_conf(mddev); 6343 } 6344 return ERR_PTR(-EINVAL); 6345 } 6346 6347 static struct md_personality raid5_personality; 6348 6349 static void *raid6_takeover(struct mddev *mddev) 6350 { 6351 /* Currently can only take over a raid5. We map the 6352 * personality to an equivalent raid6 personality 6353 * with the Q block at the end. 6354 */ 6355 int new_layout; 6356 6357 if (mddev->pers != &raid5_personality) 6358 return ERR_PTR(-EINVAL); 6359 if (mddev->degraded > 1) 6360 return ERR_PTR(-EINVAL); 6361 if (mddev->raid_disks > 253) 6362 return ERR_PTR(-EINVAL); 6363 if (mddev->raid_disks < 3) 6364 return ERR_PTR(-EINVAL); 6365 6366 switch (mddev->layout) { 6367 case ALGORITHM_LEFT_ASYMMETRIC: 6368 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 6369 break; 6370 case ALGORITHM_RIGHT_ASYMMETRIC: 6371 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 6372 break; 6373 case ALGORITHM_LEFT_SYMMETRIC: 6374 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 6375 break; 6376 case ALGORITHM_RIGHT_SYMMETRIC: 6377 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 6378 break; 6379 case ALGORITHM_PARITY_0: 6380 new_layout = ALGORITHM_PARITY_0_6; 6381 break; 6382 case ALGORITHM_PARITY_N: 6383 new_layout = ALGORITHM_PARITY_N; 6384 break; 6385 default: 6386 return ERR_PTR(-EINVAL); 6387 } 6388 mddev->new_level = 6; 6389 mddev->new_layout = new_layout; 6390 mddev->delta_disks = 1; 6391 mddev->raid_disks += 1; 6392 return setup_conf(mddev); 6393 } 6394 6395 6396 static struct md_personality raid6_personality = 6397 { 6398 .name = "raid6", 6399 .level = 6, 6400 .owner = THIS_MODULE, 6401 .make_request = make_request, 6402 .run = run, 6403 .stop = stop, 6404 .status = status, 6405 .error_handler = error, 6406 .hot_add_disk = raid5_add_disk, 6407 .hot_remove_disk= raid5_remove_disk, 6408 .spare_active = raid5_spare_active, 6409 .sync_request = sync_request, 6410 .resize = raid5_resize, 6411 .size = raid5_size, 6412 .check_reshape = raid6_check_reshape, 6413 .start_reshape = raid5_start_reshape, 6414 .finish_reshape = raid5_finish_reshape, 6415 .quiesce = raid5_quiesce, 6416 .takeover = raid6_takeover, 6417 }; 6418 static struct md_personality raid5_personality = 6419 { 6420 .name = "raid5", 6421 .level = 5, 6422 .owner = THIS_MODULE, 6423 .make_request = make_request, 6424 .run = run, 6425 .stop = stop, 6426 .status = status, 6427 .error_handler = error, 6428 .hot_add_disk = raid5_add_disk, 6429 .hot_remove_disk= raid5_remove_disk, 6430 .spare_active = raid5_spare_active, 6431 .sync_request = sync_request, 6432 .resize = raid5_resize, 6433 .size = raid5_size, 6434 .check_reshape = raid5_check_reshape, 6435 .start_reshape = raid5_start_reshape, 6436 .finish_reshape = raid5_finish_reshape, 6437 .quiesce = raid5_quiesce, 6438 .takeover = raid5_takeover, 6439 }; 6440 6441 static struct md_personality raid4_personality = 6442 { 6443 .name = "raid4", 6444 .level = 4, 6445 .owner = THIS_MODULE, 6446 .make_request = make_request, 6447 .run = run, 6448 .stop = stop, 6449 .status = status, 6450 .error_handler = error, 6451 .hot_add_disk = raid5_add_disk, 6452 .hot_remove_disk= raid5_remove_disk, 6453 .spare_active = raid5_spare_active, 6454 .sync_request = sync_request, 6455 .resize = raid5_resize, 6456 .size = raid5_size, 6457 .check_reshape = raid5_check_reshape, 6458 .start_reshape = raid5_start_reshape, 6459 .finish_reshape = raid5_finish_reshape, 6460 .quiesce = raid5_quiesce, 6461 .takeover = raid4_takeover, 6462 }; 6463 6464 static int __init raid5_init(void) 6465 { 6466 register_md_personality(&raid6_personality); 6467 register_md_personality(&raid5_personality); 6468 register_md_personality(&raid4_personality); 6469 return 0; 6470 } 6471 6472 static void raid5_exit(void) 6473 { 6474 unregister_md_personality(&raid6_personality); 6475 unregister_md_personality(&raid5_personality); 6476 unregister_md_personality(&raid4_personality); 6477 } 6478 6479 module_init(raid5_init); 6480 module_exit(raid5_exit); 6481 MODULE_LICENSE("GPL"); 6482 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 6483 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 6484 MODULE_ALIAS("md-raid5"); 6485 MODULE_ALIAS("md-raid4"); 6486 MODULE_ALIAS("md-level-5"); 6487 MODULE_ALIAS("md-level-4"); 6488 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 6489 MODULE_ALIAS("md-raid6"); 6490 MODULE_ALIAS("md-level-6"); 6491 6492 /* This used to be two separate modules, they were: */ 6493 MODULE_ALIAS("raid5"); 6494 MODULE_ALIAS("raid6"); 6495