/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}
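
/*
 * Worked example (illustrative, assuming 4K pages on a 64-bit build):
 * PAGE_SIZE = 4096, so STRIPE_SIZE = 4096 bytes, STRIPE_SHIFT = 12 - 9 = 3
 * and STRIPE_SECTORS = 8.  With sizeof(struct hlist_head) == 8,
 * NR_HASH = 4096 / 8 = 512 buckets and HASH_MASK = 511, so a stripe at
 * sector 0x1230 hashes to (0x1230 >> 3) & 0x1ff = 0x46.
 */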

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device.
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits.
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}
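
/*
 * Layout sketch of the packed counter above (illustrative):
 *
 *   bi_phys_segments (32 bits, driven atomically):
 *   | 31 .. 16  processed stripes | 15 .. 0  active stripes |
 *
 * e.g. a value of 0x00030002 means three stripes have been fully
 * processed while two are still active.  The active count is "biased":
 * the submitter holds one extra reference until it has finished
 * attaching the bio to stripes, so the count cannot hit zero early.
 */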

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			list_add_tail(&sh->lru, &conf->inactive_list);
			wake_up(&conf->wait_for_stripe);
			if (conf->retry_read_aligned)
				md_wakeup_thread(conf->mddev->thread);
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh);
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;

	local_irq_save(flags);
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		do_release_stripe(conf, sh);
		spin_unlock(&conf->device_lock);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}
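
/*
 * Rough lifecycle sketch (illustrative): a stripe_head sits in the hash
 * table while it holds valid data for a sector, and on exactly one list
 * at a time (handle/delayed/bitmap while work is pending, inactive when
 * idle).  All reference drops funnel through __release_stripe() or
 * release_stripe(), which is why do_release_stripe() alone decides which
 * list a quiescent stripe lands on.
 */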

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes * 3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state)
				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);

	return sh;
}
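
/*
 * Typical caller pattern (illustrative sketch only):
 *
 *	sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
 *	if (sh) {
 *		... attach bios / set STRIPE_HANDLE ...
 *		release_stripe(sh);
 *	}
 *
 * i.e. every successful get_active_stripe() must be paired with a
 * release_stripe(), since the reference taken here is what keeps the
 * stripe off the inactive list.
 */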

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		bi->bi_rw = rw;
		rbi->bi_rw = rw;
		if (rw & WRITE) {
			bi->bi_end_io = raid5_end_write_request;
			rbi->bi_end_io = raid5_end_write_request;
		} else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_NOMERGE;

			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_idx = 0;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
			trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
					      bi, disk_devt(conf->mddev->gendisk),
					      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			rbi->bi_bdev = rrdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_flags = 1 << BIO_UPTODATE;
			rbi->bi_idx = 0;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			rbi->bi_next = NULL;
			trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
					      rbi, disk_devt(conf->mddev->gendisk),
					      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				 bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
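
/*
 * Offset arithmetic used by async_copy_data() below, as a worked example
 * (illustrative, assuming 4K pages so STRIPE_SECTORS = 8): if a bio
 * starts at sector 100 and the stripe device covers sectors 96..103,
 * then page_offset = (100 - 96) * 512 = 2048, i.e. the copy lands
 * half-way into the stripe page.  A bio starting before the device's
 * range instead yields a negative page_offset, and the leading bytes of
 * each bio_vec are skipped via b_offset.
 */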

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
						     dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}
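
/*
 * The identity behind ops_run_compute5() (background note): with parity
 *
 *	P = D0 ^ D1 ^ ... ^ Dn-1
 *
 * any single missing block equals the XOR of all the others, e.g.
 *	D1 = P ^ D0 ^ D2 ^ ... ^ Dn-1
 * which is exactly the async_xor() over every page except the target.
 */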

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}
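
/*
 * Slot layout produced by set_syndrome_sources(), as a worked example
 * (illustrative, md-native layout): for a 6-device raid6 stripe with
 * pd_idx = 4 and qd_idx = 5, raid6_d0() starts the walk at disk 0 and
 * the result is
 *
 *	srcs[0..3] = data disks 0..3, srcs[4] = P, srcs[5] = Q
 *
 * with syndrome_disks = 4 returned.  For the ddf layout every disk,
 * including the P and Q positions, consumes a slot, so syndrome_disks
 * equals disks and the P/Q data slots stay NULL (treated as zeros).
 */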

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}
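
/*
 * Summary of the four two-failure cases handled above (background note;
 * slots syndrome_disks and syndrome_disks+1 are P and Q respectively):
 *
 *	P+Q missing -> regenerate both with async_gen_syndrome()
 *	D+Q missing -> rebuild D by XOR against P, then regenerate Q
 *	D+P missing -> async_raid6_datap_recov()
 *	D+D missing -> async_raid6_2data_recov()
 */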

static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_rw & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (wbi->bi_rw & REQ_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else
					tx = async_copy_data(1, wbi, dev->page,
						dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard)
				set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}
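
/*
 * The two parity paths above, in equations (background note):
 *
 *	read-modify-write (prexor): P_new = P_old ^ D_old ^ D_new
 *	reconstruct-write:          P_new = D0 ^ D1 ^ ... ^ Dn-1
 *
 * ops_run_prexor() has already folded P_old ^ D_old into the parity
 * page, so the prexor branch here only needs to XOR in the freshly
 * drained data blocks.
 */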

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count, i;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

	count = set_syndrome_sources(blocks, sh);

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
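
/*
 * Note on the checks above (background, per the async_tx API as we
 * understand it): async_xor_val() and async_syndrome_val() do not modify
 * the parity pages; they only set bits in sh->ops.zero_sum_result
 * (SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT) when the recomputed value
 * disagrees with what is on disk, i.e. a zero result means the stripe
 * is consistent.
 */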

static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	if (overlap_clear)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}

#ifdef CONFIG_MULTICORE_RAID456
static void async_run_ops(void *param, async_cookie_t cookie)
{
	struct stripe_head *sh = param;
	unsigned long ops_request = sh->ops.request;

	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
	wake_up(&sh->ops.wait_for_ops);

	__raid_run_ops(sh, ops_request);
	release_stripe(sh);
}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	/* since handle_stripe can be called outside of raid5d context
	 * we need to ensure sh->ops.request is de-staged before another
	 * request arrives
	 */
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif

static int grow_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	spin_lock_init(&sh->stripe_lock);

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}
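
/*
 * Design note (our reading of the code above): grow_one_stripe()
 * deliberately initialises the new stripe as if it were in active use
 * (count = 1, active_stripes incremented) and then drops that reference
 * with release_stripe().  That reuses the normal release path to place
 * the stripe on the inactive list and wake any waiters, instead of
 * duplicating that logic here.
 */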

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}
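
/*
 * Worked example (illustrative, assuming a 64-bit build where both a
 * struct page pointer and an addr_conv_t take 8 bytes): for an 8-device
 * array, scribble_len(8) = 8 * (8+2) + 8 * (8+2) = 160 bytes per CPU,
 * i.e. ten page pointers followed by the ten-entry address-conversion
 * area that to_addr_conv() points into.
 */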

static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif
		spin_lock_init(&nsh->stripe_lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock);
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for (i = 0; i < conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for ( ; i < newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i = 0; i < conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	get_online_cpus();
	conf->scribble_len = scribble_len(newsize);
	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;
		void *scribble;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		scribble = kmalloc(conf->scribble_len, GFP_NOIO);

		if (scribble) {
			kfree(percpu->scribble);
			percpu->scribble = scribble;
		} else {
			err = -ENOMEM;
			break;
		}
	}
	put_online_cpus();

	/* Step 4, return new stripes to service */
	while (!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);

		for (i = conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section passed, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}

static int drop_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;

	spin_lock_irq(&conf->device_lock);
	sh = get_free_stripe(conf);
	spin_unlock_irq(&conf->device_lock);
	if (!sh)
		return 0;
	BUG_ON(atomic_read(&sh->count));
	shrink_buffers(sh);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

static void shrink_stripes(struct r5conf *conf)
{
	while (drop_one_stripe(conf))
		;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}

static void raid5_end_read_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev = NULL;
	sector_t s;

	for (i = 0; i < disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}
	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/* If replacement finished while this request was outstanding,
		 * 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
		rdev = conf->disks[i].replacement;
	if (!rdev)
		rdev = conf->disks[i].rdev;

	if (use_new_offset(conf, sh))
		s = sh->sector + rdev->new_data_offset;
	else
		s = sh->sector + rdev->data_offset;
	if (uptodate) {
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* Note that this cannot happen on a
			 * replacement device.  We just fail those on
			 * any error
			 */
			printk_ratelimited(
				KERN_INFO
				"md/raid:%s: read error corrected"
				" (%lu sectors at %llu on %s)\n",
				mdname(conf->mddev), STRIPE_SECTORS,
				(unsigned long long)s,
				bdevname(rdev->bdev, b));
			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);

		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
		const char *bdn = bdevname(rdev->bdev, b);
		int retry = 0;
		int set_bad = 0;

		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&rdev->read_errors);
		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error on replacement device "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		else if (conf->mddev->degraded >= conf->max_degraded) {
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error not correctable "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			/* Oh, no!!! */
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error NOT corrected!! "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
		else
			retry = 1;
		if (retry)
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
				set_bit(R5_ReadError, &sh->dev[i].flags);
				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
			} else
				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
		else {
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			if (!(set_bad
			      && test_bit(In_sync, &rdev->flags)
			      && rdev_set_badblocks(
				      rdev, sh->sector, STRIPE_SECTORS, 0)))
				md_error(conf->mddev, rdev);
		}
	}
	rdev_dec_pending(rdev, conf->mddev);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
1895 */ 1896 rdev = conf->disks[i].rdev; 1897 break; 1898 } 1899 } 1900 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1901 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1902 uptodate); 1903 if (i == disks) { 1904 BUG(); 1905 return; 1906 } 1907 1908 if (replacement) { 1909 if (!uptodate) 1910 md_error(conf->mddev, rdev); 1911 else if (is_badblock(rdev, sh->sector, 1912 STRIPE_SECTORS, 1913 &first_bad, &bad_sectors)) 1914 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1915 } else { 1916 if (!uptodate) { 1917 set_bit(WriteErrorSeen, &rdev->flags); 1918 set_bit(R5_WriteError, &sh->dev[i].flags); 1919 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1920 set_bit(MD_RECOVERY_NEEDED, 1921 &rdev->mddev->recovery); 1922 } else if (is_badblock(rdev, sh->sector, 1923 STRIPE_SECTORS, 1924 &first_bad, &bad_sectors)) 1925 set_bit(R5_MadeGood, &sh->dev[i].flags); 1926 } 1927 rdev_dec_pending(rdev, conf->mddev); 1928 1929 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1930 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1931 set_bit(STRIPE_HANDLE, &sh->state); 1932 release_stripe(sh); 1933 } 1934 1935 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1936 1937 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1938 { 1939 struct r5dev *dev = &sh->dev[i]; 1940 1941 bio_init(&dev->req); 1942 dev->req.bi_io_vec = &dev->vec; 1943 dev->req.bi_vcnt++; 1944 dev->req.bi_max_vecs++; 1945 dev->req.bi_private = sh; 1946 dev->vec.bv_page = dev->page; 1947 1948 bio_init(&dev->rreq); 1949 dev->rreq.bi_io_vec = &dev->rvec; 1950 dev->rreq.bi_vcnt++; 1951 dev->rreq.bi_max_vecs++; 1952 dev->rreq.bi_private = sh; 1953 dev->rvec.bv_page = dev->page; 1954 1955 dev->flags = 0; 1956 dev->sector = compute_blocknr(sh, i, previous); 1957 } 1958 1959 static void error(struct mddev *mddev, struct md_rdev *rdev) 1960 { 1961 char b[BDEVNAME_SIZE]; 1962 struct r5conf *conf = mddev->private; 1963 unsigned long flags; 1964 pr_debug("raid456: error called\n"); 1965 1966 spin_lock_irqsave(&conf->device_lock, flags); 1967 clear_bit(In_sync, &rdev->flags); 1968 mddev->degraded = calc_degraded(conf); 1969 spin_unlock_irqrestore(&conf->device_lock, flags); 1970 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1971 1972 set_bit(Blocked, &rdev->flags); 1973 set_bit(Faulty, &rdev->flags); 1974 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1975 printk(KERN_ALERT 1976 "md/raid:%s: Disk failure on %s, disabling device.\n" 1977 "md/raid:%s: Operation continuing on %d devices.\n", 1978 mdname(mddev), 1979 bdevname(rdev->bdev, b), 1980 mdname(mddev), 1981 conf->raid_disks - mddev->degraded); 1982 } 1983 1984 /* 1985 * Input: a 'big' sector number, 1986 * Output: index of the data and parity disk, and the sector # in them. 1987 */ 1988 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1989 int previous, int *dd_idx, 1990 struct stripe_head *sh) 1991 { 1992 sector_t stripe, stripe2; 1993 sector_t chunk_number; 1994 unsigned int chunk_offset; 1995 int pd_idx, qd_idx; 1996 int ddf_layout = 0; 1997 sector_t new_sector; 1998 int algorithm = previous ? conf->prev_algo 1999 : conf->algorithm; 2000 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2001 : conf->chunk_sectors; 2002 int raid_disks = previous ? 
					conf->previous_raid_disks
					: conf->raid_disks;
	int data_disks = raid_disks - conf->max_degraded;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number;
	*dd_idx = sector_div(stripe, data_disks);
	stripe2 = stripe;
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	pd_idx = qd_idx = -1;
	switch(conf->level) {
	case 4:
		pd_idx = data_disks;
		break;
	case 5:
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			(*dd_idx)++;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			break;
		default:
			BUG();
		}
		break;
	case 6:

		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
			 * of blocks for computing Q is different.
			 */
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_RESTART:
			/* Same as left_asymmetric, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
			stripe2 += 1;
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			ddf_layout = 1;
			break;

		case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on last device */
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_LEFT_SYMMETRIC_6:
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_SYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_PARITY_0_6:
			pd_idx = 0;
			(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		default:
			BUG();
		}
		break;
	}

	if (sh) {
		sh->pd_idx = pd_idx;
		sh->qd_idx = qd_idx;
		sh->ddf_layout = ddf_layout;
	}
	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int raid_disks = sh->disks;
	int data_disks = raid_disks - conf->max_degraded;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int algorithm = previous ?
conf->prev_algo 2200 : conf->algorithm; 2201 sector_t stripe; 2202 int chunk_offset; 2203 sector_t chunk_number; 2204 int dummy1, dd_idx = i; 2205 sector_t r_sector; 2206 struct stripe_head sh2; 2207 2208 2209 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2210 stripe = new_sector; 2211 2212 if (i == sh->pd_idx) 2213 return 0; 2214 switch(conf->level) { 2215 case 4: break; 2216 case 5: 2217 switch (algorithm) { 2218 case ALGORITHM_LEFT_ASYMMETRIC: 2219 case ALGORITHM_RIGHT_ASYMMETRIC: 2220 if (i > sh->pd_idx) 2221 i--; 2222 break; 2223 case ALGORITHM_LEFT_SYMMETRIC: 2224 case ALGORITHM_RIGHT_SYMMETRIC: 2225 if (i < sh->pd_idx) 2226 i += raid_disks; 2227 i -= (sh->pd_idx + 1); 2228 break; 2229 case ALGORITHM_PARITY_0: 2230 i -= 1; 2231 break; 2232 case ALGORITHM_PARITY_N: 2233 break; 2234 default: 2235 BUG(); 2236 } 2237 break; 2238 case 6: 2239 if (i == sh->qd_idx) 2240 return 0; /* It is the Q disk */ 2241 switch (algorithm) { 2242 case ALGORITHM_LEFT_ASYMMETRIC: 2243 case ALGORITHM_RIGHT_ASYMMETRIC: 2244 case ALGORITHM_ROTATING_ZERO_RESTART: 2245 case ALGORITHM_ROTATING_N_RESTART: 2246 if (sh->pd_idx == raid_disks-1) 2247 i--; /* Q D D D P */ 2248 else if (i > sh->pd_idx) 2249 i -= 2; /* D D P Q D */ 2250 break; 2251 case ALGORITHM_LEFT_SYMMETRIC: 2252 case ALGORITHM_RIGHT_SYMMETRIC: 2253 if (sh->pd_idx == raid_disks-1) 2254 i--; /* Q D D D P */ 2255 else { 2256 /* D D P Q D */ 2257 if (i < sh->pd_idx) 2258 i += raid_disks; 2259 i -= (sh->pd_idx + 2); 2260 } 2261 break; 2262 case ALGORITHM_PARITY_0: 2263 i -= 2; 2264 break; 2265 case ALGORITHM_PARITY_N: 2266 break; 2267 case ALGORITHM_ROTATING_N_CONTINUE: 2268 /* Like left_symmetric, but P is before Q */ 2269 if (sh->pd_idx == 0) 2270 i--; /* P D D D Q */ 2271 else { 2272 /* D D Q P D */ 2273 if (i < sh->pd_idx) 2274 i += raid_disks; 2275 i -= (sh->pd_idx + 1); 2276 } 2277 break; 2278 case ALGORITHM_LEFT_ASYMMETRIC_6: 2279 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2280 if (i > sh->pd_idx) 2281 i--; 2282 break; 2283 case ALGORITHM_LEFT_SYMMETRIC_6: 2284 case ALGORITHM_RIGHT_SYMMETRIC_6: 2285 if (i < sh->pd_idx) 2286 i += data_disks + 1; 2287 i -= (sh->pd_idx + 1); 2288 break; 2289 case ALGORITHM_PARITY_0_6: 2290 i -= 1; 2291 break; 2292 default: 2293 BUG(); 2294 } 2295 break; 2296 } 2297 2298 chunk_number = stripe * data_disks + i; 2299 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2300 2301 check = raid5_compute_sector(conf, r_sector, 2302 previous, &dummy1, &sh2); 2303 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2304 || sh2.qd_idx != sh->qd_idx) { 2305 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2306 mdname(conf->mddev)); 2307 return 0; 2308 } 2309 return r_sector; 2310 } 2311 2312 2313 static void 2314 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2315 int rcw, int expand) 2316 { 2317 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2318 struct r5conf *conf = sh->raid_conf; 2319 int level = conf->level; 2320 2321 if (rcw) { 2322 /* if we are not expanding this is a proper write request, and 2323 * there will be bios with new data to be drained into the 2324 * stripe cache 2325 */ 2326 if (!expand) { 2327 sh->reconstruct_state = reconstruct_state_drain_run; 2328 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2329 } else 2330 sh->reconstruct_state = reconstruct_state_run; 2331 2332 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2333 2334 for (i = disks; i--; ) { 2335 struct r5dev *dev = &sh->dev[i]; 2336 2337 if (dev->towrite) { 2338 
				set_bit(R5_LOCKED, &dev->flags);
				set_bit(R5_Wantdrain, &dev->flags);
				if (!expand)
					clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
			}
		}
		if (s->locked + conf->max_degraded == disks)
			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
				atomic_inc(&conf->pending_full_writes);
	} else {
		BUG_ON(level == 6);
		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));

		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i == pd_idx)
				continue;

			if (dev->towrite &&
			    (test_bit(R5_UPTODATE, &dev->flags) ||
			     test_bit(R5_Wantcompute, &dev->flags))) {
				set_bit(R5_Wantdrain, &dev->flags);
				set_bit(R5_LOCKED, &dev->flags);
				clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
			}
		}
	}

	/* keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
	s->locked++;

	if (level == 6) {
		int qd_idx = sh->qd_idx;
		struct r5dev *dev = &sh->dev[qd_idx];

		set_bit(R5_LOCKED, &dev->flags);
		clear_bit(R5_UPTODATE, &dev->flags);
		s->locked++;
	}

	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
		__func__, (unsigned long long)sh->sector,
		s->locked, s->ops_request);
}

/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
	struct bio **bip;
	struct r5conf *conf = sh->raid_conf;
	int firstwrite=0;

	pr_debug("adding bi b#%llu to stripe s#%llu\n",
		(unsigned long long)bi->bi_sector,
		(unsigned long long)sh->sector);

	/*
	 * If several bios share a stripe, bi_phys_segments acts as a
	 * reference count to avoid races. The reference count should already
	 * be increased before this function is called (for example, in
	 * make_request()), so another bio sharing this stripe cannot free
	 * the stripe. If a stripe is only referenced by one bio, the stripe
	 * lock will protect it.
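	 *
	 * A minimal sketch of that convention, using the helpers defined
	 * near the top of this file (illustrative only):
	 *
	 *	raid5_set_bi_stripes(bi, 1);		(one active reference)
	 *	raid5_inc_bi_active_stripes(bi);	(one more per extra stripe)
	 *	...
	 *	if (!raid5_dec_bi_active_stripes(bi))
	 *		(last reference dropped - bio may be returned)
	 *
	 * The active count lives in the bottom 16 bits of bi_phys_segments,
	 * the processed count in the top 16 bits.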
2417 */ 2418 spin_lock_irq(&sh->stripe_lock); 2419 if (forwrite) { 2420 bip = &sh->dev[dd_idx].towrite; 2421 if (*bip == NULL) 2422 firstwrite = 1; 2423 } else 2424 bip = &sh->dev[dd_idx].toread; 2425 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2426 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2427 goto overlap; 2428 bip = & (*bip)->bi_next; 2429 } 2430 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2431 goto overlap; 2432 2433 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2434 if (*bip) 2435 bi->bi_next = *bip; 2436 *bip = bi; 2437 raid5_inc_bi_active_stripes(bi); 2438 2439 if (forwrite) { 2440 /* check if page is covered */ 2441 sector_t sector = sh->dev[dd_idx].sector; 2442 for (bi=sh->dev[dd_idx].towrite; 2443 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2444 bi && bi->bi_sector <= sector; 2445 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2446 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2447 sector = bi->bi_sector + (bi->bi_size>>9); 2448 } 2449 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2450 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2451 } 2452 2453 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2454 (unsigned long long)(*bip)->bi_sector, 2455 (unsigned long long)sh->sector, dd_idx); 2456 spin_unlock_irq(&sh->stripe_lock); 2457 2458 if (conf->mddev->bitmap && firstwrite) { 2459 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2460 STRIPE_SECTORS, 0); 2461 sh->bm_seq = conf->seq_flush+1; 2462 set_bit(STRIPE_BIT_DELAY, &sh->state); 2463 } 2464 return 1; 2465 2466 overlap: 2467 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2468 spin_unlock_irq(&sh->stripe_lock); 2469 return 0; 2470 } 2471 2472 static void end_reshape(struct r5conf *conf); 2473 2474 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2475 struct stripe_head *sh) 2476 { 2477 int sectors_per_chunk = 2478 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2479 int dd_idx; 2480 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2481 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2482 2483 raid5_compute_sector(conf, 2484 stripe * (disks - conf->max_degraded) 2485 *sectors_per_chunk + chunk_offset, 2486 previous, 2487 &dd_idx, sh); 2488 } 2489 2490 static void 2491 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2492 struct stripe_head_state *s, int disks, 2493 struct bio **return_bi) 2494 { 2495 int i; 2496 for (i = disks; i--; ) { 2497 struct bio *bi; 2498 int bitmap_end = 0; 2499 2500 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2501 struct md_rdev *rdev; 2502 rcu_read_lock(); 2503 rdev = rcu_dereference(conf->disks[i].rdev); 2504 if (rdev && test_bit(In_sync, &rdev->flags)) 2505 atomic_inc(&rdev->nr_pending); 2506 else 2507 rdev = NULL; 2508 rcu_read_unlock(); 2509 if (rdev) { 2510 if (!rdev_set_badblocks( 2511 rdev, 2512 sh->sector, 2513 STRIPE_SECTORS, 0)) 2514 md_error(conf->mddev, rdev); 2515 rdev_dec_pending(rdev, conf->mddev); 2516 } 2517 } 2518 spin_lock_irq(&sh->stripe_lock); 2519 /* fail all writes first */ 2520 bi = sh->dev[i].towrite; 2521 sh->dev[i].towrite = NULL; 2522 spin_unlock_irq(&sh->stripe_lock); 2523 if (bi) 2524 bitmap_end = 1; 2525 2526 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2527 wake_up(&conf->wait_for_overlap); 2528 2529 while (bi && bi->bi_sector < 2530 sh->dev[i].sector + STRIPE_SECTORS) { 2531 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2532 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2533 if (!raid5_dec_bi_active_stripes(bi)) { 2534 md_write_end(conf->mddev); 2535 bi->bi_next = *return_bi; 2536 *return_bi = bi; 2537 } 2538 bi = nextbi; 2539 } 2540 if (bitmap_end) 2541 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2542 STRIPE_SECTORS, 0, 0); 2543 bitmap_end = 0; 2544 /* and fail all 'written' */ 2545 bi = sh->dev[i].written; 2546 sh->dev[i].written = NULL; 2547 if (bi) bitmap_end = 1; 2548 while (bi && bi->bi_sector < 2549 sh->dev[i].sector + STRIPE_SECTORS) { 2550 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2551 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2552 if (!raid5_dec_bi_active_stripes(bi)) { 2553 md_write_end(conf->mddev); 2554 bi->bi_next = *return_bi; 2555 *return_bi = bi; 2556 } 2557 bi = bi2; 2558 } 2559 2560 /* fail any reads if this device is non-operational and 2561 * the data has not reached the cache yet. 
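		 * (If R5_Wantfill is set, a biofill is already copying the
		 * cached data out to the bio, so that read can still be
		 * completed.)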
2562 */ 2563 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2564 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2565 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2566 spin_lock_irq(&sh->stripe_lock); 2567 bi = sh->dev[i].toread; 2568 sh->dev[i].toread = NULL; 2569 spin_unlock_irq(&sh->stripe_lock); 2570 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2571 wake_up(&conf->wait_for_overlap); 2572 while (bi && bi->bi_sector < 2573 sh->dev[i].sector + STRIPE_SECTORS) { 2574 struct bio *nextbi = 2575 r5_next_bio(bi, sh->dev[i].sector); 2576 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2577 if (!raid5_dec_bi_active_stripes(bi)) { 2578 bi->bi_next = *return_bi; 2579 *return_bi = bi; 2580 } 2581 bi = nextbi; 2582 } 2583 } 2584 if (bitmap_end) 2585 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2586 STRIPE_SECTORS, 0, 0); 2587 /* If we were in the middle of a write the parity block might 2588 * still be locked - so just clear all R5_LOCKED flags 2589 */ 2590 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2591 } 2592 2593 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2594 if (atomic_dec_and_test(&conf->pending_full_writes)) 2595 md_wakeup_thread(conf->mddev->thread); 2596 } 2597 2598 static void 2599 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2600 struct stripe_head_state *s) 2601 { 2602 int abort = 0; 2603 int i; 2604 2605 clear_bit(STRIPE_SYNCING, &sh->state); 2606 s->syncing = 0; 2607 s->replacing = 0; 2608 /* There is nothing more to do for sync/check/repair. 2609 * Don't even need to abort as that is handled elsewhere 2610 * if needed, and not always wanted e.g. if there is a known 2611 * bad block here. 2612 * For recover/replace we need to record a bad block on all 2613 * non-sync devices, or abort the recovery 2614 */ 2615 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2616 /* During recovery devices cannot be removed, so 2617 * locking and refcounting of rdevs is not needed 2618 */ 2619 for (i = 0; i < conf->raid_disks; i++) { 2620 struct md_rdev *rdev = conf->disks[i].rdev; 2621 if (rdev 2622 && !test_bit(Faulty, &rdev->flags) 2623 && !test_bit(In_sync, &rdev->flags) 2624 && !rdev_set_badblocks(rdev, sh->sector, 2625 STRIPE_SECTORS, 0)) 2626 abort = 1; 2627 rdev = conf->disks[i].replacement; 2628 if (rdev 2629 && !test_bit(Faulty, &rdev->flags) 2630 && !test_bit(In_sync, &rdev->flags) 2631 && !rdev_set_badblocks(rdev, sh->sector, 2632 STRIPE_SECTORS, 0)) 2633 abort = 1; 2634 } 2635 if (abort) 2636 conf->recovery_disabled = 2637 conf->mddev->recovery_disabled; 2638 } 2639 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2640 } 2641 2642 static int want_replace(struct stripe_head *sh, int disk_idx) 2643 { 2644 struct md_rdev *rdev; 2645 int rv = 0; 2646 /* Doing recovery so rcu locking not required */ 2647 rdev = sh->raid_conf->disks[disk_idx].replacement; 2648 if (rdev 2649 && !test_bit(Faulty, &rdev->flags) 2650 && !test_bit(In_sync, &rdev->flags) 2651 && (rdev->recovery_offset <= sh->sector 2652 || rdev->mddev->recovery_cp <= sh->sector)) 2653 rv = 1; 2654 2655 return rv; 2656 } 2657 2658 /* fetch_block - checks the given member device to see if its data needs 2659 * to be read or computed to satisfy a request. 
2660 * 2661 * Returns 1 when no more member devices need to be checked, otherwise returns 2662 * 0 to tell the loop in handle_stripe_fill to continue 2663 */ 2664 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2665 int disk_idx, int disks) 2666 { 2667 struct r5dev *dev = &sh->dev[disk_idx]; 2668 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2669 &sh->dev[s->failed_num[1]] }; 2670 2671 /* is the data in this block needed, and can we get it? */ 2672 if (!test_bit(R5_LOCKED, &dev->flags) && 2673 !test_bit(R5_UPTODATE, &dev->flags) && 2674 (dev->toread || 2675 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2676 s->syncing || s->expanding || 2677 (s->replacing && want_replace(sh, disk_idx)) || 2678 (s->failed >= 1 && fdev[0]->toread) || 2679 (s->failed >= 2 && fdev[1]->toread) || 2680 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2681 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2682 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2683 /* we would like to get this block, possibly by computing it, 2684 * otherwise read it if the backing disk is insync 2685 */ 2686 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2687 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2688 if ((s->uptodate == disks - 1) && 2689 (s->failed && (disk_idx == s->failed_num[0] || 2690 disk_idx == s->failed_num[1]))) { 2691 /* have disk failed, and we're requested to fetch it; 2692 * do compute it 2693 */ 2694 pr_debug("Computing stripe %llu block %d\n", 2695 (unsigned long long)sh->sector, disk_idx); 2696 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2697 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2698 set_bit(R5_Wantcompute, &dev->flags); 2699 sh->ops.target = disk_idx; 2700 sh->ops.target2 = -1; /* no 2nd target */ 2701 s->req_compute = 1; 2702 /* Careful: from this point on 'uptodate' is in the eye 2703 * of raid_run_ops which services 'compute' operations 2704 * before writes. R5_Wantcompute flags a block that will 2705 * be R5_UPTODATE by the time it is needed for a 2706 * subsequent operation. 2707 */ 2708 s->uptodate++; 2709 return 1; 2710 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2711 /* Computing 2-failure is *very* expensive; only 2712 * do it if failed >= 2 2713 */ 2714 int other; 2715 for (other = disks; other--; ) { 2716 if (other == disk_idx) 2717 continue; 2718 if (!test_bit(R5_UPTODATE, 2719 &sh->dev[other].flags)) 2720 break; 2721 } 2722 BUG_ON(other < 0); 2723 pr_debug("Computing stripe %llu blocks %d,%d\n", 2724 (unsigned long long)sh->sector, 2725 disk_idx, other); 2726 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2727 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2728 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2729 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2730 sh->ops.target = disk_idx; 2731 sh->ops.target2 = other; 2732 s->uptodate += 2; 2733 s->req_compute = 1; 2734 return 1; 2735 } else if (test_bit(R5_Insync, &dev->flags)) { 2736 set_bit(R5_LOCKED, &dev->flags); 2737 set_bit(R5_Wantread, &dev->flags); 2738 s->locked++; 2739 pr_debug("Reading block %d (sync=%d)\n", 2740 disk_idx, s->syncing); 2741 } 2742 } 2743 2744 return 0; 2745 } 2746 2747 /** 2748 * handle_stripe_fill - read or compute data to satisfy pending requests. 
2749 */ 2750 static void handle_stripe_fill(struct stripe_head *sh, 2751 struct stripe_head_state *s, 2752 int disks) 2753 { 2754 int i; 2755 2756 /* look for blocks to read/compute, skip this if a compute 2757 * is already in flight, or if the stripe contents are in the 2758 * midst of changing due to a write 2759 */ 2760 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2761 !sh->reconstruct_state) 2762 for (i = disks; i--; ) 2763 if (fetch_block(sh, s, i, disks)) 2764 break; 2765 set_bit(STRIPE_HANDLE, &sh->state); 2766 } 2767 2768 2769 /* handle_stripe_clean_event 2770 * any written block on an uptodate or failed drive can be returned. 2771 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2772 * never LOCKED, so we don't need to test 'failed' directly. 2773 */ 2774 static void handle_stripe_clean_event(struct r5conf *conf, 2775 struct stripe_head *sh, int disks, struct bio **return_bi) 2776 { 2777 int i; 2778 struct r5dev *dev; 2779 2780 for (i = disks; i--; ) 2781 if (sh->dev[i].written) { 2782 dev = &sh->dev[i]; 2783 if (!test_bit(R5_LOCKED, &dev->flags) && 2784 (test_bit(R5_UPTODATE, &dev->flags) || 2785 test_bit(R5_Discard, &dev->flags))) { 2786 /* We can return any write requests */ 2787 struct bio *wbi, *wbi2; 2788 pr_debug("Return write for disc %d\n", i); 2789 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2790 clear_bit(R5_UPTODATE, &dev->flags); 2791 wbi = dev->written; 2792 dev->written = NULL; 2793 while (wbi && wbi->bi_sector < 2794 dev->sector + STRIPE_SECTORS) { 2795 wbi2 = r5_next_bio(wbi, dev->sector); 2796 if (!raid5_dec_bi_active_stripes(wbi)) { 2797 md_write_end(conf->mddev); 2798 wbi->bi_next = *return_bi; 2799 *return_bi = wbi; 2800 } 2801 wbi = wbi2; 2802 } 2803 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2804 STRIPE_SECTORS, 2805 !test_bit(STRIPE_DEGRADED, &sh->state), 2806 0); 2807 } 2808 } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2809 clear_bit(R5_Discard, &sh->dev[i].flags); 2810 2811 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2812 if (atomic_dec_and_test(&conf->pending_full_writes)) 2813 md_wakeup_thread(conf->mddev->thread); 2814 } 2815 2816 static void handle_stripe_dirtying(struct r5conf *conf, 2817 struct stripe_head *sh, 2818 struct stripe_head_state *s, 2819 int disks) 2820 { 2821 int rmw = 0, rcw = 0, i; 2822 sector_t recovery_cp = conf->mddev->recovery_cp; 2823 2824 /* RAID6 requires 'rcw' in current implementation. 2825 * Otherwise, check whether resync is now happening or should start. 2826 * If yes, then the array is dirty (after unclean shutdown or 2827 * initial creation), so parity in some stripes might be inconsistent. 2828 * In this case, we need to always do reconstruct-write, to ensure 2829 * that in case of drive failure or read-error correction, we 2830 * generate correct data from the parity. 
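	 *
	 * As a worked example of the costs weighed below (hypothetical
	 * 5-drive RAID5, one chunk of a stripe being rewritten):
	 * read-modify-write pre-reads the old data chunk plus the old
	 * parity (2 reads), while reconstruct-write pre-reads the three
	 * data chunks not being written (3 reads), so rmw wins. For a
	 * full-stripe write, rcw needs no pre-reads at all.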
2831 */ 2832 if (conf->max_degraded == 2 || 2833 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2834 /* Calculate the real rcw later - for now make it 2835 * look like rcw is cheaper 2836 */ 2837 rcw = 1; rmw = 2; 2838 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2839 conf->max_degraded, (unsigned long long)recovery_cp, 2840 (unsigned long long)sh->sector); 2841 } else for (i = disks; i--; ) { 2842 /* would I have to read this buffer for read_modify_write */ 2843 struct r5dev *dev = &sh->dev[i]; 2844 if ((dev->towrite || i == sh->pd_idx) && 2845 !test_bit(R5_LOCKED, &dev->flags) && 2846 !(test_bit(R5_UPTODATE, &dev->flags) || 2847 test_bit(R5_Wantcompute, &dev->flags))) { 2848 if (test_bit(R5_Insync, &dev->flags)) 2849 rmw++; 2850 else 2851 rmw += 2*disks; /* cannot read it */ 2852 } 2853 /* Would I have to read this buffer for reconstruct_write */ 2854 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2855 !test_bit(R5_LOCKED, &dev->flags) && 2856 !(test_bit(R5_UPTODATE, &dev->flags) || 2857 test_bit(R5_Wantcompute, &dev->flags))) { 2858 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2859 else 2860 rcw += 2*disks; 2861 } 2862 } 2863 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2864 (unsigned long long)sh->sector, rmw, rcw); 2865 set_bit(STRIPE_HANDLE, &sh->state); 2866 if (rmw < rcw && rmw > 0) { 2867 /* prefer read-modify-write, but need to get some data */ 2868 blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", 2869 (unsigned long long)sh->sector, rmw); 2870 for (i = disks; i--; ) { 2871 struct r5dev *dev = &sh->dev[i]; 2872 if ((dev->towrite || i == sh->pd_idx) && 2873 !test_bit(R5_LOCKED, &dev->flags) && 2874 !(test_bit(R5_UPTODATE, &dev->flags) || 2875 test_bit(R5_Wantcompute, &dev->flags)) && 2876 test_bit(R5_Insync, &dev->flags)) { 2877 if ( 2878 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2879 pr_debug("Read_old block " 2880 "%d for r-m-w\n", i); 2881 set_bit(R5_LOCKED, &dev->flags); 2882 set_bit(R5_Wantread, &dev->flags); 2883 s->locked++; 2884 } else { 2885 set_bit(STRIPE_DELAYED, &sh->state); 2886 set_bit(STRIPE_HANDLE, &sh->state); 2887 } 2888 } 2889 } 2890 } 2891 if (rcw <= rmw && rcw > 0) { 2892 /* want reconstruct write, but need to get some data */ 2893 int qread =0; 2894 rcw = 0; 2895 for (i = disks; i--; ) { 2896 struct r5dev *dev = &sh->dev[i]; 2897 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2898 i != sh->pd_idx && i != sh->qd_idx && 2899 !test_bit(R5_LOCKED, &dev->flags) && 2900 !(test_bit(R5_UPTODATE, &dev->flags) || 2901 test_bit(R5_Wantcompute, &dev->flags))) { 2902 rcw++; 2903 if (!test_bit(R5_Insync, &dev->flags)) 2904 continue; /* it's a failed drive */ 2905 if ( 2906 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2907 pr_debug("Read_old block " 2908 "%d for Reconstruct\n", i); 2909 set_bit(R5_LOCKED, &dev->flags); 2910 set_bit(R5_Wantread, &dev->flags); 2911 s->locked++; 2912 qread++; 2913 } else { 2914 set_bit(STRIPE_DELAYED, &sh->state); 2915 set_bit(STRIPE_HANDLE, &sh->state); 2916 } 2917 } 2918 } 2919 if (rcw) 2920 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 2921 (unsigned long long)sh->sector, 2922 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 2923 } 2924 /* now if nothing is locked, and if we have enough data, 2925 * we can start a write request 2926 */ 2927 /* since handle_stripe can be called at any time we need to handle the 2928 * case where a compute block operation has been submitted and then a 2929 * subsequent call wants to start a write request. 
raid_run_ops only 2930 * handles the case where compute block and reconstruct are requested 2931 * simultaneously. If this is not the case then new writes need to be 2932 * held off until the compute completes. 2933 */ 2934 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2935 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2936 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2937 schedule_reconstruction(sh, s, rcw == 0, 0); 2938 } 2939 2940 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2941 struct stripe_head_state *s, int disks) 2942 { 2943 struct r5dev *dev = NULL; 2944 2945 set_bit(STRIPE_HANDLE, &sh->state); 2946 2947 switch (sh->check_state) { 2948 case check_state_idle: 2949 /* start a new check operation if there are no failures */ 2950 if (s->failed == 0) { 2951 BUG_ON(s->uptodate != disks); 2952 sh->check_state = check_state_run; 2953 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2954 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2955 s->uptodate--; 2956 break; 2957 } 2958 dev = &sh->dev[s->failed_num[0]]; 2959 /* fall through */ 2960 case check_state_compute_result: 2961 sh->check_state = check_state_idle; 2962 if (!dev) 2963 dev = &sh->dev[sh->pd_idx]; 2964 2965 /* check that a write has not made the stripe insync */ 2966 if (test_bit(STRIPE_INSYNC, &sh->state)) 2967 break; 2968 2969 /* either failed parity check, or recovery is happening */ 2970 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2971 BUG_ON(s->uptodate != disks); 2972 2973 set_bit(R5_LOCKED, &dev->flags); 2974 s->locked++; 2975 set_bit(R5_Wantwrite, &dev->flags); 2976 2977 clear_bit(STRIPE_DEGRADED, &sh->state); 2978 set_bit(STRIPE_INSYNC, &sh->state); 2979 break; 2980 case check_state_run: 2981 break; /* we will be called again upon completion */ 2982 case check_state_check_result: 2983 sh->check_state = check_state_idle; 2984 2985 /* if a failure occurred during the check operation, leave 2986 * STRIPE_INSYNC not set and let the stripe be handled again 2987 */ 2988 if (s->failed) 2989 break; 2990 2991 /* handle a successful check operation, if parity is correct 2992 * we are done. Otherwise update the mismatch count and repair 2993 * parity if !MD_RECOVERY_CHECK 2994 */ 2995 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2996 /* parity is correct (on disc, 2997 * not in buffer any more) 2998 */ 2999 set_bit(STRIPE_INSYNC, &sh->state); 3000 else { 3001 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3002 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3003 /* don't try to repair!! */ 3004 set_bit(STRIPE_INSYNC, &sh->state); 3005 else { 3006 sh->check_state = check_state_compute_run; 3007 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3008 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3009 set_bit(R5_Wantcompute, 3010 &sh->dev[sh->pd_idx].flags); 3011 sh->ops.target = sh->pd_idx; 3012 sh->ops.target2 = -1; 3013 s->uptodate++; 3014 } 3015 } 3016 break; 3017 case check_state_compute_run: 3018 break; 3019 default: 3020 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3021 __func__, sh->check_state, 3022 (unsigned long long) sh->sector); 3023 BUG(); 3024 } 3025 } 3026 3027 3028 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3029 struct stripe_head_state *s, 3030 int disks) 3031 { 3032 int pd_idx = sh->pd_idx; 3033 int qd_idx = sh->qd_idx; 3034 struct r5dev *dev; 3035 3036 set_bit(STRIPE_HANDLE, &sh->state); 3037 3038 BUG_ON(s->failed > 2); 3039 3040 /* Want to check and possibly repair P and Q. 
3041 * However there could be one 'failed' device, in which 3042 * case we can only check one of them, possibly using the 3043 * other to generate missing data 3044 */ 3045 3046 switch (sh->check_state) { 3047 case check_state_idle: 3048 /* start a new check operation if there are < 2 failures */ 3049 if (s->failed == s->q_failed) { 3050 /* The only possible failed device holds Q, so it 3051 * makes sense to check P (If anything else were failed, 3052 * we would have used P to recreate it). 3053 */ 3054 sh->check_state = check_state_run; 3055 } 3056 if (!s->q_failed && s->failed < 2) { 3057 /* Q is not failed, and we didn't use it to generate 3058 * anything, so it makes sense to check it 3059 */ 3060 if (sh->check_state == check_state_run) 3061 sh->check_state = check_state_run_pq; 3062 else 3063 sh->check_state = check_state_run_q; 3064 } 3065 3066 /* discard potentially stale zero_sum_result */ 3067 sh->ops.zero_sum_result = 0; 3068 3069 if (sh->check_state == check_state_run) { 3070 /* async_xor_zero_sum destroys the contents of P */ 3071 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3072 s->uptodate--; 3073 } 3074 if (sh->check_state >= check_state_run && 3075 sh->check_state <= check_state_run_pq) { 3076 /* async_syndrome_zero_sum preserves P and Q, so 3077 * no need to mark them !uptodate here 3078 */ 3079 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3080 break; 3081 } 3082 3083 /* we have 2-disk failure */ 3084 BUG_ON(s->failed != 2); 3085 /* fall through */ 3086 case check_state_compute_result: 3087 sh->check_state = check_state_idle; 3088 3089 /* check that a write has not made the stripe insync */ 3090 if (test_bit(STRIPE_INSYNC, &sh->state)) 3091 break; 3092 3093 /* now write out any block on a failed drive, 3094 * or P or Q if they were recomputed 3095 */ 3096 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3097 if (s->failed == 2) { 3098 dev = &sh->dev[s->failed_num[1]]; 3099 s->locked++; 3100 set_bit(R5_LOCKED, &dev->flags); 3101 set_bit(R5_Wantwrite, &dev->flags); 3102 } 3103 if (s->failed >= 1) { 3104 dev = &sh->dev[s->failed_num[0]]; 3105 s->locked++; 3106 set_bit(R5_LOCKED, &dev->flags); 3107 set_bit(R5_Wantwrite, &dev->flags); 3108 } 3109 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3110 dev = &sh->dev[pd_idx]; 3111 s->locked++; 3112 set_bit(R5_LOCKED, &dev->flags); 3113 set_bit(R5_Wantwrite, &dev->flags); 3114 } 3115 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3116 dev = &sh->dev[qd_idx]; 3117 s->locked++; 3118 set_bit(R5_LOCKED, &dev->flags); 3119 set_bit(R5_Wantwrite, &dev->flags); 3120 } 3121 clear_bit(STRIPE_DEGRADED, &sh->state); 3122 3123 set_bit(STRIPE_INSYNC, &sh->state); 3124 break; 3125 case check_state_run: 3126 case check_state_run_q: 3127 case check_state_run_pq: 3128 break; /* we will be called again upon completion */ 3129 case check_state_check_result: 3130 sh->check_state = check_state_idle; 3131 3132 /* handle a successful check operation, if parity is correct 3133 * we are done. 
Otherwise update the mismatch count and repair 3134 * parity if !MD_RECOVERY_CHECK 3135 */ 3136 if (sh->ops.zero_sum_result == 0) { 3137 /* both parities are correct */ 3138 if (!s->failed) 3139 set_bit(STRIPE_INSYNC, &sh->state); 3140 else { 3141 /* in contrast to the raid5 case we can validate 3142 * parity, but still have a failure to write 3143 * back 3144 */ 3145 sh->check_state = check_state_compute_result; 3146 /* Returning at this point means that we may go 3147 * off and bring p and/or q uptodate again so 3148 * we make sure to check zero_sum_result again 3149 * to verify if p or q need writeback 3150 */ 3151 } 3152 } else { 3153 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3154 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3155 /* don't try to repair!! */ 3156 set_bit(STRIPE_INSYNC, &sh->state); 3157 else { 3158 int *target = &sh->ops.target; 3159 3160 sh->ops.target = -1; 3161 sh->ops.target2 = -1; 3162 sh->check_state = check_state_compute_run; 3163 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3164 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3165 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3166 set_bit(R5_Wantcompute, 3167 &sh->dev[pd_idx].flags); 3168 *target = pd_idx; 3169 target = &sh->ops.target2; 3170 s->uptodate++; 3171 } 3172 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3173 set_bit(R5_Wantcompute, 3174 &sh->dev[qd_idx].flags); 3175 *target = qd_idx; 3176 s->uptodate++; 3177 } 3178 } 3179 } 3180 break; 3181 case check_state_compute_run: 3182 break; 3183 default: 3184 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3185 __func__, sh->check_state, 3186 (unsigned long long) sh->sector); 3187 BUG(); 3188 } 3189 } 3190 3191 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3192 { 3193 int i; 3194 3195 /* We have read all the blocks in this stripe and now we need to 3196 * copy some of them into a target stripe for expand. 3197 */ 3198 struct dma_async_tx_descriptor *tx = NULL; 3199 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3200 for (i = 0; i < sh->disks; i++) 3201 if (i != sh->pd_idx && i != sh->qd_idx) { 3202 int dd_idx, j; 3203 struct stripe_head *sh2; 3204 struct async_submit_ctl submit; 3205 3206 sector_t bn = compute_blocknr(sh, i, 1); 3207 sector_t s = raid5_compute_sector(conf, bn, 0, 3208 &dd_idx, NULL); 3209 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3210 if (sh2 == NULL) 3211 /* so far only the early blocks of this stripe 3212 * have been requested. 
When later blocks 3213 * get requested, we will try again 3214 */ 3215 continue; 3216 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3217 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3218 /* must have already done this block */ 3219 release_stripe(sh2); 3220 continue; 3221 } 3222 3223 /* place all the copies on one channel */ 3224 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3225 tx = async_memcpy(sh2->dev[dd_idx].page, 3226 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3227 &submit); 3228 3229 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3230 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3231 for (j = 0; j < conf->raid_disks; j++) 3232 if (j != sh2->pd_idx && 3233 j != sh2->qd_idx && 3234 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3235 break; 3236 if (j == conf->raid_disks) { 3237 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3238 set_bit(STRIPE_HANDLE, &sh2->state); 3239 } 3240 release_stripe(sh2); 3241 3242 } 3243 /* done submitting copies, wait for them to complete */ 3244 async_tx_quiesce(&tx); 3245 } 3246 3247 /* 3248 * handle_stripe - do things to a stripe. 3249 * 3250 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3251 * state of various bits to see what needs to be done. 3252 * Possible results: 3253 * return some read requests which now have data 3254 * return some write requests which are safely on storage 3255 * schedule a read on some buffers 3256 * schedule a write of some buffers 3257 * return confirmation of parity correctness 3258 * 3259 */ 3260 3261 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3262 { 3263 struct r5conf *conf = sh->raid_conf; 3264 int disks = sh->disks; 3265 struct r5dev *dev; 3266 int i; 3267 int do_recovery = 0; 3268 3269 memset(s, 0, sizeof(*s)); 3270 3271 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3272 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3273 s->failed_num[0] = -1; 3274 s->failed_num[1] = -1; 3275 3276 /* Now to look around and see what can be done */ 3277 rcu_read_lock(); 3278 for (i=disks; i--; ) { 3279 struct md_rdev *rdev; 3280 sector_t first_bad; 3281 int bad_sectors; 3282 int is_bad = 0; 3283 3284 dev = &sh->dev[i]; 3285 3286 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3287 i, dev->flags, 3288 dev->toread, dev->towrite, dev->written); 3289 /* maybe we can reply to a read 3290 * 3291 * new wantfill requests are only permitted while 3292 * ops_complete_biofill is guaranteed to be inactive 3293 */ 3294 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3295 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3296 set_bit(R5_Wantfill, &dev->flags); 3297 3298 /* now count some things */ 3299 if (test_bit(R5_LOCKED, &dev->flags)) 3300 s->locked++; 3301 if (test_bit(R5_UPTODATE, &dev->flags)) 3302 s->uptodate++; 3303 if (test_bit(R5_Wantcompute, &dev->flags)) { 3304 s->compute++; 3305 BUG_ON(s->compute > 2); 3306 } 3307 3308 if (test_bit(R5_Wantfill, &dev->flags)) 3309 s->to_fill++; 3310 else if (dev->toread) 3311 s->to_read++; 3312 if (dev->towrite) { 3313 s->to_write++; 3314 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3315 s->non_overwrite++; 3316 } 3317 if (dev->written) 3318 s->written++; 3319 /* Prefer to use the replacement for reads, but only 3320 * if it is recovered enough and has no bad blocks. 
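		 * (i.e. recovery_offset must already cover this whole stripe;
		 * otherwise fall back to the original device and note, via
		 * R5_NeedReplace, that this device still wants replacing.)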
3321 */ 3322 rdev = rcu_dereference(conf->disks[i].replacement); 3323 if (rdev && !test_bit(Faulty, &rdev->flags) && 3324 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3325 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3326 &first_bad, &bad_sectors)) 3327 set_bit(R5_ReadRepl, &dev->flags); 3328 else { 3329 if (rdev) 3330 set_bit(R5_NeedReplace, &dev->flags); 3331 rdev = rcu_dereference(conf->disks[i].rdev); 3332 clear_bit(R5_ReadRepl, &dev->flags); 3333 } 3334 if (rdev && test_bit(Faulty, &rdev->flags)) 3335 rdev = NULL; 3336 if (rdev) { 3337 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3338 &first_bad, &bad_sectors); 3339 if (s->blocked_rdev == NULL 3340 && (test_bit(Blocked, &rdev->flags) 3341 || is_bad < 0)) { 3342 if (is_bad < 0) 3343 set_bit(BlockedBadBlocks, 3344 &rdev->flags); 3345 s->blocked_rdev = rdev; 3346 atomic_inc(&rdev->nr_pending); 3347 } 3348 } 3349 clear_bit(R5_Insync, &dev->flags); 3350 if (!rdev) 3351 /* Not in-sync */; 3352 else if (is_bad) { 3353 /* also not in-sync */ 3354 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3355 test_bit(R5_UPTODATE, &dev->flags)) { 3356 /* treat as in-sync, but with a read error 3357 * which we can now try to correct 3358 */ 3359 set_bit(R5_Insync, &dev->flags); 3360 set_bit(R5_ReadError, &dev->flags); 3361 } 3362 } else if (test_bit(In_sync, &rdev->flags)) 3363 set_bit(R5_Insync, &dev->flags); 3364 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3365 /* in sync if before recovery_offset */ 3366 set_bit(R5_Insync, &dev->flags); 3367 else if (test_bit(R5_UPTODATE, &dev->flags) && 3368 test_bit(R5_Expanded, &dev->flags)) 3369 /* If we've reshaped into here, we assume it is Insync. 3370 * We will shortly update recovery_offset to make 3371 * it official. 3372 */ 3373 set_bit(R5_Insync, &dev->flags); 3374 3375 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3376 /* This flag does not apply to '.replacement' 3377 * only to .rdev, so make sure to check that*/ 3378 struct md_rdev *rdev2 = rcu_dereference( 3379 conf->disks[i].rdev); 3380 if (rdev2 == rdev) 3381 clear_bit(R5_Insync, &dev->flags); 3382 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3383 s->handle_bad_blocks = 1; 3384 atomic_inc(&rdev2->nr_pending); 3385 } else 3386 clear_bit(R5_WriteError, &dev->flags); 3387 } 3388 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3389 /* This flag does not apply to '.replacement' 3390 * only to .rdev, so make sure to check that*/ 3391 struct md_rdev *rdev2 = rcu_dereference( 3392 conf->disks[i].rdev); 3393 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3394 s->handle_bad_blocks = 1; 3395 atomic_inc(&rdev2->nr_pending); 3396 } else 3397 clear_bit(R5_MadeGood, &dev->flags); 3398 } 3399 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3400 struct md_rdev *rdev2 = rcu_dereference( 3401 conf->disks[i].replacement); 3402 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3403 s->handle_bad_blocks = 1; 3404 atomic_inc(&rdev2->nr_pending); 3405 } else 3406 clear_bit(R5_MadeGoodRepl, &dev->flags); 3407 } 3408 if (!test_bit(R5_Insync, &dev->flags)) { 3409 /* The ReadError flag will just be confusing now */ 3410 clear_bit(R5_ReadError, &dev->flags); 3411 clear_bit(R5_ReWrite, &dev->flags); 3412 } 3413 if (test_bit(R5_ReadError, &dev->flags)) 3414 clear_bit(R5_Insync, &dev->flags); 3415 if (!test_bit(R5_Insync, &dev->flags)) { 3416 if (s->failed < 2) 3417 s->failed_num[s->failed] = i; 3418 s->failed++; 3419 if (rdev && !test_bit(Faulty, &rdev->flags)) 3420 do_recovery = 1; 3421 } 3422 } 3423 if (test_bit(STRIPE_SYNCING, 
			&sh->state)) {
		/* If there is a failed device being replaced,
		 *     we must be recovering.
		 * else if we are after recovery_cp, we must be syncing
		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
		 * else we can only be replacing
		 * sync and recovery both need to read all devices, and so
		 * use the same flag.
		 */
		if (do_recovery ||
		    sh->sector >= conf->mddev->recovery_cp ||
		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
			s->syncing = 1;
		else
			s->replacing = 1;
	}
	rcu_read_unlock();
}

static void handle_stripe(struct stripe_head *sh)
{
	struct stripe_head_state s;
	struct r5conf *conf = sh->raid_conf;
	int i;
	int prexor;
	int disks = sh->disks;
	struct r5dev *pdev, *qdev;

	clear_bit(STRIPE_HANDLE, &sh->state);
	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled, ensure it gets handled
		 * again when current action finishes */
		set_bit(STRIPE_HANDLE, &sh->state);
		return;
	}

	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
		set_bit(STRIPE_SYNCING, &sh->state);
		clear_bit(STRIPE_INSYNC, &sh->state);
	}
	clear_bit(STRIPE_DELAYED, &sh->state);

	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
		(unsigned long long)sh->sector, sh->state,
		atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
		sh->check_state, sh->reconstruct_state);

	analyse_stripe(sh, &s);

	if (s.handle_bad_blocks) {
		set_bit(STRIPE_HANDLE, &sh->state);
		goto finish;
	}

	if (unlikely(s.blocked_rdev)) {
		if (s.syncing || s.expanding || s.expanded ||
		    s.replacing || s.to_write || s.written) {
			set_bit(STRIPE_HANDLE, &sh->state);
			goto finish;
		}
		/* There is nothing for the blocked_rdev to block */
		rdev_dec_pending(s.blocked_rdev, conf->mddev);
		s.blocked_rdev = NULL;
	}

	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
	}

	pr_debug("locked=%d uptodate=%d to_read=%d"
		" to_write=%d failed=%d failed_num=%d,%d\n",
		s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
		s.failed_num[0], s.failed_num[1]);
	/* check if the array has lost more than max_degraded devices and,
	 * if so, some requests might need to be failed.
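	 * handle_failed_stripe() fails the bios queued against the stripe,
	 * and handle_failed_sync() records bad blocks on the out-of-sync
	 * devices or gives up on the recovery.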
3500 */ 3501 if (s.failed > conf->max_degraded) { 3502 sh->check_state = 0; 3503 sh->reconstruct_state = 0; 3504 if (s.to_read+s.to_write+s.written) 3505 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3506 if (s.syncing + s.replacing) 3507 handle_failed_sync(conf, sh, &s); 3508 } 3509 3510 /* Now we check to see if any write operations have recently 3511 * completed 3512 */ 3513 prexor = 0; 3514 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3515 prexor = 1; 3516 if (sh->reconstruct_state == reconstruct_state_drain_result || 3517 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3518 sh->reconstruct_state = reconstruct_state_idle; 3519 3520 /* All the 'written' buffers and the parity block are ready to 3521 * be written back to disk 3522 */ 3523 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3524 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3525 BUG_ON(sh->qd_idx >= 0 && 3526 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3527 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3528 for (i = disks; i--; ) { 3529 struct r5dev *dev = &sh->dev[i]; 3530 if (test_bit(R5_LOCKED, &dev->flags) && 3531 (i == sh->pd_idx || i == sh->qd_idx || 3532 dev->written)) { 3533 pr_debug("Writing block %d\n", i); 3534 set_bit(R5_Wantwrite, &dev->flags); 3535 if (prexor) 3536 continue; 3537 if (!test_bit(R5_Insync, &dev->flags) || 3538 ((i == sh->pd_idx || i == sh->qd_idx) && 3539 s.failed == 0)) 3540 set_bit(STRIPE_INSYNC, &sh->state); 3541 } 3542 } 3543 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3544 s.dec_preread_active = 1; 3545 } 3546 3547 /* 3548 * might be able to return some write requests if the parity blocks 3549 * are safe, or on a failed drive 3550 */ 3551 pdev = &sh->dev[sh->pd_idx]; 3552 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3553 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3554 qdev = &sh->dev[sh->qd_idx]; 3555 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3556 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3557 || conf->level < 6; 3558 3559 if (s.written && 3560 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3561 && !test_bit(R5_LOCKED, &pdev->flags) 3562 && (test_bit(R5_UPTODATE, &pdev->flags) || 3563 test_bit(R5_Discard, &pdev->flags))))) && 3564 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3565 && !test_bit(R5_LOCKED, &qdev->flags) 3566 && (test_bit(R5_UPTODATE, &qdev->flags) || 3567 test_bit(R5_Discard, &qdev->flags)))))) 3568 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3569 3570 /* Now we might consider reading some blocks, either to check/generate 3571 * parity, or to satisfy requests 3572 * or to load a block that is being partially written. 3573 */ 3574 if (s.to_read || s.non_overwrite 3575 || (conf->level == 6 && s.to_write && s.failed) 3576 || (s.syncing && (s.uptodate + s.compute < disks)) 3577 || s.replacing 3578 || s.expanding) 3579 handle_stripe_fill(sh, &s, disks); 3580 3581 /* Now to consider new write requests and what else, if anything 3582 * should be read. We do not handle new writes when: 3583 * 1/ A 'write' operation (copy+xor) is already in flight. 3584 * 2/ A 'check' operation is in flight, as it may clobber the parity 3585 * block. 
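	 * Both conditions are visible below as sh->reconstruct_state and
	 * sh->check_state respectively.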
3586 */ 3587 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3588 handle_stripe_dirtying(conf, sh, &s, disks); 3589 3590 /* maybe we need to check and possibly fix the parity for this stripe 3591 * Any reads will already have been scheduled, so we just see if enough 3592 * data is available. The parity check is held off while parity 3593 * dependent operations are in flight. 3594 */ 3595 if (sh->check_state || 3596 (s.syncing && s.locked == 0 && 3597 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3598 !test_bit(STRIPE_INSYNC, &sh->state))) { 3599 if (conf->level == 6) 3600 handle_parity_checks6(conf, sh, &s, disks); 3601 else 3602 handle_parity_checks5(conf, sh, &s, disks); 3603 } 3604 3605 if (s.replacing && s.locked == 0 3606 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3607 /* Write out to replacement devices where possible */ 3608 for (i = 0; i < conf->raid_disks; i++) 3609 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3610 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3611 set_bit(R5_WantReplace, &sh->dev[i].flags); 3612 set_bit(R5_LOCKED, &sh->dev[i].flags); 3613 s.locked++; 3614 } 3615 set_bit(STRIPE_INSYNC, &sh->state); 3616 } 3617 if ((s.syncing || s.replacing) && s.locked == 0 && 3618 test_bit(STRIPE_INSYNC, &sh->state)) { 3619 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3620 clear_bit(STRIPE_SYNCING, &sh->state); 3621 } 3622 3623 /* If the failed drives are just a ReadError, then we might need 3624 * to progress the repair/check process 3625 */ 3626 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3627 for (i = 0; i < s.failed; i++) { 3628 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3629 if (test_bit(R5_ReadError, &dev->flags) 3630 && !test_bit(R5_LOCKED, &dev->flags) 3631 && test_bit(R5_UPTODATE, &dev->flags) 3632 ) { 3633 if (!test_bit(R5_ReWrite, &dev->flags)) { 3634 set_bit(R5_Wantwrite, &dev->flags); 3635 set_bit(R5_ReWrite, &dev->flags); 3636 set_bit(R5_LOCKED, &dev->flags); 3637 s.locked++; 3638 } else { 3639 /* let's read it back */ 3640 set_bit(R5_Wantread, &dev->flags); 3641 set_bit(R5_LOCKED, &dev->flags); 3642 s.locked++; 3643 } 3644 } 3645 } 3646 3647 3648 /* Finish reconstruct operations initiated by the expansion process */ 3649 if (sh->reconstruct_state == reconstruct_state_result) { 3650 struct stripe_head *sh_src 3651 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3652 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3653 /* sh cannot be written until sh_src has been read. 
		 * so arrange for sh to be delayed a little
		 */
		set_bit(STRIPE_DELAYED, &sh->state);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
				      &sh_src->state))
			atomic_inc(&conf->preread_active_stripes);
		release_stripe(sh_src);
		goto finish;
	}
	if (sh_src)
		release_stripe(sh_src);

	sh->reconstruct_state = reconstruct_state_idle;
	clear_bit(STRIPE_EXPANDING, &sh->state);
	for (i = conf->raid_disks; i--; ) {
		set_bit(R5_Wantwrite, &sh->dev[i].flags);
		set_bit(R5_LOCKED, &sh->dev[i].flags);
		s.locked++;
	}
	}

	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
	    !sh->reconstruct_state) {
		/* Need to write out all blocks after computing parity */
		sh->disks = conf->raid_disks;
		stripe_set_idx(sh->sector, conf, 0, sh);
		schedule_reconstruction(sh, &s, 1, 1);
	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
		clear_bit(STRIPE_EXPAND_READY, &sh->state);
		atomic_dec(&conf->reshape_stripes);
		wake_up(&conf->wait_for_overlap);
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
	}

	if (s.expanding && s.locked == 0 &&
	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
		handle_stripe_expansion(conf, sh);

finish:
	/* wait for this device to become unblocked */
	if (unlikely(s.blocked_rdev)) {
		if (conf->mddev->external)
			md_wait_for_blocked_rdev(s.blocked_rdev,
						 conf->mddev);
		else
			/* Internal metadata will immediately
			 * be written by raid5d, so we don't
			 * need to wait here.
			 */
			rdev_dec_pending(s.blocked_rdev,
					 conf->mddev);
	}

	if (s.handle_bad_blocks)
		for (i = disks; i--; ) {
			struct md_rdev *rdev;
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
				/* We own a safe reference to the rdev */
				rdev = conf->disks[i].rdev;
				if (!rdev_set_badblocks(rdev, sh->sector,
							STRIPE_SECTORS, 0))
					md_error(conf->mddev, rdev);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
				rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
				rdev = conf->disks[i].replacement;
				if (!rdev)
					/* rdev has been moved down */
					rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}

	if (s.ops_request)
		raid_run_ops(sh, s.ops_request);

	ops_run_io(sh, &s);

	if (s.dec_preread_active) {
		/* We delay this until after ops_run_io so that if make_request
		 * is waiting on a flush, it won't continue until the writes
		 * have actually been submitted.
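		 * Dropping below IO_THRESHOLD also wakes raid5d, so delayed
		 * stripes can be moved to the hold list (see
		 * raid5_activate_delayed() below).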
3746 */ 3747 atomic_dec(&conf->preread_active_stripes); 3748 if (atomic_read(&conf->preread_active_stripes) < 3749 IO_THRESHOLD) 3750 md_wakeup_thread(conf->mddev->thread); 3751 } 3752 3753 return_io(s.return_bi); 3754 3755 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3756 } 3757 3758 static void raid5_activate_delayed(struct r5conf *conf) 3759 { 3760 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3761 while (!list_empty(&conf->delayed_list)) { 3762 struct list_head *l = conf->delayed_list.next; 3763 struct stripe_head *sh; 3764 sh = list_entry(l, struct stripe_head, lru); 3765 list_del_init(l); 3766 clear_bit(STRIPE_DELAYED, &sh->state); 3767 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3768 atomic_inc(&conf->preread_active_stripes); 3769 list_add_tail(&sh->lru, &conf->hold_list); 3770 } 3771 } 3772 } 3773 3774 static void activate_bit_delay(struct r5conf *conf) 3775 { 3776 /* device_lock is held */ 3777 struct list_head head; 3778 list_add(&head, &conf->bitmap_list); 3779 list_del_init(&conf->bitmap_list); 3780 while (!list_empty(&head)) { 3781 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3782 list_del_init(&sh->lru); 3783 atomic_inc(&sh->count); 3784 __release_stripe(conf, sh); 3785 } 3786 } 3787 3788 int md_raid5_congested(struct mddev *mddev, int bits) 3789 { 3790 struct r5conf *conf = mddev->private; 3791 3792 /* No difference between reads and writes. Just check 3793 * how busy the stripe_cache is 3794 */ 3795 3796 if (conf->inactive_blocked) 3797 return 1; 3798 if (conf->quiesce) 3799 return 1; 3800 if (list_empty_careful(&conf->inactive_list)) 3801 return 1; 3802 3803 return 0; 3804 } 3805 EXPORT_SYMBOL_GPL(md_raid5_congested); 3806 3807 static int raid5_congested(void *data, int bits) 3808 { 3809 struct mddev *mddev = data; 3810 3811 return mddev_congested(mddev, bits) || 3812 md_raid5_congested(mddev, bits); 3813 } 3814 3815 /* We want read requests to align with chunks where possible, 3816 * but write requests don't need to. 3817 */ 3818 static int raid5_mergeable_bvec(struct request_queue *q, 3819 struct bvec_merge_data *bvm, 3820 struct bio_vec *biovec) 3821 { 3822 struct mddev *mddev = q->queuedata; 3823 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3824 int max; 3825 unsigned int chunk_sectors = mddev->chunk_sectors; 3826 unsigned int bio_sectors = bvm->bi_size >> 9; 3827 3828 if ((bvm->bi_rw & 1) == WRITE) 3829 return biovec->bv_len; /* always allow writes to be mergeable */ 3830 3831 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3832 chunk_sectors = mddev->new_chunk_sectors; 3833 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3834 if (max < 0) max = 0; 3835 if (max <= biovec->bv_len && bio_sectors == 0) 3836 return biovec->bv_len; 3837 else 3838 return max; 3839 } 3840 3841 3842 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3843 { 3844 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3845 unsigned int chunk_sectors = mddev->chunk_sectors; 3846 unsigned int bio_sectors = bio->bi_size >> 9; 3847 3848 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3849 chunk_sectors = mddev->new_chunk_sectors; 3850 return chunk_sectors >= 3851 ((sector & (chunk_sectors - 1)) + bio_sectors); 3852 } 3853 3854 /* 3855 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3856 * later sampled by raid5d. 
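 * The retry list is singly linked through bi_next; remove_bio_from_retry()
 * below pops one bio at a time and re-initialises its stripe count.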
3857 */ 3858 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3859 { 3860 unsigned long flags; 3861 3862 spin_lock_irqsave(&conf->device_lock, flags); 3863 3864 bi->bi_next = conf->retry_read_aligned_list; 3865 conf->retry_read_aligned_list = bi; 3866 3867 spin_unlock_irqrestore(&conf->device_lock, flags); 3868 md_wakeup_thread(conf->mddev->thread); 3869 } 3870 3871 3872 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3873 { 3874 struct bio *bi; 3875 3876 bi = conf->retry_read_aligned; 3877 if (bi) { 3878 conf->retry_read_aligned = NULL; 3879 return bi; 3880 } 3881 bi = conf->retry_read_aligned_list; 3882 if(bi) { 3883 conf->retry_read_aligned_list = bi->bi_next; 3884 bi->bi_next = NULL; 3885 /* 3886 * this sets the active stripe count to 1 and the processed 3887 * stripe count to zero (upper 16 bits) 3888 */ 3889 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3890 } 3891 3892 return bi; 3893 } 3894 3895 3896 /* 3897 * The "raid5_align_endio" should check if the read succeeded and if it 3898 * did, call bio_endio on the original bio (having bio_put the new bio 3899 * first). 3900 * If the read failed, pass the original bio to add_bio_to_retry() so 3901 * that it can be retried through the stripe cache. 3902 static void raid5_align_endio(struct bio *bi, int error) 3903 { 3904 struct bio* raid_bi = bi->bi_private; 3905 struct mddev *mddev; 3906 struct r5conf *conf; 3907 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3908 struct md_rdev *rdev; 3909 3910 bio_put(bi); 3911 3912 rdev = (void*)raid_bi->bi_next; 3913 raid_bi->bi_next = NULL; 3914 mddev = rdev->mddev; 3915 conf = mddev->private; 3916 3917 rdev_dec_pending(rdev, conf->mddev); 3918 3919 if (!error && uptodate) { 3920 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 3921 raid_bi, 0); 3922 bio_endio(raid_bi, 0); 3923 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3924 wake_up(&conf->wait_for_stripe); 3925 return; 3926 } 3927 3928 3929 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3930 3931 add_bio_to_retry(raid_bi, conf); 3932 } 3933 3934 static int bio_fits_rdev(struct bio *bi) 3935 { 3936 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3937 3938 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3939 return 0; 3940 blk_recount_segments(q, bi); 3941 if (bi->bi_phys_segments > queue_max_segments(q)) 3942 return 0; 3943 3944 if (q->merge_bvec_fn) 3945 /* it's too hard to apply the merge_bvec_fn at this stage, 3946 * just give up 3947 */ 3948 return 0; 3949 3950 return 1; 3951 } 3952 3953 3954 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3955 { 3956 struct r5conf *conf = mddev->private; 3957 int dd_idx; 3958 struct bio* align_bi; 3959 struct md_rdev *rdev; 3960 sector_t end_sector; 3961 3962 if (!in_chunk_boundary(mddev, raid_bio)) { 3963 pr_debug("chunk_aligned_read : non aligned\n"); 3964 return 0; 3965 } 3966 /* 3967 * use bio_clone_mddev to make a copy of the bio 3968 */ 3969 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3970 if (!align_bi) 3971 return 0; 3972 /* 3973 * set bi_end_io to a new function, and set bi_private to the 3974 * original bio.
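 * The chosen rdev is also stashed in the original bio's bi_next (see
 * below), so that raid5_align_endio() can find it and drop its
 * nr_pending reference once the cloned read completes.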
3975 */ 3976 align_bi->bi_end_io = raid5_align_endio; 3977 align_bi->bi_private = raid_bio; 3978 /* 3979 * compute position 3980 */ 3981 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3982 0, 3983 &dd_idx, NULL); 3984 3985 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3986 rcu_read_lock(); 3987 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3988 if (!rdev || test_bit(Faulty, &rdev->flags) || 3989 rdev->recovery_offset < end_sector) { 3990 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3991 if (rdev && 3992 (test_bit(Faulty, &rdev->flags) || 3993 !(test_bit(In_sync, &rdev->flags) || 3994 rdev->recovery_offset >= end_sector))) 3995 rdev = NULL; 3996 } 3997 if (rdev) { 3998 sector_t first_bad; 3999 int bad_sectors; 4000 4001 atomic_inc(&rdev->nr_pending); 4002 rcu_read_unlock(); 4003 raid_bio->bi_next = (void*)rdev; 4004 align_bi->bi_bdev = rdev->bdev; 4005 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4006 4007 if (!bio_fits_rdev(align_bi) || 4008 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 4009 &first_bad, &bad_sectors)) { 4010 /* too big in some way, or has a known bad block */ 4011 bio_put(align_bi); 4012 rdev_dec_pending(rdev, mddev); 4013 return 0; 4014 } 4015 4016 /* No reshape active, so we can trust rdev->data_offset */ 4017 align_bi->bi_sector += rdev->data_offset; 4018 4019 spin_lock_irq(&conf->device_lock); 4020 wait_event_lock_irq(conf->wait_for_stripe, 4021 conf->quiesce == 0, 4022 conf->device_lock); 4023 atomic_inc(&conf->active_aligned_reads); 4024 spin_unlock_irq(&conf->device_lock); 4025 4026 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4027 align_bi, disk_devt(mddev->gendisk), 4028 raid_bio->bi_sector); 4029 generic_make_request(align_bi); 4030 return 1; 4031 } else { 4032 rcu_read_unlock(); 4033 bio_put(align_bi); 4034 return 0; 4035 } 4036 } 4037 4038 /* __get_priority_stripe - get the next stripe to process 4039 * 4040 * Full stripe writes are allowed to pass preread active stripes up until 4041 * the bypass_threshold is exceeded. In general the bypass_count 4042 * increments when the handle_list is handled before the hold_list; however, it 4043 * will not be incremented when STRIPE_IO_STARTED is found to be set, signifying a 4044 * stripe with in-flight i/o. The bypass_count will be reset when the 4045 * head of the hold_list has changed, i.e. the head was promoted to the 4046 * handle_list. 4047 */ 4048 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4049 { 4050 struct stripe_head *sh; 4051 4052 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4053 __func__, 4054 list_empty(&conf->handle_list) ? "empty" : "busy", 4055 list_empty(&conf->hold_list) ?
"empty" : "busy", 4056 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4057 4058 if (!list_empty(&conf->handle_list)) { 4059 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4060 4061 if (list_empty(&conf->hold_list)) 4062 conf->bypass_count = 0; 4063 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4064 if (conf->hold_list.next == conf->last_hold) 4065 conf->bypass_count++; 4066 else { 4067 conf->last_hold = conf->hold_list.next; 4068 conf->bypass_count -= conf->bypass_threshold; 4069 if (conf->bypass_count < 0) 4070 conf->bypass_count = 0; 4071 } 4072 } 4073 } else if (!list_empty(&conf->hold_list) && 4074 ((conf->bypass_threshold && 4075 conf->bypass_count > conf->bypass_threshold) || 4076 atomic_read(&conf->pending_full_writes) == 0)) { 4077 sh = list_entry(conf->hold_list.next, 4078 typeof(*sh), lru); 4079 conf->bypass_count -= conf->bypass_threshold; 4080 if (conf->bypass_count < 0) 4081 conf->bypass_count = 0; 4082 } else 4083 return NULL; 4084 4085 list_del_init(&sh->lru); 4086 atomic_inc(&sh->count); 4087 BUG_ON(atomic_read(&sh->count) != 1); 4088 return sh; 4089 } 4090 4091 struct raid5_plug_cb { 4092 struct blk_plug_cb cb; 4093 struct list_head list; 4094 }; 4095 4096 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4097 { 4098 struct raid5_plug_cb *cb = container_of( 4099 blk_cb, struct raid5_plug_cb, cb); 4100 struct stripe_head *sh; 4101 struct mddev *mddev = cb->cb.data; 4102 struct r5conf *conf = mddev->private; 4103 int cnt = 0; 4104 4105 if (cb->list.next && !list_empty(&cb->list)) { 4106 spin_lock_irq(&conf->device_lock); 4107 while (!list_empty(&cb->list)) { 4108 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4109 list_del_init(&sh->lru); 4110 /* 4111 * avoid race release_stripe_plug() sees 4112 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4113 * is still in our list 4114 */ 4115 smp_mb__before_clear_bit(); 4116 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4117 __release_stripe(conf, sh); 4118 cnt++; 4119 } 4120 spin_unlock_irq(&conf->device_lock); 4121 } 4122 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4123 kfree(cb); 4124 } 4125 4126 static void release_stripe_plug(struct mddev *mddev, 4127 struct stripe_head *sh) 4128 { 4129 struct blk_plug_cb *blk_cb = blk_check_plugged( 4130 raid5_unplug, mddev, 4131 sizeof(struct raid5_plug_cb)); 4132 struct raid5_plug_cb *cb; 4133 4134 if (!blk_cb) { 4135 release_stripe(sh); 4136 return; 4137 } 4138 4139 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4140 4141 if (cb->list.next == NULL) 4142 INIT_LIST_HEAD(&cb->list); 4143 4144 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4145 list_add_tail(&sh->lru, &cb->list); 4146 else 4147 release_stripe(sh); 4148 } 4149 4150 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4151 { 4152 struct r5conf *conf = mddev->private; 4153 sector_t logical_sector, last_sector; 4154 struct stripe_head *sh; 4155 int remaining; 4156 int stripe_sectors; 4157 4158 if (mddev->reshape_position != MaxSector) 4159 /* Skip discard while reshape is happening */ 4160 return; 4161 4162 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4163 last_sector = bi->bi_sector + (bi->bi_size>>9); 4164 4165 bi->bi_next = NULL; 4166 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4167 4168 stripe_sectors = conf->chunk_sectors * 4169 (conf->raid_disks - conf->max_degraded); 4170 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4171 stripe_sectors); 4172 sector_div(last_sector, 
stripe_sectors); 4173 4174 logical_sector *= conf->chunk_sectors; 4175 last_sector *= conf->chunk_sectors; 4176 4177 for (; logical_sector < last_sector; 4178 logical_sector += STRIPE_SECTORS) { 4179 DEFINE_WAIT(w); 4180 int d; 4181 again: 4182 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4183 prepare_to_wait(&conf->wait_for_overlap, &w, 4184 TASK_UNINTERRUPTIBLE); 4185 spin_lock_irq(&sh->stripe_lock); 4186 for (d = 0; d < conf->raid_disks; d++) { 4187 if (d == sh->pd_idx || d == sh->qd_idx) 4188 continue; 4189 if (sh->dev[d].towrite || sh->dev[d].toread) { 4190 set_bit(R5_Overlap, &sh->dev[d].flags); 4191 spin_unlock_irq(&sh->stripe_lock); 4192 release_stripe(sh); 4193 schedule(); 4194 goto again; 4195 } 4196 } 4197 finish_wait(&conf->wait_for_overlap, &w); 4198 for (d = 0; d < conf->raid_disks; d++) { 4199 if (d == sh->pd_idx || d == sh->qd_idx) 4200 continue; 4201 sh->dev[d].towrite = bi; 4202 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4203 raid5_inc_bi_active_stripes(bi); 4204 } 4205 spin_unlock_irq(&sh->stripe_lock); 4206 if (conf->mddev->bitmap) { 4207 for (d = 0; 4208 d < conf->raid_disks - conf->max_degraded; 4209 d++) 4210 bitmap_startwrite(mddev->bitmap, 4211 sh->sector, 4212 STRIPE_SECTORS, 4213 0); 4214 sh->bm_seq = conf->seq_flush + 1; 4215 set_bit(STRIPE_BIT_DELAY, &sh->state); 4216 } 4217 4218 set_bit(STRIPE_HANDLE, &sh->state); 4219 clear_bit(STRIPE_DELAYED, &sh->state); 4220 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4221 atomic_inc(&conf->preread_active_stripes); 4222 release_stripe_plug(mddev, sh); 4223 } 4224 4225 remaining = raid5_dec_bi_active_stripes(bi); 4226 if (remaining == 0) { 4227 md_write_end(mddev); 4228 bio_endio(bi, 0); 4229 } 4230 } 4231 4232 static void make_request(struct mddev *mddev, struct bio * bi) 4233 { 4234 struct r5conf *conf = mddev->private; 4235 int dd_idx; 4236 sector_t new_sector; 4237 sector_t logical_sector, last_sector; 4238 struct stripe_head *sh; 4239 const int rw = bio_data_dir(bi); 4240 int remaining; 4241 4242 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4243 md_flush_request(mddev, bi); 4244 return; 4245 } 4246 4247 md_write_start(mddev, bi); 4248 4249 if (rw == READ && 4250 mddev->reshape_position == MaxSector && 4251 chunk_aligned_read(mddev,bi)) 4252 return; 4253 4254 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4255 make_discard_request(mddev, bi); 4256 return; 4257 } 4258 4259 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4260 last_sector = bi->bi_sector + (bi->bi_size>>9); 4261 bi->bi_next = NULL; 4262 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4263 4264 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4265 DEFINE_WAIT(w); 4266 int previous; 4267 4268 retry: 4269 previous = 0; 4270 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4271 if (unlikely(conf->reshape_progress != MaxSector)) { 4272 /* spinlock is needed as reshape_progress may be 4273 * 64bit on a 32bit platform, and so it might be 4274 * possible to see a half-updated value 4275 * Of course reshape_progress could change after 4276 * the lock is dropped, so once we get a reference 4277 * to the stripe that we think it is, we will have 4278 * to check again. 4279 */ 4280 spin_lock_irq(&conf->device_lock); 4281 if (mddev->reshape_backwards 4282 ? logical_sector < conf->reshape_progress 4283 : logical_sector >= conf->reshape_progress) { 4284 previous = 1; 4285 } else { 4286 if (mddev->reshape_backwards 4287 ? 
logical_sector < conf->reshape_safe 4288 : logical_sector >= conf->reshape_safe) { 4289 spin_unlock_irq(&conf->device_lock); 4290 schedule(); 4291 goto retry; 4292 } 4293 } 4294 spin_unlock_irq(&conf->device_lock); 4295 } 4296 4297 new_sector = raid5_compute_sector(conf, logical_sector, 4298 previous, 4299 &dd_idx, NULL); 4300 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4301 (unsigned long long)new_sector, 4302 (unsigned long long)logical_sector); 4303 4304 sh = get_active_stripe(conf, new_sector, previous, 4305 (bi->bi_rw&RWA_MASK), 0); 4306 if (sh) { 4307 if (unlikely(previous)) { 4308 /* expansion might have moved on while waiting for a 4309 * stripe, so we must do the range check again. 4310 * Expansion could still move past after this 4311 * test, but as we are holding a reference to 4312 * 'sh', we know that if that happens, 4313 * STRIPE_EXPANDING will get set and the expansion 4314 * won't proceed until we finish with the stripe. 4315 */ 4316 int must_retry = 0; 4317 spin_lock_irq(&conf->device_lock); 4318 if (mddev->reshape_backwards 4319 ? logical_sector >= conf->reshape_progress 4320 : logical_sector < conf->reshape_progress) 4321 /* mismatch, need to try again */ 4322 must_retry = 1; 4323 spin_unlock_irq(&conf->device_lock); 4324 if (must_retry) { 4325 release_stripe(sh); 4326 schedule(); 4327 goto retry; 4328 } 4329 } 4330 4331 if (rw == WRITE && 4332 logical_sector >= mddev->suspend_lo && 4333 logical_sector < mddev->suspend_hi) { 4334 release_stripe(sh); 4335 /* As the suspend_* range is controlled by 4336 * userspace, we want an interruptible 4337 * wait. 4338 */ 4339 flush_signals(current); 4340 prepare_to_wait(&conf->wait_for_overlap, 4341 &w, TASK_INTERRUPTIBLE); 4342 if (logical_sector >= mddev->suspend_lo && 4343 logical_sector < mddev->suspend_hi) 4344 schedule(); 4345 goto retry; 4346 } 4347 4348 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4349 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4350 /* Stripe is busy expanding or 4351 * add failed due to overlap. Flush everything 4352 * and wait a while 4353 */ 4354 md_wakeup_thread(mddev->thread); 4355 release_stripe(sh); 4356 schedule(); 4357 goto retry; 4358 } 4359 finish_wait(&conf->wait_for_overlap, &w); 4360 set_bit(STRIPE_HANDLE, &sh->state); 4361 clear_bit(STRIPE_DELAYED, &sh->state); 4362 if ((bi->bi_rw & REQ_SYNC) && 4363 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4364 atomic_inc(&conf->preread_active_stripes); 4365 release_stripe_plug(mddev, sh); 4366 } else { 4367 /* cannot get stripe for read-ahead, just give-up */ 4368 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4369 finish_wait(&conf->wait_for_overlap, &w); 4370 break; 4371 } 4372 } 4373 4374 remaining = raid5_dec_bi_active_stripes(bi); 4375 if (remaining == 0) { 4376 4377 if ( rw == WRITE ) 4378 md_write_end(mddev); 4379 4380 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4381 bi, 0); 4382 bio_endio(bi, 0); 4383 } 4384 } 4385 4386 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4387 4388 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4389 { 4390 /* reshaping is quite different to recovery/resync so it is 4391 * handled quite separately ... here. 4392 * 4393 * On each call to sync_request, we gather one chunk worth of 4394 * destination stripes and flag them as expanding. 4395 * Then we find all the source stripes and request reads. 4396 * As the reads complete, handle_stripe will copy the data 4397 * into the destination stripe and release that stripe. 
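 * Illustrative example (made-up numbers, not from any real array):
 * growing a RAID5 from 4 to 5 devices with a 512-sector chunk gives
 * new_data_disks = 4, so one call gathers 512 destination sectors per
 * device, i.e. 512 * 4 = 2048 sectors of array data, and reads that
 * range back through the old 3-data-disk layout.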
4398 */ 4399 struct r5conf *conf = mddev->private; 4400 struct stripe_head *sh; 4401 sector_t first_sector, last_sector; 4402 int raid_disks = conf->previous_raid_disks; 4403 int data_disks = raid_disks - conf->max_degraded; 4404 int new_data_disks = conf->raid_disks - conf->max_degraded; 4405 int i; 4406 int dd_idx; 4407 sector_t writepos, readpos, safepos; 4408 sector_t stripe_addr; 4409 int reshape_sectors; 4410 struct list_head stripes; 4411 4412 if (sector_nr == 0) { 4413 /* If restarting in the middle, skip the initial sectors */ 4414 if (mddev->reshape_backwards && 4415 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4416 sector_nr = raid5_size(mddev, 0, 0) 4417 - conf->reshape_progress; 4418 } else if (!mddev->reshape_backwards && 4419 conf->reshape_progress > 0) 4420 sector_nr = conf->reshape_progress; 4421 sector_div(sector_nr, new_data_disks); 4422 if (sector_nr) { 4423 mddev->curr_resync_completed = sector_nr; 4424 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4425 *skipped = 1; 4426 return sector_nr; 4427 } 4428 } 4429 4430 /* We need to process a full chunk at a time. 4431 * If old and new chunk sizes differ, we need to process the 4432 * largest of these 4433 */ 4434 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4435 reshape_sectors = mddev->new_chunk_sectors; 4436 else 4437 reshape_sectors = mddev->chunk_sectors; 4438 4439 /* We update the metadata at least every 10 seconds, or when 4440 * the data about to be copied would over-write the source of 4441 * the data at the front of the range. i.e. one new_stripe 4442 * along from reshape_progress new_maps to after where 4443 * reshape_safe old_maps to 4444 */ 4445 writepos = conf->reshape_progress; 4446 sector_div(writepos, new_data_disks); 4447 readpos = conf->reshape_progress; 4448 sector_div(readpos, data_disks); 4449 safepos = conf->reshape_safe; 4450 sector_div(safepos, data_disks); 4451 if (mddev->reshape_backwards) { 4452 writepos -= min_t(sector_t, reshape_sectors, writepos); 4453 readpos += reshape_sectors; 4454 safepos += reshape_sectors; 4455 } else { 4456 writepos += reshape_sectors; 4457 readpos -= min_t(sector_t, reshape_sectors, readpos); 4458 safepos -= min_t(sector_t, reshape_sectors, safepos); 4459 } 4460 4461 /* Having calculated the 'writepos' possibly use it 4462 * to set 'stripe_addr' which is where we will write to. 4463 */ 4464 if (mddev->reshape_backwards) { 4465 BUG_ON(conf->reshape_progress == 0); 4466 stripe_addr = writepos; 4467 BUG_ON((mddev->dev_sectors & 4468 ~((sector_t)reshape_sectors - 1)) 4469 - reshape_sectors - stripe_addr 4470 != sector_nr); 4471 } else { 4472 BUG_ON(writepos != sector_nr + reshape_sectors); 4473 stripe_addr = sector_nr; 4474 } 4475 4476 /* 'writepos' is the most advanced device address we might write. 4477 * 'readpos' is the least advanced device address we might read. 4478 * 'safepos' is the least address recorded in the metadata as having 4479 * been reshaped. 4480 * If there is a min_offset_diff, these are adjusted either by 4481 * increasing the safepos/readpos if diff is negative, or 4482 * increasing writepos if diff is positive. 4483 * If 'readpos' is then behind 'writepos', there is no way that we can 4484 * ensure safety in the face of a crash - that must be done by userspace 4485 * making a backup of the data. So in that case there is no particular 4486 * rush to update metadata. 
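 * (That user-space backup is exactly what mdadm's --backup-file
 * option provides.)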
4487 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4488 * update the metadata to advance 'safepos' to match 'readpos' so that 4489 * we can be safe in the event of a crash. 4490 * So we insist on updating metadata if safepos is behind writepos and 4491 * readpos is beyond writepos. 4492 * In any case, update the metadata every 10 seconds. 4493 * Maybe that number should be configurable, but I'm not sure it is 4494 * worth it.... maybe it could be a multiple of safemode_delay??? 4495 */ 4496 if (conf->min_offset_diff < 0) { 4497 safepos += -conf->min_offset_diff; 4498 readpos += -conf->min_offset_diff; 4499 } else 4500 writepos += conf->min_offset_diff; 4501 4502 if ((mddev->reshape_backwards 4503 ? (safepos > writepos && readpos < writepos) 4504 : (safepos < writepos && readpos > writepos)) || 4505 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4506 /* Cannot proceed until we've updated the superblock... */ 4507 wait_event(conf->wait_for_overlap, 4508 atomic_read(&conf->reshape_stripes)==0); 4509 mddev->reshape_position = conf->reshape_progress; 4510 mddev->curr_resync_completed = sector_nr; 4511 conf->reshape_checkpoint = jiffies; 4512 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4513 md_wakeup_thread(mddev->thread); 4514 wait_event(mddev->sb_wait, mddev->flags == 0 || 4515 kthread_should_stop()); 4516 spin_lock_irq(&conf->device_lock); 4517 conf->reshape_safe = mddev->reshape_position; 4518 spin_unlock_irq(&conf->device_lock); 4519 wake_up(&conf->wait_for_overlap); 4520 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4521 } 4522 4523 INIT_LIST_HEAD(&stripes); 4524 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4525 int j; 4526 int skipped_disk = 0; 4527 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4528 set_bit(STRIPE_EXPANDING, &sh->state); 4529 atomic_inc(&conf->reshape_stripes); 4530 /* If any of this stripe is beyond the end of the old 4531 * array, then we need to zero those blocks 4532 */ 4533 for (j=sh->disks; j--;) { 4534 sector_t s; 4535 if (j == sh->pd_idx) 4536 continue; 4537 if (conf->level == 6 && 4538 j == sh->qd_idx) 4539 continue; 4540 s = compute_blocknr(sh, j, 0); 4541 if (s < raid5_size(mddev, 0, 0)) { 4542 skipped_disk = 1; 4543 continue; 4544 } 4545 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4546 set_bit(R5_Expanded, &sh->dev[j].flags); 4547 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4548 } 4549 if (!skipped_disk) { 4550 set_bit(STRIPE_EXPAND_READY, &sh->state); 4551 set_bit(STRIPE_HANDLE, &sh->state); 4552 } 4553 list_add(&sh->lru, &stripes); 4554 } 4555 spin_lock_irq(&conf->device_lock); 4556 if (mddev->reshape_backwards) 4557 conf->reshape_progress -= reshape_sectors * new_data_disks; 4558 else 4559 conf->reshape_progress += reshape_sectors * new_data_disks; 4560 spin_unlock_irq(&conf->device_lock); 4561 /* Ok, those stripes are ready. We can start scheduling 4562 * reads on the source stripes. 4563 * The source stripes are determined by mapping the first and last 4564 * block on the destination stripes.
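 * That is, array addresses stripe_addr*new_data_disks through
 * (stripe_addr+reshape_sectors)*new_data_disks-1 are mapped with the
 * old geometry (the 'previous' argument to raid5_compute_sector) to
 * find the device sectors that must be read.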
4565 */ 4566 first_sector = 4567 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4568 1, &dd_idx, NULL); 4569 last_sector = 4570 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4571 * new_data_disks - 1), 4572 1, &dd_idx, NULL); 4573 if (last_sector >= mddev->dev_sectors) 4574 last_sector = mddev->dev_sectors - 1; 4575 while (first_sector <= last_sector) { 4576 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4577 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4578 set_bit(STRIPE_HANDLE, &sh->state); 4579 release_stripe(sh); 4580 first_sector += STRIPE_SECTORS; 4581 } 4582 /* Now that the sources are clearly marked, we can release 4583 * the destination stripes 4584 */ 4585 while (!list_empty(&stripes)) { 4586 sh = list_entry(stripes.next, struct stripe_head, lru); 4587 list_del_init(&sh->lru); 4588 release_stripe(sh); 4589 } 4590 /* If this takes us to the resync_max point where we have to pause, 4591 * then we need to write out the superblock. 4592 */ 4593 sector_nr += reshape_sectors; 4594 if ((sector_nr - mddev->curr_resync_completed) * 2 4595 >= mddev->resync_max - mddev->curr_resync_completed) { 4596 /* Cannot proceed until we've updated the superblock... */ 4597 wait_event(conf->wait_for_overlap, 4598 atomic_read(&conf->reshape_stripes) == 0); 4599 mddev->reshape_position = conf->reshape_progress; 4600 mddev->curr_resync_completed = sector_nr; 4601 conf->reshape_checkpoint = jiffies; 4602 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4603 md_wakeup_thread(mddev->thread); 4604 wait_event(mddev->sb_wait, 4605 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4606 || kthread_should_stop()); 4607 spin_lock_irq(&conf->device_lock); 4608 conf->reshape_safe = mddev->reshape_position; 4609 spin_unlock_irq(&conf->device_lock); 4610 wake_up(&conf->wait_for_overlap); 4611 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4612 } 4613 return reshape_sectors; 4614 } 4615 4616 /* FIXME go_faster isn't used */ 4617 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4618 { 4619 struct r5conf *conf = mddev->private; 4620 struct stripe_head *sh; 4621 sector_t max_sector = mddev->dev_sectors; 4622 sector_t sync_blocks; 4623 int still_degraded = 0; 4624 int i; 4625 4626 if (sector_nr >= max_sector) { 4627 /* just being told to finish up .. nothing much to do */ 4628 4629 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4630 end_reshape(conf); 4631 return 0; 4632 } 4633 4634 if (mddev->curr_resync < max_sector) /* aborted */ 4635 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4636 &sync_blocks, 1); 4637 else /* completed sync */ 4638 conf->fullsync = 0; 4639 bitmap_close_sync(mddev->bitmap); 4640 4641 return 0; 4642 } 4643 4644 /* Allow raid5_quiesce to complete */ 4645 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4646 4647 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4648 return reshape_request(mddev, sector_nr, skipped); 4649 4650 /* No need to check resync_max as we never do more than one 4651 * stripe, and as resync_max will always be on a chunk boundary, 4652 * if the check in md_do_sync didn't fire, there is no chance 4653 * of overstepping resync_max here 4654 */ 4655 4656 /* if there are too many failed drives and we are trying 4657 * to resync, then assert that we are finished, because there is 4658 * nothing we can do.
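 * (For RAID5 that means even a single failed device, and for RAID6
 * two: with max_degraded failures there is no redundancy left to
 * check against.)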
4659 */ 4660 if (mddev->degraded >= conf->max_degraded && 4661 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4662 sector_t rv = mddev->dev_sectors - sector_nr; 4663 *skipped = 1; 4664 return rv; 4665 } 4666 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4667 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4668 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4669 /* we can skip this block, and probably more */ 4670 sync_blocks /= STRIPE_SECTORS; 4671 *skipped = 1; 4672 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4673 } 4674 4675 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4676 4677 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4678 if (sh == NULL) { 4679 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4680 /* make sure we don't swamp the stripe cache if someone else 4681 * is trying to get access 4682 */ 4683 schedule_timeout_uninterruptible(1); 4684 } 4685 /* Need to check if array will still be degraded after recovery/resync 4686 * We don't need to check the 'failed' flag as when that gets set, 4687 * recovery aborts. 4688 */ 4689 for (i = 0; i < conf->raid_disks; i++) 4690 if (conf->disks[i].rdev == NULL) 4691 still_degraded = 1; 4692 4693 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4694 4695 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4696 4697 handle_stripe(sh); 4698 release_stripe(sh); 4699 4700 return STRIPE_SECTORS; 4701 } 4702 4703 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4704 { 4705 /* We may not be able to submit a whole bio at once as there 4706 * may not be enough stripe_heads available. 4707 * We cannot pre-allocate enough stripe_heads as we may need 4708 * more than exist in the cache (if we ever allow larger chunks). 4709 * So we do one stripe head at a time and record in 4710 * ->bi_phys_segments how many have been done. 4711 * 4712 * We *know* that this entire raid_bio is in one chunk, so 4713 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
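 * Consecutive STRIPE_SECTORS-sized pieces of the chunk therefore sit
 * at consecutive device sectors, so the loop below simply steps
 * 'sector' forward in lock-step with 'logical_sector'.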
4714 */ 4715 struct stripe_head *sh; 4716 int dd_idx; 4717 sector_t sector, logical_sector, last_sector; 4718 int scnt = 0; 4719 int remaining; 4720 int handled = 0; 4721 4722 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4723 sector = raid5_compute_sector(conf, logical_sector, 4724 0, &dd_idx, NULL); 4725 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4726 4727 for (; logical_sector < last_sector; 4728 logical_sector += STRIPE_SECTORS, 4729 sector += STRIPE_SECTORS, 4730 scnt++) { 4731 4732 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4733 /* already done this stripe */ 4734 continue; 4735 4736 sh = get_active_stripe(conf, sector, 0, 1, 0); 4737 4738 if (!sh) { 4739 /* failed to get a stripe - must wait */ 4740 raid5_set_bi_processed_stripes(raid_bio, scnt); 4741 conf->retry_read_aligned = raid_bio; 4742 return handled; 4743 } 4744 4745 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4746 release_stripe(sh); 4747 raid5_set_bi_processed_stripes(raid_bio, scnt); 4748 conf->retry_read_aligned = raid_bio; 4749 return handled; 4750 } 4751 4752 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4753 handle_stripe(sh); 4754 release_stripe(sh); 4755 handled++; 4756 } 4757 remaining = raid5_dec_bi_active_stripes(raid_bio); 4758 if (remaining == 0) { 4759 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4760 raid_bio, 0); 4761 bio_endio(raid_bio, 0); 4762 } 4763 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4764 wake_up(&conf->wait_for_stripe); 4765 return handled; 4766 } 4767 4768 #define MAX_STRIPE_BATCH 8 4769 static int handle_active_stripes(struct r5conf *conf) 4770 { 4771 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4772 int i, batch_size = 0; 4773 4774 while (batch_size < MAX_STRIPE_BATCH && 4775 (sh = __get_priority_stripe(conf)) != NULL) 4776 batch[batch_size++] = sh; 4777 4778 if (batch_size == 0) 4779 return batch_size; 4780 spin_unlock_irq(&conf->device_lock); 4781 4782 for (i = 0; i < batch_size; i++) 4783 handle_stripe(batch[i]); 4784 4785 cond_resched(); 4786 4787 spin_lock_irq(&conf->device_lock); 4788 for (i = 0; i < batch_size; i++) 4789 __release_stripe(conf, batch[i]); 4790 return batch_size; 4791 } 4792 4793 /* 4794 * This is our raid5 kernel thread. 4795 * 4796 * We scan the hash table for stripes which can be handled now. 4797 * During the scan, completed stripes are saved for us by the interrupt 4798 * handler, so that they will not have to wait for our next wakeup. 
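 * Stripes are taken off the queue in batches of up to
 * MAX_STRIPE_BATCH (currently 8), and device_lock is dropped while a
 * batch is being handled (see handle_active_stripes() above).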
4799 */ 4800 static void raid5d(struct md_thread *thread) 4801 { 4802 struct mddev *mddev = thread->mddev; 4803 struct r5conf *conf = mddev->private; 4804 int handled; 4805 struct blk_plug plug; 4806 4807 pr_debug("+++ raid5d active\n"); 4808 4809 md_check_recovery(mddev); 4810 4811 blk_start_plug(&plug); 4812 handled = 0; 4813 spin_lock_irq(&conf->device_lock); 4814 while (1) { 4815 struct bio *bio; 4816 int batch_size; 4817 4818 if ( 4819 !list_empty(&conf->bitmap_list)) { 4820 /* Now is a good time to flush some bitmap updates */ 4821 conf->seq_flush++; 4822 spin_unlock_irq(&conf->device_lock); 4823 bitmap_unplug(mddev->bitmap); 4824 spin_lock_irq(&conf->device_lock); 4825 conf->seq_write = conf->seq_flush; 4826 activate_bit_delay(conf); 4827 } 4828 raid5_activate_delayed(conf); 4829 4830 while ((bio = remove_bio_from_retry(conf))) { 4831 int ok; 4832 spin_unlock_irq(&conf->device_lock); 4833 ok = retry_aligned_read(conf, bio); 4834 spin_lock_irq(&conf->device_lock); 4835 if (!ok) 4836 break; 4837 handled++; 4838 } 4839 4840 batch_size = handle_active_stripes(conf); 4841 if (!batch_size) 4842 break; 4843 handled += batch_size; 4844 4845 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4846 spin_unlock_irq(&conf->device_lock); 4847 md_check_recovery(mddev); 4848 spin_lock_irq(&conf->device_lock); 4849 } 4850 } 4851 pr_debug("%d stripes handled\n", handled); 4852 4853 spin_unlock_irq(&conf->device_lock); 4854 4855 async_tx_issue_pending_all(); 4856 blk_finish_plug(&plug); 4857 4858 pr_debug("--- raid5d inactive\n"); 4859 } 4860 4861 static ssize_t 4862 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4863 { 4864 struct r5conf *conf = mddev->private; 4865 if (conf) 4866 return sprintf(page, "%d\n", conf->max_nr_stripes); 4867 else 4868 return 0; 4869 } 4870 4871 int 4872 raid5_set_cache_size(struct mddev *mddev, int size) 4873 { 4874 struct r5conf *conf = mddev->private; 4875 int err; 4876 4877 if (size <= 16 || size > 32768) 4878 return -EINVAL; 4879 while (size < conf->max_nr_stripes) { 4880 if (drop_one_stripe(conf)) 4881 conf->max_nr_stripes--; 4882 else 4883 break; 4884 } 4885 err = md_allow_write(mddev); 4886 if (err) 4887 return err; 4888 while (size > conf->max_nr_stripes) { 4889 if (grow_one_stripe(conf)) 4890 conf->max_nr_stripes++; 4891 else break; 4892 } 4893 return 0; 4894 } 4895 EXPORT_SYMBOL(raid5_set_cache_size); 4896 4897 static ssize_t 4898 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4899 { 4900 struct r5conf *conf = mddev->private; 4901 unsigned long new; 4902 int err; 4903 4904 if (len >= PAGE_SIZE) 4905 return -EINVAL; 4906 if (!conf) 4907 return -ENODEV; 4908 4909 if (strict_strtoul(page, 10, &new)) 4910 return -EINVAL; 4911 err = raid5_set_cache_size(mddev, new); 4912 if (err) 4913 return err; 4914 return len; 4915 } 4916 4917 static struct md_sysfs_entry 4918 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4919 raid5_show_stripe_cache_size, 4920 raid5_store_stripe_cache_size); 4921 4922 static ssize_t 4923 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4924 { 4925 struct r5conf *conf = mddev->private; 4926 if (conf) 4927 return sprintf(page, "%d\n", conf->bypass_threshold); 4928 else 4929 return 0; 4930 } 4931 4932 static ssize_t 4933 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4934 { 4935 struct r5conf *conf = mddev->private; 4936 unsigned long new; 4937 if (len >= PAGE_SIZE) 4938 return -EINVAL; 4939 if (!conf) 4940 return -ENODEV; 4941 4942 if 
(strict_strtoul(page, 10, &new)) 4943 return -EINVAL; 4944 if (new > conf->max_nr_stripes) 4945 return -EINVAL; 4946 conf->bypass_threshold = new; 4947 return len; 4948 } 4949 4950 static struct md_sysfs_entry 4951 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4952 S_IRUGO | S_IWUSR, 4953 raid5_show_preread_threshold, 4954 raid5_store_preread_threshold); 4955 4956 static ssize_t 4957 stripe_cache_active_show(struct mddev *mddev, char *page) 4958 { 4959 struct r5conf *conf = mddev->private; 4960 if (conf) 4961 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4962 else 4963 return 0; 4964 } 4965 4966 static struct md_sysfs_entry 4967 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4968 4969 static struct attribute *raid5_attrs[] = { 4970 &raid5_stripecache_size.attr, 4971 &raid5_stripecache_active.attr, 4972 &raid5_preread_bypass_threshold.attr, 4973 NULL, 4974 }; 4975 static struct attribute_group raid5_attrs_group = { 4976 .name = NULL, 4977 .attrs = raid5_attrs, 4978 }; 4979 4980 static sector_t 4981 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4982 { 4983 struct r5conf *conf = mddev->private; 4984 4985 if (!sectors) 4986 sectors = mddev->dev_sectors; 4987 if (!raid_disks) 4988 /* size is defined by the smallest of previous and new size */ 4989 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4990 4991 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4992 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4993 return sectors * (raid_disks - conf->max_degraded); 4994 } 4995 4996 static void raid5_free_percpu(struct r5conf *conf) 4997 { 4998 struct raid5_percpu *percpu; 4999 unsigned long cpu; 5000 5001 if (!conf->percpu) 5002 return; 5003 5004 get_online_cpus(); 5005 for_each_possible_cpu(cpu) { 5006 percpu = per_cpu_ptr(conf->percpu, cpu); 5007 safe_put_page(percpu->spare_page); 5008 kfree(percpu->scribble); 5009 } 5010 #ifdef CONFIG_HOTPLUG_CPU 5011 unregister_cpu_notifier(&conf->cpu_notify); 5012 #endif 5013 put_online_cpus(); 5014 5015 free_percpu(conf->percpu); 5016 } 5017 5018 static void free_conf(struct r5conf *conf) 5019 { 5020 shrink_stripes(conf); 5021 raid5_free_percpu(conf); 5022 kfree(conf->disks); 5023 kfree(conf->stripe_hashtbl); 5024 kfree(conf); 5025 } 5026 5027 #ifdef CONFIG_HOTPLUG_CPU 5028 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5029 void *hcpu) 5030 { 5031 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5032 long cpu = (long)hcpu; 5033 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5034 5035 switch (action) { 5036 case CPU_UP_PREPARE: 5037 case CPU_UP_PREPARE_FROZEN: 5038 if (conf->level == 6 && !percpu->spare_page) 5039 percpu->spare_page = alloc_page(GFP_KERNEL); 5040 if (!percpu->scribble) 5041 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5042 5043 if (!percpu->scribble || 5044 (conf->level == 6 && !percpu->spare_page)) { 5045 safe_put_page(percpu->spare_page); 5046 kfree(percpu->scribble); 5047 pr_err("%s: failed memory allocation for cpu%ld\n", 5048 __func__, cpu); 5049 return notifier_from_errno(-ENOMEM); 5050 } 5051 break; 5052 case CPU_DEAD: 5053 case CPU_DEAD_FROZEN: 5054 safe_put_page(percpu->spare_page); 5055 kfree(percpu->scribble); 5056 percpu->spare_page = NULL; 5057 percpu->scribble = NULL; 5058 break; 5059 default: 5060 break; 5061 } 5062 return NOTIFY_OK; 5063 } 5064 #endif 5065 5066 static int raid5_alloc_percpu(struct r5conf *conf) 5067 { 5068 unsigned long cpu; 5069 struct 
page *spare_page; 5070 struct raid5_percpu __percpu *allcpus; 5071 void *scribble; 5072 int err; 5073 5074 allcpus = alloc_percpu(struct raid5_percpu); 5075 if (!allcpus) 5076 return -ENOMEM; 5077 conf->percpu = allcpus; 5078 5079 get_online_cpus(); 5080 err = 0; 5081 for_each_present_cpu(cpu) { 5082 if (conf->level == 6) { 5083 spare_page = alloc_page(GFP_KERNEL); 5084 if (!spare_page) { 5085 err = -ENOMEM; 5086 break; 5087 } 5088 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 5089 } 5090 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5091 if (!scribble) { 5092 err = -ENOMEM; 5093 break; 5094 } 5095 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 5096 } 5097 #ifdef CONFIG_HOTPLUG_CPU 5098 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5099 conf->cpu_notify.priority = 0; 5100 if (err == 0) 5101 err = register_cpu_notifier(&conf->cpu_notify); 5102 #endif 5103 put_online_cpus(); 5104 5105 return err; 5106 } 5107 5108 static struct r5conf *setup_conf(struct mddev *mddev) 5109 { 5110 struct r5conf *conf; 5111 int raid_disk, memory, max_disks; 5112 struct md_rdev *rdev; 5113 struct disk_info *disk; 5114 char pers_name[6]; 5115 5116 if (mddev->new_level != 5 5117 && mddev->new_level != 4 5118 && mddev->new_level != 6) { 5119 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5120 mdname(mddev), mddev->new_level); 5121 return ERR_PTR(-EIO); 5122 } 5123 if ((mddev->new_level == 5 5124 && !algorithm_valid_raid5(mddev->new_layout)) || 5125 (mddev->new_level == 6 5126 && !algorithm_valid_raid6(mddev->new_layout))) { 5127 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5128 mdname(mddev), mddev->new_layout); 5129 return ERR_PTR(-EIO); 5130 } 5131 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5132 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5133 mdname(mddev), mddev->raid_disks); 5134 return ERR_PTR(-EINVAL); 5135 } 5136 5137 if (!mddev->new_chunk_sectors || 5138 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5139 !is_power_of_2(mddev->new_chunk_sectors)) { 5140 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5141 mdname(mddev), mddev->new_chunk_sectors << 9); 5142 return ERR_PTR(-EINVAL); 5143 } 5144 5145 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5146 if (conf == NULL) 5147 goto abort; 5148 spin_lock_init(&conf->device_lock); 5149 init_waitqueue_head(&conf->wait_for_stripe); 5150 init_waitqueue_head(&conf->wait_for_overlap); 5151 INIT_LIST_HEAD(&conf->handle_list); 5152 INIT_LIST_HEAD(&conf->hold_list); 5153 INIT_LIST_HEAD(&conf->delayed_list); 5154 INIT_LIST_HEAD(&conf->bitmap_list); 5155 INIT_LIST_HEAD(&conf->inactive_list); 5156 atomic_set(&conf->active_stripes, 0); 5157 atomic_set(&conf->preread_active_stripes, 0); 5158 atomic_set(&conf->active_aligned_reads, 0); 5159 conf->bypass_threshold = BYPASS_THRESHOLD; 5160 conf->recovery_disabled = mddev->recovery_disabled - 1; 5161 5162 conf->raid_disks = mddev->raid_disks; 5163 if (mddev->reshape_position == MaxSector) 5164 conf->previous_raid_disks = mddev->raid_disks; 5165 else 5166 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5167 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5168 conf->scribble_len = scribble_len(max_disks); 5169 5170 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5171 GFP_KERNEL); 5172 if (!conf->disks) 5173 goto abort; 5174 5175 conf->mddev = mddev; 5176 5177 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5178 goto abort; 5179 5180 conf->level = 
mddev->new_level; 5181 if (raid5_alloc_percpu(conf) != 0) 5182 goto abort; 5183 5184 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5185 5186 rdev_for_each(rdev, mddev) { 5187 raid_disk = rdev->raid_disk; 5188 if (raid_disk >= max_disks 5189 || raid_disk < 0) 5190 continue; 5191 disk = conf->disks + raid_disk; 5192 5193 if (test_bit(Replacement, &rdev->flags)) { 5194 if (disk->replacement) 5195 goto abort; 5196 disk->replacement = rdev; 5197 } else { 5198 if (disk->rdev) 5199 goto abort; 5200 disk->rdev = rdev; 5201 } 5202 5203 if (test_bit(In_sync, &rdev->flags)) { 5204 char b[BDEVNAME_SIZE]; 5205 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5206 " disk %d\n", 5207 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5208 } else if (rdev->saved_raid_disk != raid_disk) 5209 /* Cannot rely on bitmap to complete recovery */ 5210 conf->fullsync = 1; 5211 } 5212 5213 conf->chunk_sectors = mddev->new_chunk_sectors; 5214 conf->level = mddev->new_level; 5215 if (conf->level == 6) 5216 conf->max_degraded = 2; 5217 else 5218 conf->max_degraded = 1; 5219 conf->algorithm = mddev->new_layout; 5220 conf->max_nr_stripes = NR_STRIPES; 5221 conf->reshape_progress = mddev->reshape_position; 5222 if (conf->reshape_progress != MaxSector) { 5223 conf->prev_chunk_sectors = mddev->chunk_sectors; 5224 conf->prev_algo = mddev->layout; 5225 } 5226 5227 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5228 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5229 if (grow_stripes(conf, conf->max_nr_stripes)) { 5230 printk(KERN_ERR 5231 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5232 mdname(mddev), memory); 5233 goto abort; 5234 } else 5235 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5236 mdname(mddev), memory); 5237 5238 sprintf(pers_name, "raid%d", mddev->new_level); 5239 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5240 if (!conf->thread) { 5241 printk(KERN_ERR 5242 "md/raid:%s: couldn't allocate thread.\n", 5243 mdname(mddev)); 5244 goto abort; 5245 } 5246 5247 return conf; 5248 5249 abort: 5250 if (conf) { 5251 free_conf(conf); 5252 return ERR_PTR(-EIO); 5253 } else 5254 return ERR_PTR(-ENOMEM); 5255 } 5256 5257 5258 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5259 { 5260 switch (algo) { 5261 case ALGORITHM_PARITY_0: 5262 if (raid_disk < max_degraded) 5263 return 1; 5264 break; 5265 case ALGORITHM_PARITY_N: 5266 if (raid_disk >= raid_disks - max_degraded) 5267 return 1; 5268 break; 5269 case ALGORITHM_PARITY_0_6: 5270 if (raid_disk == 0 || 5271 raid_disk == raid_disks - 1) 5272 return 1; 5273 break; 5274 case ALGORITHM_LEFT_ASYMMETRIC_6: 5275 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5276 case ALGORITHM_LEFT_SYMMETRIC_6: 5277 case ALGORITHM_RIGHT_SYMMETRIC_6: 5278 if (raid_disk == raid_disks - 1) 5279 return 1; 5280 } 5281 return 0; 5282 } 5283 5284 static int run(struct mddev *mddev) 5285 { 5286 struct r5conf *conf; 5287 int working_disks = 0; 5288 int dirty_parity_disks = 0; 5289 struct md_rdev *rdev; 5290 sector_t reshape_offset = 0; 5291 int i; 5292 long long min_offset_diff = 0; 5293 int first = 1; 5294 5295 if (mddev->recovery_cp != MaxSector) 5296 printk(KERN_NOTICE "md/raid:%s: not clean" 5297 " -- starting background reconstruction\n", 5298 mdname(mddev)); 5299 5300 rdev_for_each(rdev, mddev) { 5301 long long diff; 5302 if (rdev->raid_disk < 0) 5303 continue; 5304 diff = (rdev->new_data_offset - rdev->data_offset); 5305 if (first) { 5306 min_offset_diff = diff; 5307 first = 0; 5308 } else if 
(mddev->reshape_backwards && 5309 diff < min_offset_diff) 5310 min_offset_diff = diff; 5311 else if (!mddev->reshape_backwards && 5312 diff > min_offset_diff) 5313 min_offset_diff = diff; 5314 } 5315 5316 if (mddev->reshape_position != MaxSector) { 5317 /* Check that we can continue the reshape. 5318 * Difficulties arise if the stripe we would write to 5319 * next is at or after the stripe we would read from next. 5320 * For a reshape that changes the number of devices, this 5321 * is only possible for a very short time, and mdadm makes 5322 * sure that time appears to have passed before assembling 5323 * the array. So we fail if that time hasn't passed. 5324 * For a reshape that keeps the number of devices the same 5325 * mdadm must be monitoring the reshape and keeping the 5326 * critical areas read-only and backed up. It will start 5327 * the array in read-only mode, so we check for that. 5328 */ 5329 sector_t here_new, here_old; 5330 int old_disks; 5331 int max_degraded = (mddev->level == 6 ? 2 : 1); 5332 5333 if (mddev->new_level != mddev->level) { 5334 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5335 "required - aborting.\n", 5336 mdname(mddev)); 5337 return -EINVAL; 5338 } 5339 old_disks = mddev->raid_disks - mddev->delta_disks; 5340 /* reshape_position must be on a new-stripe boundary, and one 5341 * further up in new geometry must map after here in old 5342 * geometry. 5343 */ 5344 here_new = mddev->reshape_position; 5345 if (sector_div(here_new, mddev->new_chunk_sectors * 5346 (mddev->raid_disks - max_degraded))) { 5347 printk(KERN_ERR "md/raid:%s: reshape_position not " 5348 "on a stripe boundary\n", mdname(mddev)); 5349 return -EINVAL; 5350 } 5351 reshape_offset = here_new * mddev->new_chunk_sectors; 5352 /* here_new is the stripe we will write to */ 5353 here_old = mddev->reshape_position; 5354 sector_div(here_old, mddev->chunk_sectors * 5355 (old_disks-max_degraded)); 5356 /* here_old is the first stripe that we might need to read 5357 * from */ 5358 if (mddev->delta_disks == 0) { 5359 if ((here_new * mddev->new_chunk_sectors != 5360 here_old * mddev->chunk_sectors)) { 5361 printk(KERN_ERR "md/raid:%s: reshape position is" 5362 " confused - aborting\n", mdname(mddev)); 5363 return -EINVAL; 5364 } 5365 /* We cannot be sure it is safe to start an in-place 5366 * reshape. It is only safe if user-space is monitoring 5367 * and taking constant backups. 5368 * mdadm always starts a situation like this in 5369 * readonly mode so it can take control before 5370 * allowing any writes. So just check for that. 5371 */ 5372 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5373 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5374 /* not really in-place - so OK */; 5375 else if (mddev->ro == 0) { 5376 printk(KERN_ERR "md/raid:%s: in-place reshape " 5377 "must be started in read-only mode " 5378 "- aborting\n", 5379 mdname(mddev)); 5380 return -EINVAL; 5381 } 5382 } else if (mddev->reshape_backwards 5383 ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5384 here_old * mddev->chunk_sectors) 5385 : (here_new * mddev->new_chunk_sectors >= 5386 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5387 /* Reading from the same stripe as writing to - bad */ 5388 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5389 "auto-recovery - aborting.\n", 5390 mdname(mddev)); 5391 return -EINVAL; 5392 } 5393 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5394 mdname(mddev)); 5395 /* OK, we should be able to continue; */ 5396 } else { 5397 BUG_ON(mddev->level != mddev->new_level); 5398 BUG_ON(mddev->layout != mddev->new_layout); 5399 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5400 BUG_ON(mddev->delta_disks != 0); 5401 } 5402 5403 if (mddev->private == NULL) 5404 conf = setup_conf(mddev); 5405 else 5406 conf = mddev->private; 5407 5408 if (IS_ERR(conf)) 5409 return PTR_ERR(conf); 5410 5411 conf->min_offset_diff = min_offset_diff; 5412 mddev->thread = conf->thread; 5413 conf->thread = NULL; 5414 mddev->private = conf; 5415 5416 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5417 i++) { 5418 rdev = conf->disks[i].rdev; 5419 if (!rdev && conf->disks[i].replacement) { 5420 /* The replacement is all we have yet */ 5421 rdev = conf->disks[i].replacement; 5422 conf->disks[i].replacement = NULL; 5423 clear_bit(Replacement, &rdev->flags); 5424 conf->disks[i].rdev = rdev; 5425 } 5426 if (!rdev) 5427 continue; 5428 if (conf->disks[i].replacement && 5429 conf->reshape_progress != MaxSector) { 5430 /* replacements and reshape simply do not mix. */ 5431 printk(KERN_ERR "md: cannot handle concurrent " 5432 "replacement and reshape.\n"); 5433 goto abort; 5434 } 5435 if (test_bit(In_sync, &rdev->flags)) { 5436 working_disks++; 5437 continue; 5438 } 5439 /* This disk is not fully in-sync. However if it 5440 * just stores parity (beyond the recovery_offset), 5441 * then we don't need to be concerned about the 5442 * array being dirty. 5443 * When reshape goes 'backwards', we never have 5444 * partially completed devices, so we only need 5445 * to worry about reshape going forwards. 5446 */ 5447 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5448 if (mddev->major_version == 0 && 5449 mddev->minor_version > 90) 5450 rdev->recovery_offset = reshape_offset; 5451 5452 if (rdev->recovery_offset < reshape_offset) { 5453 /* We need to check old and new layout */ 5454 if (!only_parity(rdev->raid_disk, 5455 conf->algorithm, 5456 conf->raid_disks, 5457 conf->max_degraded)) 5458 continue; 5459 } 5460 if (!only_parity(rdev->raid_disk, 5461 conf->prev_algo, 5462 conf->previous_raid_disks, 5463 conf->max_degraded)) 5464 continue; 5465 dirty_parity_disks++; 5466 } 5467 5468 /* 5469 * 0 for a fully functional array, 1 or 2 for a degraded array.
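 * (calc_degraded() counts devices that are missing or not In_sync,
 * allowing for both old and new geometry while a reshape is in
 * progress.)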
5470 */ 5471 mddev->degraded = calc_degraded(conf); 5472 5473 if (has_failed(conf)) { 5474 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5475 " (%d/%d failed)\n", 5476 mdname(mddev), mddev->degraded, conf->raid_disks); 5477 goto abort; 5478 } 5479 5480 /* device size must be a multiple of chunk size */ 5481 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5482 mddev->resync_max_sectors = mddev->dev_sectors; 5483 5484 if (mddev->degraded > dirty_parity_disks && 5485 mddev->recovery_cp != MaxSector) { 5486 if (mddev->ok_start_degraded) 5487 printk(KERN_WARNING 5488 "md/raid:%s: starting dirty degraded array" 5489 " - data corruption possible.\n", 5490 mdname(mddev)); 5491 else { 5492 printk(KERN_ERR 5493 "md/raid:%s: cannot start dirty degraded array.\n", 5494 mdname(mddev)); 5495 goto abort; 5496 } 5497 } 5498 5499 if (mddev->degraded == 0) 5500 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5501 " devices, algorithm %d\n", mdname(mddev), conf->level, 5502 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5503 mddev->new_layout); 5504 else 5505 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5506 " out of %d devices, algorithm %d\n", 5507 mdname(mddev), conf->level, 5508 mddev->raid_disks - mddev->degraded, 5509 mddev->raid_disks, mddev->new_layout); 5510 5511 print_raid5_conf(conf); 5512 5513 if (conf->reshape_progress != MaxSector) { 5514 conf->reshape_safe = conf->reshape_progress; 5515 atomic_set(&conf->reshape_stripes, 0); 5516 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5517 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5518 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5519 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5520 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5521 "reshape"); 5522 } 5523 5524 5525 /* Ok, everything is just fine now */ 5526 if (mddev->to_remove == &raid5_attrs_group) 5527 mddev->to_remove = NULL; 5528 else if (mddev->kobj.sd && 5529 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5530 printk(KERN_WARNING 5531 "raid5: failed to create sysfs attributes for %s\n", 5532 mdname(mddev)); 5533 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5534 5535 if (mddev->queue) { 5536 int chunk_size; 5537 bool discard_supported = true; 5538 /* read-ahead size must cover two whole stripes, which 5539 * is 2 * (datadisks) * chunksize, where 'datadisks' is 5540 * the number of raid devices minus the parity devices 5541 */ 5542 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5543 int stripe = data_disks * 5544 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5545 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5546 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5547 5548 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5549 5550 mddev->queue->backing_dev_info.congested_data = mddev; 5551 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5552 5553 chunk_size = mddev->chunk_sectors << 9; 5554 blk_queue_io_min(mddev->queue, chunk_size); 5555 blk_queue_io_opt(mddev->queue, chunk_size * 5556 (conf->raid_disks - conf->max_degraded)); 5557 /* 5558 * We can only discard a whole stripe.
		stripe = stripe * PAGE_SIZE;
		/* Round up to power of 2, as discard handling
		 * currently assumes that */
		while ((stripe-1) & stripe)
			stripe = (stripe | (stripe-1)) + 1;
		mddev->queue->limits.discard_alignment = stripe;
		mddev->queue->limits.discard_granularity = stripe;
		/*
		 * unaligned part of discard request will be ignored, so can't
		 * guarantee discard_zeroes_data
		 */
		mddev->queue->limits.discard_zeroes_data = 0;

		rdev_for_each(rdev, mddev) {
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->new_data_offset << 9);
			/*
			 * discard_zeroes_data is required, otherwise data
			 * could be lost.  Consider a scenario: discard a
			 * stripe (the stripe could be inconsistent if
			 * discard_zeroes_data is 0); write one disk of the
			 * stripe (the stripe could be inconsistent again
			 * depending on which disks are used to calculate
			 * parity); the disk is broken; the stripe data of
			 * this disk is lost.
			 */
			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
			    !bdev_get_queue(rdev->bdev)->
						limits.discard_zeroes_data)
				discard_supported = false;
		}

		if (discard_supported &&
		    mddev->queue->limits.max_discard_sectors >= stripe &&
		    mddev->queue->limits.discard_granularity >= stripe)
			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
						mddev->queue);
		else
			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
						  mddev->queue);
	}

	return 0;
abort:
	md_unregister_thread(&mddev->thread);
	print_raid5_conf(conf);
	free_conf(conf);
	mddev->private = NULL;
	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
	return -EIO;
}

static int stop(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		mddev->queue->backing_dev_info.congested_fn = NULL;
	free_conf(conf);
	mddev->private = NULL;
	mddev->to_remove = &raid5_attrs_group;
	return 0;
}

static void status(struct seq_file *seq, struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	int i;

	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
		   mddev->chunk_sectors / 2, mddev->layout);
	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
		   conf->raid_disks - mddev->degraded);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			   conf->disks[i].rdev &&
			   test_bit(In_sync, &conf->disks[i].rdev->flags) ?
			   "U" : "_");
	seq_printf(seq, "]");
}

static void print_raid5_conf(struct r5conf *conf)
{
	int i;
	struct disk_info *tmp;

	printk(KERN_DEBUG "RAID conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
	       conf->raid_disks,
	       conf->raid_disks - conf->mddev->degraded);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->disks + i;
		if (tmp->rdev)
			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
			       i, !test_bit(Faulty, &tmp->rdev->flags),
			       bdevname(tmp->rdev->bdev, b));
	}
}

static int raid5_spare_active(struct mddev *mddev)
{
	int i;
	struct r5conf *conf = mddev->private;
	struct disk_info *tmp;
	int count = 0;
	unsigned long flags;

	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->disks + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* The replaced device is not technically
				 * faulty, but we need to be sure it gets
				 * removed and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && tmp->rdev->recovery_offset == MaxSector
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	print_raid5_conf(conf);
	return count;
}

static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct disk_info *p = conf->disks + number;

	print_raid5_conf(conf);
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (number >= conf->raid_disks &&
	    conf->reshape_progress == MaxSector)
		clear_bit(In_sync, &rdev->flags);

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove non-faulty devices if recovery
	 * isn't possible.  That is: if the device is healthy, recovery
	 * has not been disabled, the array has not already failed, and
	 * no separate replacement is queued for this slot, refuse the
	 * removal - pulling a working member out of a recoverable array
	 * would only make it degraded for no benefit.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != conf->recovery_disabled &&
	    !has_failed(conf) &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->raid_disks) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* lost the race, try later */
		err = -EBUSY;
		*rdevp = rdev;
	} else if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful
			   */
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case
		 */
		clear_bit(WantReplacement, &rdev->flags);
abort:

	print_raid5_conf(conf);
	return err;
}

static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	int err = -EEXIST;
	int disk;
	struct disk_info *p;
	int first = 0;
	int last = conf->raid_disks - 1;

	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
		first = rdev->saved_raid_disk;

	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (p->rdev == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			if (rdev->saved_raid_disk != disk)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			goto out;
		}
	}
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (test_bit(WantReplacement, &p->rdev->flags) &&
		    p->replacement == NULL) {
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}
	}
out:
	print_raid5_conf(conf);
	return err;
}

static int raid5_resize(struct mddev *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
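	 * (Worked example, purely illustrative: with 64KiB chunks
	 * (chunk_sectors = 128) a request of sectors = 1000000 is first
	 * rounded down to 999936 = 7812 * 128 below, and raid5_size()
	 * then scales that per-device size by the number of data disks
	 * to give the new array size.)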
	 */
	sector_t newsize;
	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, newsize);
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > mddev->dev_sectors) {
		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}

static int check_stripe_cache(struct mddev *mddev)
{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 4K stripe_heads will
	 * allow for chunk sizes up to 256K, which is probably OK.
	 * If the chunk size is greater, user-space should request more
	 * stripe_heads first.
	 */
	struct r5conf *conf = mddev->private;
	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->max_nr_stripes ||
	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->max_nr_stripes) {
		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
		       mdname(mddev),
		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
			/ STRIPE_SIZE) * 4);
		return 0;
	}
	return 1;
}

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	return resize_stripes(conf, (conf->previous_raid_disks
				     + mddev->delta_disks));
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of the
	 * array_size attribute.
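	 * (Rationale: a shrinking reshape with the size left on automatic
	 * would silently truncate the array; requiring an explicit
	 * array_size write first turns any data loss into a deliberate,
	 * two-step operation.)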
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		printk(KERN_ERR "md/raid:%s: array size must be reduced "
		       "before number of disks\n", mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
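		 * (E.g. growing 4 -> 6 devices with one member missing:
		 * the absent device is counted against the 6-disk
		 * geometry, so ->degraded reports 1 missing out of 6
		 * rather than out of 4.)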
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (n - max_degraded) * chunksize, where 'n' is the
		 * number of raid devices
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
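 * (Compare end_reshape() above, which runs in the reshape thread and
 * only touches 'conf': the mddev-level changes - capacity, layout,
 * chunk size - are deferred to this hook so that they happen under
 * mddev_lock.)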
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}


static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices?
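	 *
	 * (The chunk-size search below, illustrated with made-up numbers:
	 * starting at 128 sectors (64KiB), chunksect is halved until it
	 * divides array_sectors exactly.  For a 1000512-sector array,
	 * 128 does not divide it but 64 does (1000512 = 15633 * 64), so
	 * a 32KiB chunk would be chosen.)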
	 */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}


static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 * raid0 - if there is only one strip zone - make it a raid4 layout
	 * raid1 - if there are two drives.  We need to know the chunk size
	 * raid4 - trivial - just use a raid4 layout.
6297 * raid6 - Providing it is a *_6 layout 6298 */ 6299 if (mddev->level == 0) 6300 return raid45_takeover_raid0(mddev, 5); 6301 if (mddev->level == 1) 6302 return raid5_takeover_raid1(mddev); 6303 if (mddev->level == 4) { 6304 mddev->new_layout = ALGORITHM_PARITY_N; 6305 mddev->new_level = 5; 6306 return setup_conf(mddev); 6307 } 6308 if (mddev->level == 6) 6309 return raid5_takeover_raid6(mddev); 6310 6311 return ERR_PTR(-EINVAL); 6312 } 6313 6314 static void *raid4_takeover(struct mddev *mddev) 6315 { 6316 /* raid4 can take over: 6317 * raid0 - if there is only one strip zone 6318 * raid5 - if layout is right 6319 */ 6320 if (mddev->level == 0) 6321 return raid45_takeover_raid0(mddev, 4); 6322 if (mddev->level == 5 && 6323 mddev->layout == ALGORITHM_PARITY_N) { 6324 mddev->new_layout = 0; 6325 mddev->new_level = 4; 6326 return setup_conf(mddev); 6327 } 6328 return ERR_PTR(-EINVAL); 6329 } 6330 6331 static struct md_personality raid5_personality; 6332 6333 static void *raid6_takeover(struct mddev *mddev) 6334 { 6335 /* Currently can only take over a raid5. We map the 6336 * personality to an equivalent raid6 personality 6337 * with the Q block at the end. 6338 */ 6339 int new_layout; 6340 6341 if (mddev->pers != &raid5_personality) 6342 return ERR_PTR(-EINVAL); 6343 if (mddev->degraded > 1) 6344 return ERR_PTR(-EINVAL); 6345 if (mddev->raid_disks > 253) 6346 return ERR_PTR(-EINVAL); 6347 if (mddev->raid_disks < 3) 6348 return ERR_PTR(-EINVAL); 6349 6350 switch (mddev->layout) { 6351 case ALGORITHM_LEFT_ASYMMETRIC: 6352 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 6353 break; 6354 case ALGORITHM_RIGHT_ASYMMETRIC: 6355 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 6356 break; 6357 case ALGORITHM_LEFT_SYMMETRIC: 6358 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 6359 break; 6360 case ALGORITHM_RIGHT_SYMMETRIC: 6361 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 6362 break; 6363 case ALGORITHM_PARITY_0: 6364 new_layout = ALGORITHM_PARITY_0_6; 6365 break; 6366 case ALGORITHM_PARITY_N: 6367 new_layout = ALGORITHM_PARITY_N; 6368 break; 6369 default: 6370 return ERR_PTR(-EINVAL); 6371 } 6372 mddev->new_level = 6; 6373 mddev->new_layout = new_layout; 6374 mddev->delta_disks = 1; 6375 mddev->raid_disks += 1; 6376 return setup_conf(mddev); 6377 } 6378 6379 6380 static struct md_personality raid6_personality = 6381 { 6382 .name = "raid6", 6383 .level = 6, 6384 .owner = THIS_MODULE, 6385 .make_request = make_request, 6386 .run = run, 6387 .stop = stop, 6388 .status = status, 6389 .error_handler = error, 6390 .hot_add_disk = raid5_add_disk, 6391 .hot_remove_disk= raid5_remove_disk, 6392 .spare_active = raid5_spare_active, 6393 .sync_request = sync_request, 6394 .resize = raid5_resize, 6395 .size = raid5_size, 6396 .check_reshape = raid6_check_reshape, 6397 .start_reshape = raid5_start_reshape, 6398 .finish_reshape = raid5_finish_reshape, 6399 .quiesce = raid5_quiesce, 6400 .takeover = raid6_takeover, 6401 }; 6402 static struct md_personality raid5_personality = 6403 { 6404 .name = "raid5", 6405 .level = 5, 6406 .owner = THIS_MODULE, 6407 .make_request = make_request, 6408 .run = run, 6409 .stop = stop, 6410 .status = status, 6411 .error_handler = error, 6412 .hot_add_disk = raid5_add_disk, 6413 .hot_remove_disk= raid5_remove_disk, 6414 .spare_active = raid5_spare_active, 6415 .sync_request = sync_request, 6416 .resize = raid5_resize, 6417 .size = raid5_size, 6418 .check_reshape = raid5_check_reshape, 6419 .start_reshape = raid5_start_reshape, 6420 .finish_reshape = raid5_finish_reshape, 6421 .quiesce = 
	.takeover	= raid5_takeover,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules; they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");