/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->bm_write is the number of the last batch successfully written.
 * conf->bm_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is bm_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/kthread.h>
#include <asm/atomic.h>
#include "raid6.h"

#include <linux/raid/bitmap.h>
#include <linux/async_tx.h>

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)

/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

#ifdef DEBUG
#define inline
#define __inline__
#endif

#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))

#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
#endif

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (raid5_conf_t *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		BUG_ON(!list_empty(&sh->lru));
		BUG_ON(atomic_read(&conf->active_stripes)==0);
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state)) {
				list_add_tail(&sh->lru, &conf->delayed_list);
				blk_plug_device(conf->mddev->queue);
			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				   sh->bm_seq - conf->seq_write > 0) {
				list_add_tail(&sh->lru, &conf->bitmap_list);
				blk_plug_device(conf->mddev->queue);
			} else {
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			BUG_ON(stripe_operations_active(sh));
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
				if (conf->retry_read_aligned)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
}
static void
release_stripe(struct stripe_head *sh) 202 { 203 raid5_conf_t *conf = sh->raid_conf; 204 unsigned long flags; 205 206 spin_lock_irqsave(&conf->device_lock, flags); 207 __release_stripe(conf, sh); 208 spin_unlock_irqrestore(&conf->device_lock, flags); 209 } 210 211 static inline void remove_hash(struct stripe_head *sh) 212 { 213 pr_debug("remove_hash(), stripe %llu\n", 214 (unsigned long long)sh->sector); 215 216 hlist_del_init(&sh->hash); 217 } 218 219 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 220 { 221 struct hlist_head *hp = stripe_hash(conf, sh->sector); 222 223 pr_debug("insert_hash(), stripe %llu\n", 224 (unsigned long long)sh->sector); 225 226 CHECK_DEVLOCK(); 227 hlist_add_head(&sh->hash, hp); 228 } 229 230 231 /* find an idle stripe, make sure it is unhashed, and return it. */ 232 static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 233 { 234 struct stripe_head *sh = NULL; 235 struct list_head *first; 236 237 CHECK_DEVLOCK(); 238 if (list_empty(&conf->inactive_list)) 239 goto out; 240 first = conf->inactive_list.next; 241 sh = list_entry(first, struct stripe_head, lru); 242 list_del_init(first); 243 remove_hash(sh); 244 atomic_inc(&conf->active_stripes); 245 out: 246 return sh; 247 } 248 249 static void shrink_buffers(struct stripe_head *sh, int num) 250 { 251 struct page *p; 252 int i; 253 254 for (i=0; i<num ; i++) { 255 p = sh->dev[i].page; 256 if (!p) 257 continue; 258 sh->dev[i].page = NULL; 259 put_page(p); 260 } 261 } 262 263 static int grow_buffers(struct stripe_head *sh, int num) 264 { 265 int i; 266 267 for (i=0; i<num; i++) { 268 struct page *page; 269 270 if (!(page = alloc_page(GFP_KERNEL))) { 271 return 1; 272 } 273 sh->dev[i].page = page; 274 } 275 return 0; 276 } 277 278 static void raid5_build_block (struct stripe_head *sh, int i); 279 280 static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) 281 { 282 raid5_conf_t *conf = sh->raid_conf; 283 int i; 284 285 BUG_ON(atomic_read(&sh->count) != 0); 286 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 287 BUG_ON(stripe_operations_active(sh)); 288 289 CHECK_DEVLOCK(); 290 pr_debug("init_stripe called, stripe %llu\n", 291 (unsigned long long)sh->sector); 292 293 remove_hash(sh); 294 295 sh->sector = sector; 296 sh->pd_idx = pd_idx; 297 sh->state = 0; 298 299 sh->disks = disks; 300 301 for (i = sh->disks; i--; ) { 302 struct r5dev *dev = &sh->dev[i]; 303 304 if (dev->toread || dev->read || dev->towrite || dev->written || 305 test_bit(R5_LOCKED, &dev->flags)) { 306 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 307 (unsigned long long)sh->sector, i, dev->toread, 308 dev->read, dev->towrite, dev->written, 309 test_bit(R5_LOCKED, &dev->flags)); 310 BUG(); 311 } 312 dev->flags = 0; 313 raid5_build_block(sh, i); 314 } 315 insert_hash(conf, sh); 316 } 317 318 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) 319 { 320 struct stripe_head *sh; 321 struct hlist_node *hn; 322 323 CHECK_DEVLOCK(); 324 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 325 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 326 if (sh->sector == sector && sh->disks == disks) 327 return sh; 328 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 329 return NULL; 330 } 331 332 static void unplug_slaves(mddev_t *mddev); 333 static void raid5_unplug_device(struct request_queue *q); 334 335 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, 336 int 
pd_idx, int noblock) 337 { 338 struct stripe_head *sh; 339 340 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 341 342 spin_lock_irq(&conf->device_lock); 343 344 do { 345 wait_event_lock_irq(conf->wait_for_stripe, 346 conf->quiesce == 0, 347 conf->device_lock, /* nothing */); 348 sh = __find_stripe(conf, sector, disks); 349 if (!sh) { 350 if (!conf->inactive_blocked) 351 sh = get_free_stripe(conf); 352 if (noblock && sh == NULL) 353 break; 354 if (!sh) { 355 conf->inactive_blocked = 1; 356 wait_event_lock_irq(conf->wait_for_stripe, 357 !list_empty(&conf->inactive_list) && 358 (atomic_read(&conf->active_stripes) 359 < (conf->max_nr_stripes *3/4) 360 || !conf->inactive_blocked), 361 conf->device_lock, 362 raid5_unplug_device(conf->mddev->queue) 363 ); 364 conf->inactive_blocked = 0; 365 } else 366 init_stripe(sh, sector, pd_idx, disks); 367 } else { 368 if (atomic_read(&sh->count)) { 369 BUG_ON(!list_empty(&sh->lru)); 370 } else { 371 if (!test_bit(STRIPE_HANDLE, &sh->state)) 372 atomic_inc(&conf->active_stripes); 373 if (list_empty(&sh->lru) && 374 !test_bit(STRIPE_EXPANDING, &sh->state)) 375 BUG(); 376 list_del_init(&sh->lru); 377 } 378 } 379 } while (sh == NULL); 380 381 if (sh) 382 atomic_inc(&sh->count); 383 384 spin_unlock_irq(&conf->device_lock); 385 return sh; 386 } 387 388 static void 389 raid5_end_read_request(struct bio *bi, int error); 390 static void 391 raid5_end_write_request(struct bio *bi, int error); 392 393 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 394 { 395 raid5_conf_t *conf = sh->raid_conf; 396 int i, disks = sh->disks; 397 398 might_sleep(); 399 400 for (i = disks; i--; ) { 401 int rw; 402 struct bio *bi; 403 mdk_rdev_t *rdev; 404 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 405 rw = WRITE; 406 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 407 rw = READ; 408 else 409 continue; 410 411 bi = &sh->dev[i].req; 412 413 bi->bi_rw = rw; 414 if (rw == WRITE) 415 bi->bi_end_io = raid5_end_write_request; 416 else 417 bi->bi_end_io = raid5_end_read_request; 418 419 rcu_read_lock(); 420 rdev = rcu_dereference(conf->disks[i].rdev); 421 if (rdev && test_bit(Faulty, &rdev->flags)) 422 rdev = NULL; 423 if (rdev) 424 atomic_inc(&rdev->nr_pending); 425 rcu_read_unlock(); 426 427 if (rdev) { 428 if (s->syncing || s->expanding || s->expanded) 429 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 430 431 set_bit(STRIPE_IO_STARTED, &sh->state); 432 433 bi->bi_bdev = rdev->bdev; 434 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 435 __func__, (unsigned long long)sh->sector, 436 bi->bi_rw, i); 437 atomic_inc(&sh->count); 438 bi->bi_sector = sh->sector + rdev->data_offset; 439 bi->bi_flags = 1 << BIO_UPTODATE; 440 bi->bi_vcnt = 1; 441 bi->bi_max_vecs = 1; 442 bi->bi_idx = 0; 443 bi->bi_io_vec = &sh->dev[i].vec; 444 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 445 bi->bi_io_vec[0].bv_offset = 0; 446 bi->bi_size = STRIPE_SIZE; 447 bi->bi_next = NULL; 448 if (rw == WRITE && 449 test_bit(R5_ReWrite, &sh->dev[i].flags)) 450 atomic_add(STRIPE_SECTORS, 451 &rdev->corrected_errors); 452 generic_make_request(bi); 453 } else { 454 if (rw == WRITE) 455 set_bit(STRIPE_DEGRADED, &sh->state); 456 pr_debug("skip op %ld on disc %d for sector %llu\n", 457 bi->bi_rw, i, (unsigned long long)sh->sector); 458 clear_bit(R5_LOCKED, &sh->dev[i].flags); 459 set_bit(STRIPE_HANDLE, &sh->state); 460 } 461 } 462 } 463 464 static struct dma_async_tx_descriptor * 465 async_copy_data(int frombio, struct bio *bio, struct page *page, 466 sector_t sector, 
struct dma_async_tx_descriptor *tx) 467 { 468 struct bio_vec *bvl; 469 struct page *bio_page; 470 int i; 471 int page_offset; 472 473 if (bio->bi_sector >= sector) 474 page_offset = (signed)(bio->bi_sector - sector) * 512; 475 else 476 page_offset = (signed)(sector - bio->bi_sector) * -512; 477 bio_for_each_segment(bvl, bio, i) { 478 int len = bio_iovec_idx(bio, i)->bv_len; 479 int clen; 480 int b_offset = 0; 481 482 if (page_offset < 0) { 483 b_offset = -page_offset; 484 page_offset += b_offset; 485 len -= b_offset; 486 } 487 488 if (len > 0 && page_offset + len > STRIPE_SIZE) 489 clen = STRIPE_SIZE - page_offset; 490 else 491 clen = len; 492 493 if (clen > 0) { 494 b_offset += bio_iovec_idx(bio, i)->bv_offset; 495 bio_page = bio_iovec_idx(bio, i)->bv_page; 496 if (frombio) 497 tx = async_memcpy(page, bio_page, page_offset, 498 b_offset, clen, 499 ASYNC_TX_DEP_ACK, 500 tx, NULL, NULL); 501 else 502 tx = async_memcpy(bio_page, page, b_offset, 503 page_offset, clen, 504 ASYNC_TX_DEP_ACK, 505 tx, NULL, NULL); 506 } 507 if (clen < len) /* hit end of page */ 508 break; 509 page_offset += len; 510 } 511 512 return tx; 513 } 514 515 static void ops_complete_biofill(void *stripe_head_ref) 516 { 517 struct stripe_head *sh = stripe_head_ref; 518 struct bio *return_bi = NULL; 519 raid5_conf_t *conf = sh->raid_conf; 520 int i; 521 522 pr_debug("%s: stripe %llu\n", __func__, 523 (unsigned long long)sh->sector); 524 525 /* clear completed biofills */ 526 spin_lock_irq(&conf->device_lock); 527 for (i = sh->disks; i--; ) { 528 struct r5dev *dev = &sh->dev[i]; 529 530 /* acknowledge completion of a biofill operation */ 531 /* and check if we need to reply to a read request, 532 * new R5_Wantfill requests are held off until 533 * !STRIPE_BIOFILL_RUN 534 */ 535 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 536 struct bio *rbi, *rbi2; 537 538 BUG_ON(!dev->read); 539 rbi = dev->read; 540 dev->read = NULL; 541 while (rbi && rbi->bi_sector < 542 dev->sector + STRIPE_SECTORS) { 543 rbi2 = r5_next_bio(rbi, dev->sector); 544 if (!raid5_dec_bi_phys_segments(rbi)) { 545 rbi->bi_next = return_bi; 546 return_bi = rbi; 547 } 548 rbi = rbi2; 549 } 550 } 551 } 552 spin_unlock_irq(&conf->device_lock); 553 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 554 555 return_io(return_bi); 556 557 set_bit(STRIPE_HANDLE, &sh->state); 558 release_stripe(sh); 559 } 560 561 static void ops_run_biofill(struct stripe_head *sh) 562 { 563 struct dma_async_tx_descriptor *tx = NULL; 564 raid5_conf_t *conf = sh->raid_conf; 565 int i; 566 567 pr_debug("%s: stripe %llu\n", __func__, 568 (unsigned long long)sh->sector); 569 570 for (i = sh->disks; i--; ) { 571 struct r5dev *dev = &sh->dev[i]; 572 if (test_bit(R5_Wantfill, &dev->flags)) { 573 struct bio *rbi; 574 spin_lock_irq(&conf->device_lock); 575 dev->read = rbi = dev->toread; 576 dev->toread = NULL; 577 spin_unlock_irq(&conf->device_lock); 578 while (rbi && rbi->bi_sector < 579 dev->sector + STRIPE_SECTORS) { 580 tx = async_copy_data(0, rbi, dev->page, 581 dev->sector, tx); 582 rbi = r5_next_bio(rbi, dev->sector); 583 } 584 } 585 } 586 587 atomic_inc(&sh->count); 588 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 589 ops_complete_biofill, sh); 590 } 591 592 static void ops_complete_compute5(void *stripe_head_ref) 593 { 594 struct stripe_head *sh = stripe_head_ref; 595 int target = sh->ops.target; 596 struct r5dev *tgt = &sh->dev[target]; 597 598 pr_debug("%s: stripe %llu\n", __func__, 599 (unsigned long long)sh->sector); 600 601 set_bit(R5_UPTODATE, &tgt->flags); 602 
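	/* The freshly computed block was marked R5_UPTODATE above; the
	 * R5_Wantcompute request that drove this computation is acknowledged
	 * and cleared below so that handle_stripe() can use the block.
	 */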
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 603 clear_bit(R5_Wantcompute, &tgt->flags); 604 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 605 if (sh->check_state == check_state_compute_run) 606 sh->check_state = check_state_compute_result; 607 set_bit(STRIPE_HANDLE, &sh->state); 608 release_stripe(sh); 609 } 610 611 static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 612 { 613 /* kernel stack size limits the total number of disks */ 614 int disks = sh->disks; 615 struct page *xor_srcs[disks]; 616 int target = sh->ops.target; 617 struct r5dev *tgt = &sh->dev[target]; 618 struct page *xor_dest = tgt->page; 619 int count = 0; 620 struct dma_async_tx_descriptor *tx; 621 int i; 622 623 pr_debug("%s: stripe %llu block: %d\n", 624 __func__, (unsigned long long)sh->sector, target); 625 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 626 627 for (i = disks; i--; ) 628 if (i != target) 629 xor_srcs[count++] = sh->dev[i].page; 630 631 atomic_inc(&sh->count); 632 633 if (unlikely(count == 1)) 634 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 635 0, NULL, ops_complete_compute5, sh); 636 else 637 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 638 ASYNC_TX_XOR_ZERO_DST, NULL, 639 ops_complete_compute5, sh); 640 641 return tx; 642 } 643 644 static void ops_complete_prexor(void *stripe_head_ref) 645 { 646 struct stripe_head *sh = stripe_head_ref; 647 648 pr_debug("%s: stripe %llu\n", __func__, 649 (unsigned long long)sh->sector); 650 } 651 652 static struct dma_async_tx_descriptor * 653 ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 654 { 655 /* kernel stack size limits the total number of disks */ 656 int disks = sh->disks; 657 struct page *xor_srcs[disks]; 658 int count = 0, pd_idx = sh->pd_idx, i; 659 660 /* existing parity data subtracted */ 661 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 662 663 pr_debug("%s: stripe %llu\n", __func__, 664 (unsigned long long)sh->sector); 665 666 for (i = disks; i--; ) { 667 struct r5dev *dev = &sh->dev[i]; 668 /* Only process blocks that are known to be uptodate */ 669 if (test_bit(R5_Wantdrain, &dev->flags)) 670 xor_srcs[count++] = dev->page; 671 } 672 673 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 674 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 675 ops_complete_prexor, sh); 676 677 return tx; 678 } 679 680 static struct dma_async_tx_descriptor * 681 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 682 { 683 int disks = sh->disks; 684 int i; 685 686 pr_debug("%s: stripe %llu\n", __func__, 687 (unsigned long long)sh->sector); 688 689 for (i = disks; i--; ) { 690 struct r5dev *dev = &sh->dev[i]; 691 struct bio *chosen; 692 693 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 694 struct bio *wbi; 695 696 spin_lock(&sh->lock); 697 chosen = dev->towrite; 698 dev->towrite = NULL; 699 BUG_ON(dev->written); 700 wbi = dev->written = chosen; 701 spin_unlock(&sh->lock); 702 703 while (wbi && wbi->bi_sector < 704 dev->sector + STRIPE_SECTORS) { 705 tx = async_copy_data(1, wbi, dev->page, 706 dev->sector, tx); 707 wbi = r5_next_bio(wbi, dev->sector); 708 } 709 } 710 } 711 712 return tx; 713 } 714 715 static void ops_complete_postxor(void *stripe_head_ref) 716 { 717 struct stripe_head *sh = stripe_head_ref; 718 int disks = sh->disks, i, pd_idx = sh->pd_idx; 719 720 pr_debug("%s: stripe %llu\n", __func__, 721 (unsigned long long)sh->sector); 722 723 for (i = disks; i--; ) { 724 struct r5dev *dev = &sh->dev[i]; 725 if (dev->written 
|| i == pd_idx) 726 set_bit(R5_UPTODATE, &dev->flags); 727 } 728 729 if (sh->reconstruct_state == reconstruct_state_drain_run) 730 sh->reconstruct_state = reconstruct_state_drain_result; 731 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 732 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 733 else { 734 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 735 sh->reconstruct_state = reconstruct_state_result; 736 } 737 738 set_bit(STRIPE_HANDLE, &sh->state); 739 release_stripe(sh); 740 } 741 742 static void 743 ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 744 { 745 /* kernel stack size limits the total number of disks */ 746 int disks = sh->disks; 747 struct page *xor_srcs[disks]; 748 749 int count = 0, pd_idx = sh->pd_idx, i; 750 struct page *xor_dest; 751 int prexor = 0; 752 unsigned long flags; 753 754 pr_debug("%s: stripe %llu\n", __func__, 755 (unsigned long long)sh->sector); 756 757 /* check if prexor is active which means only process blocks 758 * that are part of a read-modify-write (written) 759 */ 760 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 761 prexor = 1; 762 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 763 for (i = disks; i--; ) { 764 struct r5dev *dev = &sh->dev[i]; 765 if (dev->written) 766 xor_srcs[count++] = dev->page; 767 } 768 } else { 769 xor_dest = sh->dev[pd_idx].page; 770 for (i = disks; i--; ) { 771 struct r5dev *dev = &sh->dev[i]; 772 if (i != pd_idx) 773 xor_srcs[count++] = dev->page; 774 } 775 } 776 777 /* 1/ if we prexor'd then the dest is reused as a source 778 * 2/ if we did not prexor then we are redoing the parity 779 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 780 * for the synchronous xor case 781 */ 782 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 783 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 784 785 atomic_inc(&sh->count); 786 787 if (unlikely(count == 1)) { 788 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 789 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 790 flags, tx, ops_complete_postxor, sh); 791 } else 792 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 793 flags, tx, ops_complete_postxor, sh); 794 } 795 796 static void ops_complete_check(void *stripe_head_ref) 797 { 798 struct stripe_head *sh = stripe_head_ref; 799 800 pr_debug("%s: stripe %llu\n", __func__, 801 (unsigned long long)sh->sector); 802 803 sh->check_state = check_state_check_result; 804 set_bit(STRIPE_HANDLE, &sh->state); 805 release_stripe(sh); 806 } 807 808 static void ops_run_check(struct stripe_head *sh) 809 { 810 /* kernel stack size limits the total number of disks */ 811 int disks = sh->disks; 812 struct page *xor_srcs[disks]; 813 struct dma_async_tx_descriptor *tx; 814 815 int count = 0, pd_idx = sh->pd_idx, i; 816 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 817 818 pr_debug("%s: stripe %llu\n", __func__, 819 (unsigned long long)sh->sector); 820 821 for (i = disks; i--; ) { 822 struct r5dev *dev = &sh->dev[i]; 823 if (i != pd_idx) 824 xor_srcs[count++] = dev->page; 825 } 826 827 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 828 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 829 830 atomic_inc(&sh->count); 831 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 832 ops_complete_check, sh); 833 } 834 835 static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 836 { 837 int overlap_clear = 0, i, disks = sh->disks; 838 struct dma_async_tx_descriptor *tx = NULL; 839 840 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 841 ops_run_biofill(sh); 842 overlap_clear++; 843 } 844 845 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 846 tx = ops_run_compute5(sh); 847 /* terminate the chain if postxor is not set to be run */ 848 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 849 async_tx_ack(tx); 850 } 851 852 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 853 tx = ops_run_prexor(sh, tx); 854 855 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 856 tx = ops_run_biodrain(sh, tx); 857 overlap_clear++; 858 } 859 860 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 861 ops_run_postxor(sh, tx); 862 863 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 864 ops_run_check(sh); 865 866 if (overlap_clear) 867 for (i = disks; i--; ) { 868 struct r5dev *dev = &sh->dev[i]; 869 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 870 wake_up(&sh->raid_conf->wait_for_overlap); 871 } 872 } 873 874 static int grow_one_stripe(raid5_conf_t *conf) 875 { 876 struct stripe_head *sh; 877 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 878 if (!sh) 879 return 0; 880 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); 881 sh->raid_conf = conf; 882 spin_lock_init(&sh->lock); 883 884 if (grow_buffers(sh, conf->raid_disks)) { 885 shrink_buffers(sh, conf->raid_disks); 886 kmem_cache_free(conf->slab_cache, sh); 887 return 0; 888 } 889 sh->disks = conf->raid_disks; 890 /* we just created an active stripe so... 
	 */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct kmem_cache *sc;
	int devs = conf->raid_disks;

	sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
	sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
989 * OK, we have enough stripes, start collecting inactive 990 * stripes and copying them over 991 */ 992 list_for_each_entry(nsh, &newstripes, lru) { 993 spin_lock_irq(&conf->device_lock); 994 wait_event_lock_irq(conf->wait_for_stripe, 995 !list_empty(&conf->inactive_list), 996 conf->device_lock, 997 unplug_slaves(conf->mddev) 998 ); 999 osh = get_free_stripe(conf); 1000 spin_unlock_irq(&conf->device_lock); 1001 atomic_set(&nsh->count, 1); 1002 for(i=0; i<conf->pool_size; i++) 1003 nsh->dev[i].page = osh->dev[i].page; 1004 for( ; i<newsize; i++) 1005 nsh->dev[i].page = NULL; 1006 kmem_cache_free(conf->slab_cache, osh); 1007 } 1008 kmem_cache_destroy(conf->slab_cache); 1009 1010 /* Step 3. 1011 * At this point, we are holding all the stripes so the array 1012 * is completely stalled, so now is a good time to resize 1013 * conf->disks. 1014 */ 1015 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1016 if (ndisks) { 1017 for (i=0; i<conf->raid_disks; i++) 1018 ndisks[i] = conf->disks[i]; 1019 kfree(conf->disks); 1020 conf->disks = ndisks; 1021 } else 1022 err = -ENOMEM; 1023 1024 /* Step 4, return new stripes to service */ 1025 while(!list_empty(&newstripes)) { 1026 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1027 list_del_init(&nsh->lru); 1028 for (i=conf->raid_disks; i < newsize; i++) 1029 if (nsh->dev[i].page == NULL) { 1030 struct page *p = alloc_page(GFP_NOIO); 1031 nsh->dev[i].page = p; 1032 if (!p) 1033 err = -ENOMEM; 1034 } 1035 release_stripe(nsh); 1036 } 1037 /* critical section pass, GFP_NOIO no longer needed */ 1038 1039 conf->slab_cache = sc; 1040 conf->active_name = 1-conf->active_name; 1041 conf->pool_size = newsize; 1042 return err; 1043 } 1044 #endif 1045 1046 static int drop_one_stripe(raid5_conf_t *conf) 1047 { 1048 struct stripe_head *sh; 1049 1050 spin_lock_irq(&conf->device_lock); 1051 sh = get_free_stripe(conf); 1052 spin_unlock_irq(&conf->device_lock); 1053 if (!sh) 1054 return 0; 1055 BUG_ON(atomic_read(&sh->count)); 1056 shrink_buffers(sh, conf->pool_size); 1057 kmem_cache_free(conf->slab_cache, sh); 1058 atomic_dec(&conf->active_stripes); 1059 return 1; 1060 } 1061 1062 static void shrink_stripes(raid5_conf_t *conf) 1063 { 1064 while (drop_one_stripe(conf)) 1065 ; 1066 1067 if (conf->slab_cache) 1068 kmem_cache_destroy(conf->slab_cache); 1069 conf->slab_cache = NULL; 1070 } 1071 1072 static void raid5_end_read_request(struct bio * bi, int error) 1073 { 1074 struct stripe_head *sh = bi->bi_private; 1075 raid5_conf_t *conf = sh->raid_conf; 1076 int disks = sh->disks, i; 1077 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1078 char b[BDEVNAME_SIZE]; 1079 mdk_rdev_t *rdev; 1080 1081 1082 for (i=0 ; i<disks; i++) 1083 if (bi == &sh->dev[i].req) 1084 break; 1085 1086 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1087 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1088 uptodate); 1089 if (i == disks) { 1090 BUG(); 1091 return; 1092 } 1093 1094 if (uptodate) { 1095 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1096 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1097 rdev = conf->disks[i].rdev; 1098 printk_rl(KERN_INFO "raid5:%s: read error corrected" 1099 " (%lu sectors at %llu on %s)\n", 1100 mdname(conf->mddev), STRIPE_SECTORS, 1101 (unsigned long long)(sh->sector 1102 + rdev->data_offset), 1103 bdevname(rdev->bdev, b)); 1104 clear_bit(R5_ReadError, &sh->dev[i].flags); 1105 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1106 } 1107 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1108 
atomic_set(&conf->disks[i].rdev->read_errors, 0); 1109 } else { 1110 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1111 int retry = 0; 1112 rdev = conf->disks[i].rdev; 1113 1114 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1115 atomic_inc(&rdev->read_errors); 1116 if (conf->mddev->degraded) 1117 printk_rl(KERN_WARNING 1118 "raid5:%s: read error not correctable " 1119 "(sector %llu on %s).\n", 1120 mdname(conf->mddev), 1121 (unsigned long long)(sh->sector 1122 + rdev->data_offset), 1123 bdn); 1124 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1125 /* Oh, no!!! */ 1126 printk_rl(KERN_WARNING 1127 "raid5:%s: read error NOT corrected!! " 1128 "(sector %llu on %s).\n", 1129 mdname(conf->mddev), 1130 (unsigned long long)(sh->sector 1131 + rdev->data_offset), 1132 bdn); 1133 else if (atomic_read(&rdev->read_errors) 1134 > conf->max_nr_stripes) 1135 printk(KERN_WARNING 1136 "raid5:%s: Too many read errors, failing device %s.\n", 1137 mdname(conf->mddev), bdn); 1138 else 1139 retry = 1; 1140 if (retry) 1141 set_bit(R5_ReadError, &sh->dev[i].flags); 1142 else { 1143 clear_bit(R5_ReadError, &sh->dev[i].flags); 1144 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1145 md_error(conf->mddev, rdev); 1146 } 1147 } 1148 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1149 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1150 set_bit(STRIPE_HANDLE, &sh->state); 1151 release_stripe(sh); 1152 } 1153 1154 static void raid5_end_write_request (struct bio *bi, int error) 1155 { 1156 struct stripe_head *sh = bi->bi_private; 1157 raid5_conf_t *conf = sh->raid_conf; 1158 int disks = sh->disks, i; 1159 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1160 1161 for (i=0 ; i<disks; i++) 1162 if (bi == &sh->dev[i].req) 1163 break; 1164 1165 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1166 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1167 uptodate); 1168 if (i == disks) { 1169 BUG(); 1170 return; 1171 } 1172 1173 if (!uptodate) 1174 md_error(conf->mddev, conf->disks[i].rdev); 1175 1176 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1177 1178 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1179 set_bit(STRIPE_HANDLE, &sh->state); 1180 release_stripe(sh); 1181 } 1182 1183 1184 static sector_t compute_blocknr(struct stripe_head *sh, int i); 1185 1186 static void raid5_build_block (struct stripe_head *sh, int i) 1187 { 1188 struct r5dev *dev = &sh->dev[i]; 1189 1190 bio_init(&dev->req); 1191 dev->req.bi_io_vec = &dev->vec; 1192 dev->req.bi_vcnt++; 1193 dev->req.bi_max_vecs++; 1194 dev->vec.bv_page = dev->page; 1195 dev->vec.bv_len = STRIPE_SIZE; 1196 dev->vec.bv_offset = 0; 1197 1198 dev->req.bi_sector = sh->sector; 1199 dev->req.bi_private = sh; 1200 1201 dev->flags = 0; 1202 dev->sector = compute_blocknr(sh, i); 1203 } 1204 1205 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1206 { 1207 char b[BDEVNAME_SIZE]; 1208 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1209 pr_debug("raid5: error called\n"); 1210 1211 if (!test_bit(Faulty, &rdev->flags)) { 1212 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1213 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1214 unsigned long flags; 1215 spin_lock_irqsave(&conf->device_lock, flags); 1216 mddev->degraded++; 1217 spin_unlock_irqrestore(&conf->device_lock, flags); 1218 /* 1219 * if recovery was running, make sure it aborts. 
1220 */ 1221 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1222 } 1223 set_bit(Faulty, &rdev->flags); 1224 printk (KERN_ALERT 1225 "raid5: Disk failure on %s, disabling device.\n" 1226 "raid5: Operation continuing on %d devices.\n", 1227 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1228 } 1229 } 1230 1231 /* 1232 * Input: a 'big' sector number, 1233 * Output: index of the data and parity disk, and the sector # in them. 1234 */ 1235 static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, 1236 unsigned int data_disks, unsigned int * dd_idx, 1237 unsigned int * pd_idx, raid5_conf_t *conf) 1238 { 1239 long stripe; 1240 unsigned long chunk_number; 1241 unsigned int chunk_offset; 1242 sector_t new_sector; 1243 int sectors_per_chunk = conf->chunk_size >> 9; 1244 1245 /* First compute the information on this sector */ 1246 1247 /* 1248 * Compute the chunk number and the sector offset inside the chunk 1249 */ 1250 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1251 chunk_number = r_sector; 1252 BUG_ON(r_sector != chunk_number); 1253 1254 /* 1255 * Compute the stripe number 1256 */ 1257 stripe = chunk_number / data_disks; 1258 1259 /* 1260 * Compute the data disk and parity disk indexes inside the stripe 1261 */ 1262 *dd_idx = chunk_number % data_disks; 1263 1264 /* 1265 * Select the parity disk based on the user selected algorithm. 1266 */ 1267 switch(conf->level) { 1268 case 4: 1269 *pd_idx = data_disks; 1270 break; 1271 case 5: 1272 switch (conf->algorithm) { 1273 case ALGORITHM_LEFT_ASYMMETRIC: 1274 *pd_idx = data_disks - stripe % raid_disks; 1275 if (*dd_idx >= *pd_idx) 1276 (*dd_idx)++; 1277 break; 1278 case ALGORITHM_RIGHT_ASYMMETRIC: 1279 *pd_idx = stripe % raid_disks; 1280 if (*dd_idx >= *pd_idx) 1281 (*dd_idx)++; 1282 break; 1283 case ALGORITHM_LEFT_SYMMETRIC: 1284 *pd_idx = data_disks - stripe % raid_disks; 1285 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1286 break; 1287 case ALGORITHM_RIGHT_SYMMETRIC: 1288 *pd_idx = stripe % raid_disks; 1289 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1290 break; 1291 default: 1292 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1293 conf->algorithm); 1294 } 1295 break; 1296 case 6: 1297 1298 /**** FIX THIS ****/ 1299 switch (conf->algorithm) { 1300 case ALGORITHM_LEFT_ASYMMETRIC: 1301 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1302 if (*pd_idx == raid_disks-1) 1303 (*dd_idx)++; /* Q D D D P */ 1304 else if (*dd_idx >= *pd_idx) 1305 (*dd_idx) += 2; /* D D P Q D */ 1306 break; 1307 case ALGORITHM_RIGHT_ASYMMETRIC: 1308 *pd_idx = stripe % raid_disks; 1309 if (*pd_idx == raid_disks-1) 1310 (*dd_idx)++; /* Q D D D P */ 1311 else if (*dd_idx >= *pd_idx) 1312 (*dd_idx) += 2; /* D D P Q D */ 1313 break; 1314 case ALGORITHM_LEFT_SYMMETRIC: 1315 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1316 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1317 break; 1318 case ALGORITHM_RIGHT_SYMMETRIC: 1319 *pd_idx = stripe % raid_disks; 1320 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1321 break; 1322 default: 1323 printk (KERN_CRIT "raid6: unsupported algorithm %d\n", 1324 conf->algorithm); 1325 } 1326 break; 1327 } 1328 1329 /* 1330 * Finally, compute the new sector number 1331 */ 1332 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 1333 return new_sector; 1334 } 1335 1336 1337 static sector_t compute_blocknr(struct stripe_head *sh, int i) 1338 { 1339 raid5_conf_t *conf = sh->raid_conf; 1340 int raid_disks = sh->disks; 1341 int data_disks = raid_disks - conf->max_degraded; 1342 
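	/* compute_blocknr() is the inverse of raid5_compute_sector(): given a
	 * stripe and a device index it recovers the array-relative sector.
	 * The result is cross-checked below by re-running the forward mapping.
	 */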
sector_t new_sector = sh->sector, check; 1343 int sectors_per_chunk = conf->chunk_size >> 9; 1344 sector_t stripe; 1345 int chunk_offset; 1346 int chunk_number, dummy1, dummy2, dd_idx = i; 1347 sector_t r_sector; 1348 1349 1350 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1351 stripe = new_sector; 1352 BUG_ON(new_sector != stripe); 1353 1354 if (i == sh->pd_idx) 1355 return 0; 1356 switch(conf->level) { 1357 case 4: break; 1358 case 5: 1359 switch (conf->algorithm) { 1360 case ALGORITHM_LEFT_ASYMMETRIC: 1361 case ALGORITHM_RIGHT_ASYMMETRIC: 1362 if (i > sh->pd_idx) 1363 i--; 1364 break; 1365 case ALGORITHM_LEFT_SYMMETRIC: 1366 case ALGORITHM_RIGHT_SYMMETRIC: 1367 if (i < sh->pd_idx) 1368 i += raid_disks; 1369 i -= (sh->pd_idx + 1); 1370 break; 1371 default: 1372 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1373 conf->algorithm); 1374 } 1375 break; 1376 case 6: 1377 if (i == raid6_next_disk(sh->pd_idx, raid_disks)) 1378 return 0; /* It is the Q disk */ 1379 switch (conf->algorithm) { 1380 case ALGORITHM_LEFT_ASYMMETRIC: 1381 case ALGORITHM_RIGHT_ASYMMETRIC: 1382 if (sh->pd_idx == raid_disks-1) 1383 i--; /* Q D D D P */ 1384 else if (i > sh->pd_idx) 1385 i -= 2; /* D D P Q D */ 1386 break; 1387 case ALGORITHM_LEFT_SYMMETRIC: 1388 case ALGORITHM_RIGHT_SYMMETRIC: 1389 if (sh->pd_idx == raid_disks-1) 1390 i--; /* Q D D D P */ 1391 else { 1392 /* D D P Q D */ 1393 if (i < sh->pd_idx) 1394 i += raid_disks; 1395 i -= (sh->pd_idx + 2); 1396 } 1397 break; 1398 default: 1399 printk (KERN_CRIT "raid6: unsupported algorithm %d\n", 1400 conf->algorithm); 1401 } 1402 break; 1403 } 1404 1405 chunk_number = stripe * data_disks + i; 1406 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1407 1408 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); 1409 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { 1410 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1411 return 0; 1412 } 1413 return r_sector; 1414 } 1415 1416 1417 1418 /* 1419 * Copy data between a page in the stripe cache, and one or more bion 1420 * The page could align with the middle of the bio, or there could be 1421 * several bion, each with several bio_vecs, which cover part of the page 1422 * Multiple bion are linked together on bi_next. There may be extras 1423 * at the end of this list. We ignore them. 
1424 */ 1425 static void copy_data(int frombio, struct bio *bio, 1426 struct page *page, 1427 sector_t sector) 1428 { 1429 char *pa = page_address(page); 1430 struct bio_vec *bvl; 1431 int i; 1432 int page_offset; 1433 1434 if (bio->bi_sector >= sector) 1435 page_offset = (signed)(bio->bi_sector - sector) * 512; 1436 else 1437 page_offset = (signed)(sector - bio->bi_sector) * -512; 1438 bio_for_each_segment(bvl, bio, i) { 1439 int len = bio_iovec_idx(bio,i)->bv_len; 1440 int clen; 1441 int b_offset = 0; 1442 1443 if (page_offset < 0) { 1444 b_offset = -page_offset; 1445 page_offset += b_offset; 1446 len -= b_offset; 1447 } 1448 1449 if (len > 0 && page_offset + len > STRIPE_SIZE) 1450 clen = STRIPE_SIZE - page_offset; 1451 else clen = len; 1452 1453 if (clen > 0) { 1454 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 1455 if (frombio) 1456 memcpy(pa+page_offset, ba+b_offset, clen); 1457 else 1458 memcpy(ba+b_offset, pa+page_offset, clen); 1459 __bio_kunmap_atomic(ba, KM_USER0); 1460 } 1461 if (clen < len) /* hit end of page */ 1462 break; 1463 page_offset += len; 1464 } 1465 } 1466 1467 #define check_xor() do { \ 1468 if (count == MAX_XOR_BLOCKS) { \ 1469 xor_blocks(count, STRIPE_SIZE, dest, ptr);\ 1470 count = 0; \ 1471 } \ 1472 } while(0) 1473 1474 static void compute_parity6(struct stripe_head *sh, int method) 1475 { 1476 raid6_conf_t *conf = sh->raid_conf; 1477 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; 1478 struct bio *chosen; 1479 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1480 void *ptrs[disks]; 1481 1482 qd_idx = raid6_next_disk(pd_idx, disks); 1483 d0_idx = raid6_next_disk(qd_idx, disks); 1484 1485 pr_debug("compute_parity, stripe %llu, method %d\n", 1486 (unsigned long long)sh->sector, method); 1487 1488 switch(method) { 1489 case READ_MODIFY_WRITE: 1490 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ 1491 case RECONSTRUCT_WRITE: 1492 for (i= disks; i-- ;) 1493 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { 1494 chosen = sh->dev[i].towrite; 1495 sh->dev[i].towrite = NULL; 1496 1497 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1498 wake_up(&conf->wait_for_overlap); 1499 1500 BUG_ON(sh->dev[i].written); 1501 sh->dev[i].written = chosen; 1502 } 1503 break; 1504 case CHECK_PARITY: 1505 BUG(); /* Not implemented yet */ 1506 } 1507 1508 for (i = disks; i--;) 1509 if (sh->dev[i].written) { 1510 sector_t sector = sh->dev[i].sector; 1511 struct bio *wbi = sh->dev[i].written; 1512 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { 1513 copy_data(1, wbi, sh->dev[i].page, sector); 1514 wbi = r5_next_bio(wbi, sector); 1515 } 1516 1517 set_bit(R5_LOCKED, &sh->dev[i].flags); 1518 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1519 } 1520 1521 // switch(method) { 1522 // case RECONSTRUCT_WRITE: 1523 // case CHECK_PARITY: 1524 // case UPDATE_PARITY: 1525 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ 1526 /* FIX: Is this ordering of drives even remotely optimal? 
*/ 1527 count = 0; 1528 i = d0_idx; 1529 do { 1530 ptrs[count++] = page_address(sh->dev[i].page); 1531 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1532 printk("block %d/%d not uptodate on parity calc\n", i,count); 1533 i = raid6_next_disk(i, disks); 1534 } while ( i != d0_idx ); 1535 // break; 1536 // } 1537 1538 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); 1539 1540 switch(method) { 1541 case RECONSTRUCT_WRITE: 1542 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1543 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); 1544 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1545 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); 1546 break; 1547 case UPDATE_PARITY: 1548 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1549 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); 1550 break; 1551 } 1552 } 1553 1554 1555 /* Compute one missing block */ 1556 static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1557 { 1558 int i, count, disks = sh->disks; 1559 void *ptr[MAX_XOR_BLOCKS], *dest, *p; 1560 int pd_idx = sh->pd_idx; 1561 int qd_idx = raid6_next_disk(pd_idx, disks); 1562 1563 pr_debug("compute_block_1, stripe %llu, idx %d\n", 1564 (unsigned long long)sh->sector, dd_idx); 1565 1566 if ( dd_idx == qd_idx ) { 1567 /* We're actually computing the Q drive */ 1568 compute_parity6(sh, UPDATE_PARITY); 1569 } else { 1570 dest = page_address(sh->dev[dd_idx].page); 1571 if (!nozero) memset(dest, 0, STRIPE_SIZE); 1572 count = 0; 1573 for (i = disks ; i--; ) { 1574 if (i == dd_idx || i == qd_idx) 1575 continue; 1576 p = page_address(sh->dev[i].page); 1577 if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1578 ptr[count++] = p; 1579 else 1580 printk("compute_block() %d, stripe %llu, %d" 1581 " not present\n", dd_idx, 1582 (unsigned long long)sh->sector, i); 1583 1584 check_xor(); 1585 } 1586 if (count) 1587 xor_blocks(count, STRIPE_SIZE, dest, ptr); 1588 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1589 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 1590 } 1591 } 1592 1593 /* Compute two missing blocks */ 1594 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1595 { 1596 int i, count, disks = sh->disks; 1597 int pd_idx = sh->pd_idx; 1598 int qd_idx = raid6_next_disk(pd_idx, disks); 1599 int d0_idx = raid6_next_disk(qd_idx, disks); 1600 int faila, failb; 1601 1602 /* faila and failb are disk numbers relative to d0_idx */ 1603 /* pd_idx become disks-2 and qd_idx become disks-1 */ 1604 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; 1605 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; 1606 1607 BUG_ON(faila == failb); 1608 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1609 1610 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1611 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); 1612 1613 if ( failb == disks-1 ) { 1614 /* Q disk is one of the missing disks */ 1615 if ( faila == disks-2 ) { 1616 /* Missing P+Q, just recompute */ 1617 compute_parity6(sh, UPDATE_PARITY); 1618 return; 1619 } else { 1620 /* We're missing D+Q; recompute D from P */ 1621 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); 1622 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? 
*/ 1623 return; 1624 } 1625 } 1626 1627 /* We're missing D+P or D+D; build pointer table */ 1628 { 1629 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1630 void *ptrs[disks]; 1631 1632 count = 0; 1633 i = d0_idx; 1634 do { 1635 ptrs[count++] = page_address(sh->dev[i].page); 1636 i = raid6_next_disk(i, disks); 1637 if (i != dd_idx1 && i != dd_idx2 && 1638 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1639 printk("compute_2 with missing block %d/%d\n", count, i); 1640 } while ( i != d0_idx ); 1641 1642 if ( failb == disks-2 ) { 1643 /* We're missing D+P. */ 1644 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); 1645 } else { 1646 /* We're missing D+D. */ 1647 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); 1648 } 1649 1650 /* Both the above update both missing blocks */ 1651 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); 1652 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); 1653 } 1654 } 1655 1656 static void 1657 schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1658 int rcw, int expand) 1659 { 1660 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1661 1662 if (rcw) { 1663 /* if we are not expanding this is a proper write request, and 1664 * there will be bios with new data to be drained into the 1665 * stripe cache 1666 */ 1667 if (!expand) { 1668 sh->reconstruct_state = reconstruct_state_drain_run; 1669 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1670 } else 1671 sh->reconstruct_state = reconstruct_state_run; 1672 1673 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1674 1675 for (i = disks; i--; ) { 1676 struct r5dev *dev = &sh->dev[i]; 1677 1678 if (dev->towrite) { 1679 set_bit(R5_LOCKED, &dev->flags); 1680 set_bit(R5_Wantdrain, &dev->flags); 1681 if (!expand) 1682 clear_bit(R5_UPTODATE, &dev->flags); 1683 s->locked++; 1684 } 1685 } 1686 if (s->locked + 1 == disks) 1687 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1688 atomic_inc(&sh->raid_conf->pending_full_writes); 1689 } else { 1690 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1691 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1692 1693 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1694 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1695 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1696 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1697 1698 for (i = disks; i--; ) { 1699 struct r5dev *dev = &sh->dev[i]; 1700 if (i == pd_idx) 1701 continue; 1702 1703 if (dev->towrite && 1704 (test_bit(R5_UPTODATE, &dev->flags) || 1705 test_bit(R5_Wantcompute, &dev->flags))) { 1706 set_bit(R5_Wantdrain, &dev->flags); 1707 set_bit(R5_LOCKED, &dev->flags); 1708 clear_bit(R5_UPTODATE, &dev->flags); 1709 s->locked++; 1710 } 1711 } 1712 } 1713 1714 /* keep the parity disk locked while asynchronous operations 1715 * are in flight 1716 */ 1717 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1718 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1719 s->locked++; 1720 1721 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 1722 __func__, (unsigned long long)sh->sector, 1723 s->locked, s->ops_request); 1724 } 1725 1726 /* 1727 * Each stripe/dev can have one or more bion attached. 1728 * toread/towrite point to the first in a chain. 1729 * The bi_next chain must be in order. 
1730 */ 1731 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1732 { 1733 struct bio **bip; 1734 raid5_conf_t *conf = sh->raid_conf; 1735 int firstwrite=0; 1736 1737 pr_debug("adding bh b#%llu to stripe s#%llu\n", 1738 (unsigned long long)bi->bi_sector, 1739 (unsigned long long)sh->sector); 1740 1741 1742 spin_lock(&sh->lock); 1743 spin_lock_irq(&conf->device_lock); 1744 if (forwrite) { 1745 bip = &sh->dev[dd_idx].towrite; 1746 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 1747 firstwrite = 1; 1748 } else 1749 bip = &sh->dev[dd_idx].toread; 1750 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 1751 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 1752 goto overlap; 1753 bip = & (*bip)->bi_next; 1754 } 1755 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 1756 goto overlap; 1757 1758 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 1759 if (*bip) 1760 bi->bi_next = *bip; 1761 *bip = bi; 1762 bi->bi_phys_segments++; 1763 spin_unlock_irq(&conf->device_lock); 1764 spin_unlock(&sh->lock); 1765 1766 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 1767 (unsigned long long)bi->bi_sector, 1768 (unsigned long long)sh->sector, dd_idx); 1769 1770 if (conf->mddev->bitmap && firstwrite) { 1771 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 1772 STRIPE_SECTORS, 0); 1773 sh->bm_seq = conf->seq_flush+1; 1774 set_bit(STRIPE_BIT_DELAY, &sh->state); 1775 } 1776 1777 if (forwrite) { 1778 /* check if page is covered */ 1779 sector_t sector = sh->dev[dd_idx].sector; 1780 for (bi=sh->dev[dd_idx].towrite; 1781 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 1782 bi && bi->bi_sector <= sector; 1783 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 1784 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 1785 sector = bi->bi_sector + (bi->bi_size>>9); 1786 } 1787 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 1788 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 1789 } 1790 return 1; 1791 1792 overlap: 1793 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 1794 spin_unlock_irq(&conf->device_lock); 1795 spin_unlock(&sh->lock); 1796 return 0; 1797 } 1798 1799 static void end_reshape(raid5_conf_t *conf); 1800 1801 static int page_is_zero(struct page *p) 1802 { 1803 char *a = page_address(p); 1804 return ((*(u32*)a) == 0 && 1805 memcmp(a, a+4, STRIPE_SIZE-4)==0); 1806 } 1807 1808 static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1809 { 1810 int sectors_per_chunk = conf->chunk_size >> 9; 1811 int pd_idx, dd_idx; 1812 int chunk_offset = sector_div(stripe, sectors_per_chunk); 1813 1814 raid5_compute_sector(stripe * (disks - conf->max_degraded) 1815 *sectors_per_chunk + chunk_offset, 1816 disks, disks - conf->max_degraded, 1817 &dd_idx, &pd_idx, conf); 1818 return pd_idx; 1819 } 1820 1821 static void 1822 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 1823 struct stripe_head_state *s, int disks, 1824 struct bio **return_bi) 1825 { 1826 int i; 1827 for (i = disks; i--; ) { 1828 struct bio *bi; 1829 int bitmap_end = 0; 1830 1831 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1832 mdk_rdev_t *rdev; 1833 rcu_read_lock(); 1834 rdev = rcu_dereference(conf->disks[i].rdev); 1835 if (rdev && test_bit(In_sync, &rdev->flags)) 1836 /* multiple read failures in one stripe */ 1837 md_error(conf->mddev, rdev); 1838 rcu_read_unlock(); 1839 } 1840 spin_lock_irq(&conf->device_lock); 1841 /* fail all writes first */ 1842 bi = sh->dev[i].towrite; 1843 sh->dev[i].towrite = NULL; 1844 if (bi) { 1845 
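			/* A write was still queued for this device: account for it,
			 * fail its bios below, and note that bitmap_endwrite() must
			 * be called for this stripe once the device lock is dropped.
			 */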
s->to_write--; 1846 bitmap_end = 1; 1847 } 1848 1849 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1850 wake_up(&conf->wait_for_overlap); 1851 1852 while (bi && bi->bi_sector < 1853 sh->dev[i].sector + STRIPE_SECTORS) { 1854 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 1855 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1856 if (!raid5_dec_bi_phys_segments(bi)) { 1857 md_write_end(conf->mddev); 1858 bi->bi_next = *return_bi; 1859 *return_bi = bi; 1860 } 1861 bi = nextbi; 1862 } 1863 /* and fail all 'written' */ 1864 bi = sh->dev[i].written; 1865 sh->dev[i].written = NULL; 1866 if (bi) bitmap_end = 1; 1867 while (bi && bi->bi_sector < 1868 sh->dev[i].sector + STRIPE_SECTORS) { 1869 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 1870 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1871 if (!raid5_dec_bi_phys_segments(bi)) { 1872 md_write_end(conf->mddev); 1873 bi->bi_next = *return_bi; 1874 *return_bi = bi; 1875 } 1876 bi = bi2; 1877 } 1878 1879 /* fail any reads if this device is non-operational and 1880 * the data has not reached the cache yet. 1881 */ 1882 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 1883 (!test_bit(R5_Insync, &sh->dev[i].flags) || 1884 test_bit(R5_ReadError, &sh->dev[i].flags))) { 1885 bi = sh->dev[i].toread; 1886 sh->dev[i].toread = NULL; 1887 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1888 wake_up(&conf->wait_for_overlap); 1889 if (bi) s->to_read--; 1890 while (bi && bi->bi_sector < 1891 sh->dev[i].sector + STRIPE_SECTORS) { 1892 struct bio *nextbi = 1893 r5_next_bio(bi, sh->dev[i].sector); 1894 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1895 if (!raid5_dec_bi_phys_segments(bi)) { 1896 bi->bi_next = *return_bi; 1897 *return_bi = bi; 1898 } 1899 bi = nextbi; 1900 } 1901 } 1902 spin_unlock_irq(&conf->device_lock); 1903 if (bitmap_end) 1904 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 1905 STRIPE_SECTORS, 0, 0); 1906 } 1907 1908 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 1909 if (atomic_dec_and_test(&conf->pending_full_writes)) 1910 md_wakeup_thread(conf->mddev->thread); 1911 } 1912 1913 /* fetch_block5 - checks the given member device to see if its data needs 1914 * to be read or computed to satisfy a request. 1915 * 1916 * Returns 1 when no more member devices need to be checked, otherwise returns 1917 * 0 to tell the loop in handle_stripe_fill5 to continue 1918 */ 1919 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 1920 int disk_idx, int disks) 1921 { 1922 struct r5dev *dev = &sh->dev[disk_idx]; 1923 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 1924 1925 /* is the data in this block needed, and can we get it? 
*/ 1926 if (!test_bit(R5_LOCKED, &dev->flags) && 1927 !test_bit(R5_UPTODATE, &dev->flags) && 1928 (dev->toread || 1929 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1930 s->syncing || s->expanding || 1931 (s->failed && 1932 (failed_dev->toread || 1933 (failed_dev->towrite && 1934 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 1935 /* We would like to get this block, possibly by computing it, 1936 * otherwise read it if the backing disk is insync 1937 */ 1938 if ((s->uptodate == disks - 1) && 1939 (s->failed && disk_idx == s->failed_num)) { 1940 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 1941 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 1942 set_bit(R5_Wantcompute, &dev->flags); 1943 sh->ops.target = disk_idx; 1944 s->req_compute = 1; 1945 /* Careful: from this point on 'uptodate' is in the eye 1946 * of raid5_run_ops which services 'compute' operations 1947 * before writes. R5_Wantcompute flags a block that will 1948 * be R5_UPTODATE by the time it is needed for a 1949 * subsequent operation. 1950 */ 1951 s->uptodate++; 1952 return 1; /* uptodate + compute == disks */ 1953 } else if (test_bit(R5_Insync, &dev->flags)) { 1954 set_bit(R5_LOCKED, &dev->flags); 1955 set_bit(R5_Wantread, &dev->flags); 1956 s->locked++; 1957 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 1958 s->syncing); 1959 } 1960 } 1961 1962 return 0; 1963 } 1964 1965 /** 1966 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 1967 */ 1968 static void handle_stripe_fill5(struct stripe_head *sh, 1969 struct stripe_head_state *s, int disks) 1970 { 1971 int i; 1972 1973 /* look for blocks to read/compute, skip this if a compute 1974 * is already in flight, or if the stripe contents are in the 1975 * midst of changing due to a write 1976 */ 1977 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 1978 !sh->reconstruct_state) 1979 for (i = disks; i--; ) 1980 if (fetch_block5(sh, s, i, disks)) 1981 break; 1982 set_bit(STRIPE_HANDLE, &sh->state); 1983 } 1984 1985 static void handle_stripe_fill6(struct stripe_head *sh, 1986 struct stripe_head_state *s, struct r6_state *r6s, 1987 int disks) 1988 { 1989 int i; 1990 for (i = disks; i--; ) { 1991 struct r5dev *dev = &sh->dev[i]; 1992 if (!test_bit(R5_LOCKED, &dev->flags) && 1993 !test_bit(R5_UPTODATE, &dev->flags) && 1994 (dev->toread || (dev->towrite && 1995 !test_bit(R5_OVERWRITE, &dev->flags)) || 1996 s->syncing || s->expanding || 1997 (s->failed >= 1 && 1998 (sh->dev[r6s->failed_num[0]].toread || 1999 s->to_write)) || 2000 (s->failed >= 2 && 2001 (sh->dev[r6s->failed_num[1]].toread || 2002 s->to_write)))) { 2003 /* we would like to get this block, possibly 2004 * by computing it, but we might not be able to 2005 */ 2006 if ((s->uptodate == disks - 1) && 2007 (s->failed && (i == r6s->failed_num[0] || 2008 i == r6s->failed_num[1]))) { 2009 pr_debug("Computing stripe %llu block %d\n", 2010 (unsigned long long)sh->sector, i); 2011 compute_block_1(sh, i, 0); 2012 s->uptodate++; 2013 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2014 /* Computing 2-failure is *very* expensive; only 2015 * do it if failed >= 2 2016 */ 2017 int other; 2018 for (other = disks; other--; ) { 2019 if (other == i) 2020 continue; 2021 if (!test_bit(R5_UPTODATE, 2022 &sh->dev[other].flags)) 2023 break; 2024 } 2025 BUG_ON(other < 0); 2026 pr_debug("Computing stripe %llu blocks %d,%d\n", 2027 (unsigned long long)sh->sector, 2028 i, other); 2029 compute_block_2(sh, i, other); 2030 s->uptodate += 2; 2031 } else if (test_bit(R5_Insync, 
&dev->flags)) { 2032 set_bit(R5_LOCKED, &dev->flags); 2033 set_bit(R5_Wantread, &dev->flags); 2034 s->locked++; 2035 pr_debug("Reading block %d (sync=%d)\n", 2036 i, s->syncing); 2037 } 2038 } 2039 } 2040 set_bit(STRIPE_HANDLE, &sh->state); 2041 } 2042 2043 2044 /* handle_stripe_clean_event 2045 * any written block on an uptodate or failed drive can be returned. 2046 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2047 * never LOCKED, so we don't need to test 'failed' directly. 2048 */ 2049 static void handle_stripe_clean_event(raid5_conf_t *conf, 2050 struct stripe_head *sh, int disks, struct bio **return_bi) 2051 { 2052 int i; 2053 struct r5dev *dev; 2054 2055 for (i = disks; i--; ) 2056 if (sh->dev[i].written) { 2057 dev = &sh->dev[i]; 2058 if (!test_bit(R5_LOCKED, &dev->flags) && 2059 test_bit(R5_UPTODATE, &dev->flags)) { 2060 /* We can return any write requests */ 2061 struct bio *wbi, *wbi2; 2062 int bitmap_end = 0; 2063 pr_debug("Return write for disc %d\n", i); 2064 spin_lock_irq(&conf->device_lock); 2065 wbi = dev->written; 2066 dev->written = NULL; 2067 while (wbi && wbi->bi_sector < 2068 dev->sector + STRIPE_SECTORS) { 2069 wbi2 = r5_next_bio(wbi, dev->sector); 2070 if (!raid5_dec_bi_phys_segments(wbi)) { 2071 md_write_end(conf->mddev); 2072 wbi->bi_next = *return_bi; 2073 *return_bi = wbi; 2074 } 2075 wbi = wbi2; 2076 } 2077 if (dev->towrite == NULL) 2078 bitmap_end = 1; 2079 spin_unlock_irq(&conf->device_lock); 2080 if (bitmap_end) 2081 bitmap_endwrite(conf->mddev->bitmap, 2082 sh->sector, 2083 STRIPE_SECTORS, 2084 !test_bit(STRIPE_DEGRADED, &sh->state), 2085 0); 2086 } 2087 } 2088 2089 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2090 if (atomic_dec_and_test(&conf->pending_full_writes)) 2091 md_wakeup_thread(conf->mddev->thread); 2092 } 2093 2094 static void handle_stripe_dirtying5(raid5_conf_t *conf, 2095 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2096 { 2097 int rmw = 0, rcw = 0, i; 2098 for (i = disks; i--; ) { 2099 /* would I have to read this buffer for read_modify_write */ 2100 struct r5dev *dev = &sh->dev[i]; 2101 if ((dev->towrite || i == sh->pd_idx) && 2102 !test_bit(R5_LOCKED, &dev->flags) && 2103 !(test_bit(R5_UPTODATE, &dev->flags) || 2104 test_bit(R5_Wantcompute, &dev->flags))) { 2105 if (test_bit(R5_Insync, &dev->flags)) 2106 rmw++; 2107 else 2108 rmw += 2*disks; /* cannot read it */ 2109 } 2110 /* Would I have to read this buffer for reconstruct_write */ 2111 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2112 !test_bit(R5_LOCKED, &dev->flags) && 2113 !(test_bit(R5_UPTODATE, &dev->flags) || 2114 test_bit(R5_Wantcompute, &dev->flags))) { 2115 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2116 else 2117 rcw += 2*disks; 2118 } 2119 } 2120 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2121 (unsigned long long)sh->sector, rmw, rcw); 2122 set_bit(STRIPE_HANDLE, &sh->state); 2123 if (rmw < rcw && rmw > 0) 2124 /* prefer read-modify-write, but need to get some data */ 2125 for (i = disks; i--; ) { 2126 struct r5dev *dev = &sh->dev[i]; 2127 if ((dev->towrite || i == sh->pd_idx) && 2128 !test_bit(R5_LOCKED, &dev->flags) && 2129 !(test_bit(R5_UPTODATE, &dev->flags) || 2130 test_bit(R5_Wantcompute, &dev->flags)) && 2131 test_bit(R5_Insync, &dev->flags)) { 2132 if ( 2133 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2134 pr_debug("Read_old block " 2135 "%d for r-m-w\n", i); 2136 set_bit(R5_LOCKED, &dev->flags); 2137 set_bit(R5_Wantread, &dev->flags); 2138 s->locked++; 2139 } else { 2140 
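/* pre-reading is not allowed for this stripe yet: mark it DELAYED so
 * that __release_stripe parks it on conf->delayed_list until the
 * unplug path promotes it.
 */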
set_bit(STRIPE_DELAYED, &sh->state); 2141 set_bit(STRIPE_HANDLE, &sh->state); 2142 } 2143 } 2144 } 2145 if (rcw <= rmw && rcw > 0) 2146 /* want reconstruct write, but need to get some data */ 2147 for (i = disks; i--; ) { 2148 struct r5dev *dev = &sh->dev[i]; 2149 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2150 i != sh->pd_idx && 2151 !test_bit(R5_LOCKED, &dev->flags) && 2152 !(test_bit(R5_UPTODATE, &dev->flags) || 2153 test_bit(R5_Wantcompute, &dev->flags)) && 2154 test_bit(R5_Insync, &dev->flags)) { 2155 if ( 2156 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2157 pr_debug("Read_old block " 2158 "%d for Reconstruct\n", i); 2159 set_bit(R5_LOCKED, &dev->flags); 2160 set_bit(R5_Wantread, &dev->flags); 2161 s->locked++; 2162 } else { 2163 set_bit(STRIPE_DELAYED, &sh->state); 2164 set_bit(STRIPE_HANDLE, &sh->state); 2165 } 2166 } 2167 } 2168 /* now if nothing is locked, and if we have enough data, 2169 * we can start a write request 2170 */ 2171 /* since handle_stripe can be called at any time we need to handle the 2172 * case where a compute block operation has been submitted and then a 2173 * subsequent call wants to start a write request. raid5_run_ops only 2174 * handles the case where compute block and postxor are requested 2175 * simultaneously. If this is not the case then new writes need to be 2176 * held off until the compute completes. 2177 */ 2178 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2179 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2180 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2181 schedule_reconstruction5(sh, s, rcw == 0, 0); 2182 } 2183 2184 static void handle_stripe_dirtying6(raid5_conf_t *conf, 2185 struct stripe_head *sh, struct stripe_head_state *s, 2186 struct r6_state *r6s, int disks) 2187 { 2188 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2189 int qd_idx = r6s->qd_idx; 2190 for (i = disks; i--; ) { 2191 struct r5dev *dev = &sh->dev[i]; 2192 /* Would I have to read this buffer for reconstruct_write */ 2193 if (!test_bit(R5_OVERWRITE, &dev->flags) 2194 && i != pd_idx && i != qd_idx 2195 && (!test_bit(R5_LOCKED, &dev->flags) 2196 ) && 2197 !test_bit(R5_UPTODATE, &dev->flags)) { 2198 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2199 else { 2200 pr_debug("raid6: must_compute: " 2201 "disk %d flags=%#lx\n", i, dev->flags); 2202 must_compute++; 2203 } 2204 } 2205 } 2206 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", 2207 (unsigned long long)sh->sector, rcw, must_compute); 2208 set_bit(STRIPE_HANDLE, &sh->state); 2209 2210 if (rcw > 0) 2211 /* want reconstruct write, but need to get some data */ 2212 for (i = disks; i--; ) { 2213 struct r5dev *dev = &sh->dev[i]; 2214 if (!test_bit(R5_OVERWRITE, &dev->flags) 2215 && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) 2216 && !test_bit(R5_LOCKED, &dev->flags) && 2217 !test_bit(R5_UPTODATE, &dev->flags) && 2218 test_bit(R5_Insync, &dev->flags)) { 2219 if ( 2220 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2221 pr_debug("Read_old stripe %llu " 2222 "block %d for Reconstruct\n", 2223 (unsigned long long)sh->sector, i); 2224 set_bit(R5_LOCKED, &dev->flags); 2225 set_bit(R5_Wantread, &dev->flags); 2226 s->locked++; 2227 } else { 2228 pr_debug("Request delayed stripe %llu " 2229 "block %d for Reconstruct\n", 2230 (unsigned long long)sh->sector, i); 2231 set_bit(STRIPE_DELAYED, &sh->state); 2232 set_bit(STRIPE_HANDLE, &sh->state); 2233 } 2234 } 2235 } 2236 /* now if nothing is locked, and if we have enough data, we can start a 2237 * write request 2238 */ 2239 if (s->locked == 0 && 
rcw == 0 && 2240 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2241 if (must_compute > 0) { 2242 /* We have failed blocks and need to compute them */ 2243 switch (s->failed) { 2244 case 0: 2245 BUG(); 2246 case 1: 2247 compute_block_1(sh, r6s->failed_num[0], 0); 2248 break; 2249 case 2: 2250 compute_block_2(sh, r6s->failed_num[0], 2251 r6s->failed_num[1]); 2252 break; 2253 default: /* This request should have been failed? */ 2254 BUG(); 2255 } 2256 } 2257 2258 pr_debug("Computing parity for stripe %llu\n", 2259 (unsigned long long)sh->sector); 2260 compute_parity6(sh, RECONSTRUCT_WRITE); 2261 /* now every locked buffer is ready to be written */ 2262 for (i = disks; i--; ) 2263 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 2264 pr_debug("Writing stripe %llu block %d\n", 2265 (unsigned long long)sh->sector, i); 2266 s->locked++; 2267 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2268 } 2269 if (s->locked == disks) 2270 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2271 atomic_inc(&conf->pending_full_writes); 2272 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ 2273 set_bit(STRIPE_INSYNC, &sh->state); 2274 2275 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2276 atomic_dec(&conf->preread_active_stripes); 2277 if (atomic_read(&conf->preread_active_stripes) < 2278 IO_THRESHOLD) 2279 md_wakeup_thread(conf->mddev->thread); 2280 } 2281 } 2282 } 2283 2284 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2285 struct stripe_head_state *s, int disks) 2286 { 2287 struct r5dev *dev = NULL; 2288 2289 set_bit(STRIPE_HANDLE, &sh->state); 2290 2291 switch (sh->check_state) { 2292 case check_state_idle: 2293 /* start a new check operation if there are no failures */ 2294 if (s->failed == 0) { 2295 BUG_ON(s->uptodate != disks); 2296 sh->check_state = check_state_run; 2297 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2298 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2299 s->uptodate--; 2300 break; 2301 } 2302 dev = &sh->dev[s->failed_num]; 2303 /* fall through */ 2304 case check_state_compute_result: 2305 sh->check_state = check_state_idle; 2306 if (!dev) 2307 dev = &sh->dev[sh->pd_idx]; 2308 2309 /* check that a write has not made the stripe insync */ 2310 if (test_bit(STRIPE_INSYNC, &sh->state)) 2311 break; 2312 2313 /* either failed parity check, or recovery is happening */ 2314 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2315 BUG_ON(s->uptodate != disks); 2316 2317 set_bit(R5_LOCKED, &dev->flags); 2318 s->locked++; 2319 set_bit(R5_Wantwrite, &dev->flags); 2320 2321 clear_bit(STRIPE_DEGRADED, &sh->state); 2322 set_bit(STRIPE_INSYNC, &sh->state); 2323 break; 2324 case check_state_run: 2325 break; /* we will be called again upon completion */ 2326 case check_state_check_result: 2327 sh->check_state = check_state_idle; 2328 2329 /* if a failure occurred during the check operation, leave 2330 * STRIPE_INSYNC not set and let the stripe be handled again 2331 */ 2332 if (s->failed) 2333 break; 2334 2335 /* handle a successful check operation, if parity is correct 2336 * we are done. Otherwise update the mismatch count and repair 2337 * parity if !MD_RECOVERY_CHECK 2338 */ 2339 if (sh->ops.zero_sum_result == 0) 2340 /* parity is correct (on disc, 2341 * not in buffer any more) 2342 */ 2343 set_bit(STRIPE_INSYNC, &sh->state); 2344 else { 2345 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2346 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2347 /* don't try to repair!! 
*/ 2348 set_bit(STRIPE_INSYNC, &sh->state); 2349 else { 2350 sh->check_state = check_state_compute_run; 2351 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2352 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2353 set_bit(R5_Wantcompute, 2354 &sh->dev[sh->pd_idx].flags); 2355 sh->ops.target = sh->pd_idx; 2356 s->uptodate++; 2357 } 2358 } 2359 break; 2360 case check_state_compute_run: 2361 break; 2362 default: 2363 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2364 __func__, sh->check_state, 2365 (unsigned long long) sh->sector); 2366 BUG(); 2367 } 2368 } 2369 2370 2371 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2372 struct stripe_head_state *s, 2373 struct r6_state *r6s, struct page *tmp_page, 2374 int disks) 2375 { 2376 int update_p = 0, update_q = 0; 2377 struct r5dev *dev; 2378 int pd_idx = sh->pd_idx; 2379 int qd_idx = r6s->qd_idx; 2380 2381 set_bit(STRIPE_HANDLE, &sh->state); 2382 2383 BUG_ON(s->failed > 2); 2384 BUG_ON(s->uptodate < disks); 2385 /* Want to check and possibly repair P and Q. 2386 * However there could be one 'failed' device, in which 2387 * case we can only check one of them, possibly using the 2388 * other to generate missing data 2389 */ 2390 2391 /* If !tmp_page, we cannot do the calculations, 2392 * but as we have set STRIPE_HANDLE, we will soon be called 2393 * by stripe_handle with a tmp_page - just wait until then. 2394 */ 2395 if (tmp_page) { 2396 if (s->failed == r6s->q_failed) { 2397 /* The only possible failed device holds 'Q', so it 2398 * makes sense to check P (If anything else were failed, 2399 * we would have used P to recreate it). 2400 */ 2401 compute_block_1(sh, pd_idx, 1); 2402 if (!page_is_zero(sh->dev[pd_idx].page)) { 2403 compute_block_1(sh, pd_idx, 0); 2404 update_p = 1; 2405 } 2406 } 2407 if (!r6s->q_failed && s->failed < 2) { 2408 /* q is not failed, and we didn't use it to generate 2409 * anything, so it makes sense to check it 2410 */ 2411 memcpy(page_address(tmp_page), 2412 page_address(sh->dev[qd_idx].page), 2413 STRIPE_SIZE); 2414 compute_parity6(sh, UPDATE_PARITY); 2415 if (memcmp(page_address(tmp_page), 2416 page_address(sh->dev[qd_idx].page), 2417 STRIPE_SIZE) != 0) { 2418 clear_bit(STRIPE_INSYNC, &sh->state); 2419 update_q = 1; 2420 } 2421 } 2422 if (update_p || update_q) { 2423 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2424 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2425 /* don't try to repair!! 
*/ 2426 update_p = update_q = 0; 2427 } 2428 2429 /* now write out any block on a failed drive, 2430 * or P or Q if they need it 2431 */ 2432 2433 if (s->failed == 2) { 2434 dev = &sh->dev[r6s->failed_num[1]]; 2435 s->locked++; 2436 set_bit(R5_LOCKED, &dev->flags); 2437 set_bit(R5_Wantwrite, &dev->flags); 2438 } 2439 if (s->failed >= 1) { 2440 dev = &sh->dev[r6s->failed_num[0]]; 2441 s->locked++; 2442 set_bit(R5_LOCKED, &dev->flags); 2443 set_bit(R5_Wantwrite, &dev->flags); 2444 } 2445 2446 if (update_p) { 2447 dev = &sh->dev[pd_idx]; 2448 s->locked++; 2449 set_bit(R5_LOCKED, &dev->flags); 2450 set_bit(R5_Wantwrite, &dev->flags); 2451 } 2452 if (update_q) { 2453 dev = &sh->dev[qd_idx]; 2454 s->locked++; 2455 set_bit(R5_LOCKED, &dev->flags); 2456 set_bit(R5_Wantwrite, &dev->flags); 2457 } 2458 clear_bit(STRIPE_DEGRADED, &sh->state); 2459 2460 set_bit(STRIPE_INSYNC, &sh->state); 2461 } 2462 } 2463 2464 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2465 struct r6_state *r6s) 2466 { 2467 int i; 2468 2469 /* We have read all the blocks in this stripe and now we need to 2470 * copy some of them into a target stripe for expand. 2471 */ 2472 struct dma_async_tx_descriptor *tx = NULL; 2473 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2474 for (i = 0; i < sh->disks; i++) 2475 if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { 2476 int dd_idx, pd_idx, j; 2477 struct stripe_head *sh2; 2478 2479 sector_t bn = compute_blocknr(sh, i); 2480 sector_t s = raid5_compute_sector(bn, conf->raid_disks, 2481 conf->raid_disks - 2482 conf->max_degraded, &dd_idx, 2483 &pd_idx, conf); 2484 sh2 = get_active_stripe(conf, s, conf->raid_disks, 2485 pd_idx, 1); 2486 if (sh2 == NULL) 2487 /* so far only the early blocks of this stripe 2488 * have been requested. When later blocks 2489 * get requested, we will try again 2490 */ 2491 continue; 2492 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2493 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2494 /* must have already done this block */ 2495 release_stripe(sh2); 2496 continue; 2497 } 2498 2499 /* place all the copies on one channel */ 2500 tx = async_memcpy(sh2->dev[dd_idx].page, 2501 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2502 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2503 2504 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2505 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2506 for (j = 0; j < conf->raid_disks; j++) 2507 if (j != sh2->pd_idx && 2508 (!r6s || j != raid6_next_disk(sh2->pd_idx, 2509 sh2->disks)) && 2510 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2511 break; 2512 if (j == conf->raid_disks) { 2513 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2514 set_bit(STRIPE_HANDLE, &sh2->state); 2515 } 2516 release_stripe(sh2); 2517 2518 } 2519 /* done submitting copies, wait for them to complete */ 2520 if (tx) { 2521 async_tx_ack(tx); 2522 dma_wait_for_async_tx(tx); 2523 } 2524 } 2525 2526 2527 /* 2528 * handle_stripe - do things to a stripe. 2529 * 2530 * We lock the stripe and then examine the state of various bits 2531 * to see what needs to be done. 2532 * Possible results: 2533 * return some read request which now have data 2534 * return some write requests which are safely on disc 2535 * schedule a read on some buffers 2536 * schedule a write of some buffers 2537 * return confirmation of parity correctness 2538 * 2539 * buffers are taken off read_list or write_list, and bh_cache buffers 2540 * get BH_Lock set before the stripe lock is released. 
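 *
 * handle_stripe() itself only dispatches: level-6 arrays are handled by
 * handle_stripe6() and everything else by handle_stripe5().  Each call
 * summarises per-device state into a struct stripe_head_state, runs
 * whichever of the helpers above apply (handle_failed_stripe,
 * handle_stripe_fill*, handle_stripe_dirtying*, handle_parity_checks*),
 * and finally submits any scheduled work via ops_run_io() (plus
 * raid5_run_ops() on the raid5 path) before completed bios are handed
 * back with return_io().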
2541 * 2542 */ 2543 2544 static bool handle_stripe5(struct stripe_head *sh) 2545 { 2546 raid5_conf_t *conf = sh->raid_conf; 2547 int disks = sh->disks, i; 2548 struct bio *return_bi = NULL; 2549 struct stripe_head_state s; 2550 struct r5dev *dev; 2551 mdk_rdev_t *blocked_rdev = NULL; 2552 int prexor; 2553 2554 memset(&s, 0, sizeof(s)); 2555 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 2556 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 2557 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 2558 sh->reconstruct_state); 2559 2560 spin_lock(&sh->lock); 2561 clear_bit(STRIPE_HANDLE, &sh->state); 2562 clear_bit(STRIPE_DELAYED, &sh->state); 2563 2564 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2565 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2566 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2567 2568 /* Now to look around and see what can be done */ 2569 rcu_read_lock(); 2570 for (i=disks; i--; ) { 2571 mdk_rdev_t *rdev; 2572 struct r5dev *dev = &sh->dev[i]; 2573 clear_bit(R5_Insync, &dev->flags); 2574 2575 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 2576 "written %p\n", i, dev->flags, dev->toread, dev->read, 2577 dev->towrite, dev->written); 2578 2579 /* maybe we can request a biofill operation 2580 * 2581 * new wantfill requests are only permitted while 2582 * ops_complete_biofill is guaranteed to be inactive 2583 */ 2584 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2585 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 2586 set_bit(R5_Wantfill, &dev->flags); 2587 2588 /* now count some things */ 2589 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 2590 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 2591 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 2592 2593 if (test_bit(R5_Wantfill, &dev->flags)) 2594 s.to_fill++; 2595 else if (dev->toread) 2596 s.to_read++; 2597 if (dev->towrite) { 2598 s.to_write++; 2599 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2600 s.non_overwrite++; 2601 } 2602 if (dev->written) 2603 s.written++; 2604 rdev = rcu_dereference(conf->disks[i].rdev); 2605 if (blocked_rdev == NULL && 2606 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 2607 blocked_rdev = rdev; 2608 atomic_inc(&rdev->nr_pending); 2609 } 2610 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2611 /* The ReadError flag will just be confusing now */ 2612 clear_bit(R5_ReadError, &dev->flags); 2613 clear_bit(R5_ReWrite, &dev->flags); 2614 } 2615 if (!rdev || !test_bit(In_sync, &rdev->flags) 2616 || test_bit(R5_ReadError, &dev->flags)) { 2617 s.failed++; 2618 s.failed_num = i; 2619 } else 2620 set_bit(R5_Insync, &dev->flags); 2621 } 2622 rcu_read_unlock(); 2623 2624 if (unlikely(blocked_rdev)) { 2625 if (s.syncing || s.expanding || s.expanded || 2626 s.to_write || s.written) { 2627 set_bit(STRIPE_HANDLE, &sh->state); 2628 goto unlock; 2629 } 2630 /* There is nothing for the blocked_rdev to block */ 2631 rdev_dec_pending(blocked_rdev, conf->mddev); 2632 blocked_rdev = NULL; 2633 } 2634 2635 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 2636 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 2637 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 2638 } 2639 2640 pr_debug("locked=%d uptodate=%d to_read=%d" 2641 " to_write=%d failed=%d failed_num=%d\n", 2642 s.locked, s.uptodate, s.to_read, s.to_write, 2643 s.failed, s.failed_num); 2644 /* check if the array has lost two devices and, if so, some requests might 2645 * need to be failed 2646 */ 2647 if (s.failed > 1 && s.to_read+s.to_write+s.written) 
2648 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 2649 if (s.failed > 1 && s.syncing) { 2650 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2651 clear_bit(STRIPE_SYNCING, &sh->state); 2652 s.syncing = 0; 2653 } 2654 2655 /* might be able to return some write requests if the parity block 2656 * is safe, or on a failed drive 2657 */ 2658 dev = &sh->dev[sh->pd_idx]; 2659 if ( s.written && 2660 ((test_bit(R5_Insync, &dev->flags) && 2661 !test_bit(R5_LOCKED, &dev->flags) && 2662 test_bit(R5_UPTODATE, &dev->flags)) || 2663 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2664 handle_stripe_clean_event(conf, sh, disks, &return_bi); 2665 2666 /* Now we might consider reading some blocks, either to check/generate 2667 * parity, or to satisfy requests 2668 * or to load a block that is being partially written. 2669 */ 2670 if (s.to_read || s.non_overwrite || 2671 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 2672 handle_stripe_fill5(sh, &s, disks); 2673 2674 /* Now we check to see if any write operations have recently 2675 * completed 2676 */ 2677 prexor = 0; 2678 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 2679 prexor = 1; 2680 if (sh->reconstruct_state == reconstruct_state_drain_result || 2681 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 2682 sh->reconstruct_state = reconstruct_state_idle; 2683 2684 /* All the 'written' buffers and the parity block are ready to 2685 * be written back to disk 2686 */ 2687 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 2688 for (i = disks; i--; ) { 2689 dev = &sh->dev[i]; 2690 if (test_bit(R5_LOCKED, &dev->flags) && 2691 (i == sh->pd_idx || dev->written)) { 2692 pr_debug("Writing block %d\n", i); 2693 set_bit(R5_Wantwrite, &dev->flags); 2694 if (prexor) 2695 continue; 2696 if (!test_bit(R5_Insync, &dev->flags) || 2697 (i == sh->pd_idx && s.failed == 0)) 2698 set_bit(STRIPE_INSYNC, &sh->state); 2699 } 2700 } 2701 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2702 atomic_dec(&conf->preread_active_stripes); 2703 if (atomic_read(&conf->preread_active_stripes) < 2704 IO_THRESHOLD) 2705 md_wakeup_thread(conf->mddev->thread); 2706 } 2707 } 2708 2709 /* Now to consider new write requests and what else, if anything 2710 * should be read. We do not handle new writes when: 2711 * 1/ A 'write' operation (copy+xor) is already in flight. 2712 * 2/ A 'check' operation is in flight, as it may clobber the parity 2713 * block. 2714 */ 2715 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 2716 handle_stripe_dirtying5(conf, sh, &s, disks); 2717 2718 /* maybe we need to check and possibly fix the parity for this stripe 2719 * Any reads will already have been scheduled, so we just see if enough 2720 * data is available. The parity check is held off while parity 2721 * dependent operations are in flight. 
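 * (A fresh check is only started while syncing, with nothing locked,
 * no compute operation running and STRIPE_INSYNC not yet set - see the
 * condition below.)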
2722 */ 2723 if (sh->check_state || 2724 (s.syncing && s.locked == 0 && 2725 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 2726 !test_bit(STRIPE_INSYNC, &sh->state))) 2727 handle_parity_checks5(conf, sh, &s, disks); 2728 2729 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2730 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2731 clear_bit(STRIPE_SYNCING, &sh->state); 2732 } 2733 2734 /* If the failed drive is just a ReadError, then we might need to progress 2735 * the repair/check process 2736 */ 2737 if (s.failed == 1 && !conf->mddev->ro && 2738 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 2739 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 2740 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 2741 ) { 2742 dev = &sh->dev[s.failed_num]; 2743 if (!test_bit(R5_ReWrite, &dev->flags)) { 2744 set_bit(R5_Wantwrite, &dev->flags); 2745 set_bit(R5_ReWrite, &dev->flags); 2746 set_bit(R5_LOCKED, &dev->flags); 2747 s.locked++; 2748 } else { 2749 /* let's read it back */ 2750 set_bit(R5_Wantread, &dev->flags); 2751 set_bit(R5_LOCKED, &dev->flags); 2752 s.locked++; 2753 } 2754 } 2755 2756 /* Finish reconstruct operations initiated by the expansion process */ 2757 if (sh->reconstruct_state == reconstruct_state_result) { 2758 sh->reconstruct_state = reconstruct_state_idle; 2759 clear_bit(STRIPE_EXPANDING, &sh->state); 2760 for (i = conf->raid_disks; i--; ) { 2761 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2762 set_bit(R5_LOCKED, &sh->dev[i].flags); 2763 s.locked++; 2764 } 2765 } 2766 2767 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2768 !sh->reconstruct_state) { 2769 /* Need to write out all blocks after computing parity */ 2770 sh->disks = conf->raid_disks; 2771 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2772 conf->raid_disks); 2773 schedule_reconstruction5(sh, &s, 1, 1); 2774 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 2775 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2776 atomic_dec(&conf->reshape_stripes); 2777 wake_up(&conf->wait_for_overlap); 2778 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2779 } 2780 2781 if (s.expanding && s.locked == 0 && 2782 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 2783 handle_stripe_expansion(conf, sh, NULL); 2784 2785 unlock: 2786 spin_unlock(&sh->lock); 2787 2788 /* wait for this device to become unblocked */ 2789 if (unlikely(blocked_rdev)) 2790 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2791 2792 if (s.ops_request) 2793 raid5_run_ops(sh, s.ops_request); 2794 2795 ops_run_io(sh, &s); 2796 2797 return_io(return_bi); 2798 2799 return blocked_rdev == NULL; 2800 } 2801 2802 static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2803 { 2804 raid6_conf_t *conf = sh->raid_conf; 2805 int disks = sh->disks; 2806 struct bio *return_bi = NULL; 2807 int i, pd_idx = sh->pd_idx; 2808 struct stripe_head_state s; 2809 struct r6_state r6s; 2810 struct r5dev *dev, *pdev, *qdev; 2811 mdk_rdev_t *blocked_rdev = NULL; 2812 2813 r6s.qd_idx = raid6_next_disk(pd_idx, disks); 2814 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 2815 "pd_idx=%d, qd_idx=%d\n", 2816 (unsigned long long)sh->sector, sh->state, 2817 atomic_read(&sh->count), pd_idx, r6s.qd_idx); 2818 memset(&s, 0, sizeof(s)); 2819 2820 spin_lock(&sh->lock); 2821 clear_bit(STRIPE_HANDLE, &sh->state); 2822 clear_bit(STRIPE_DELAYED, &sh->state); 2823 2824 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2825 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2826 s.expanded = test_bit(STRIPE_EXPAND_READY, 
&sh->state); 2827 /* Now to look around and see what can be done */ 2828 2829 rcu_read_lock(); 2830 for (i=disks; i--; ) { 2831 mdk_rdev_t *rdev; 2832 dev = &sh->dev[i]; 2833 clear_bit(R5_Insync, &dev->flags); 2834 2835 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 2836 i, dev->flags, dev->toread, dev->towrite, dev->written); 2837 /* maybe we can reply to a read */ 2838 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 2839 struct bio *rbi, *rbi2; 2840 pr_debug("Return read for disc %d\n", i); 2841 spin_lock_irq(&conf->device_lock); 2842 rbi = dev->toread; 2843 dev->toread = NULL; 2844 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2845 wake_up(&conf->wait_for_overlap); 2846 spin_unlock_irq(&conf->device_lock); 2847 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { 2848 copy_data(0, rbi, dev->page, dev->sector); 2849 rbi2 = r5_next_bio(rbi, dev->sector); 2850 spin_lock_irq(&conf->device_lock); 2851 if (!raid5_dec_bi_phys_segments(rbi)) { 2852 rbi->bi_next = return_bi; 2853 return_bi = rbi; 2854 } 2855 spin_unlock_irq(&conf->device_lock); 2856 rbi = rbi2; 2857 } 2858 } 2859 2860 /* now count some things */ 2861 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 2862 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 2863 2864 2865 if (dev->toread) 2866 s.to_read++; 2867 if (dev->towrite) { 2868 s.to_write++; 2869 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2870 s.non_overwrite++; 2871 } 2872 if (dev->written) 2873 s.written++; 2874 rdev = rcu_dereference(conf->disks[i].rdev); 2875 if (blocked_rdev == NULL && 2876 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 2877 blocked_rdev = rdev; 2878 atomic_inc(&rdev->nr_pending); 2879 } 2880 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2881 /* The ReadError flag will just be confusing now */ 2882 clear_bit(R5_ReadError, &dev->flags); 2883 clear_bit(R5_ReWrite, &dev->flags); 2884 } 2885 if (!rdev || !test_bit(In_sync, &rdev->flags) 2886 || test_bit(R5_ReadError, &dev->flags)) { 2887 if (s.failed < 2) 2888 r6s.failed_num[s.failed] = i; 2889 s.failed++; 2890 } else 2891 set_bit(R5_Insync, &dev->flags); 2892 } 2893 rcu_read_unlock(); 2894 2895 if (unlikely(blocked_rdev)) { 2896 if (s.syncing || s.expanding || s.expanded || 2897 s.to_write || s.written) { 2898 set_bit(STRIPE_HANDLE, &sh->state); 2899 goto unlock; 2900 } 2901 /* There is nothing for the blocked_rdev to block */ 2902 rdev_dec_pending(blocked_rdev, conf->mddev); 2903 blocked_rdev = NULL; 2904 } 2905 2906 pr_debug("locked=%d uptodate=%d to_read=%d" 2907 " to_write=%d failed=%d failed_num=%d,%d\n", 2908 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 2909 r6s.failed_num[0], r6s.failed_num[1]); 2910 /* check if the array has lost >2 devices and, if so, some requests 2911 * might need to be failed 2912 */ 2913 if (s.failed > 2 && s.to_read+s.to_write+s.written) 2914 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 2915 if (s.failed > 2 && s.syncing) { 2916 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2917 clear_bit(STRIPE_SYNCING, &sh->state); 2918 s.syncing = 0; 2919 } 2920 2921 /* 2922 * might be able to return some write requests if the parity blocks 2923 * are safe, or on a failed drive 2924 */ 2925 pdev = &sh->dev[pd_idx]; 2926 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 2927 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 2928 qdev = &sh->dev[r6s.qd_idx]; 2929 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) 2930 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); 2931 2932 if ( 
s.written && 2933 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 2934 && !test_bit(R5_LOCKED, &pdev->flags) 2935 && test_bit(R5_UPTODATE, &pdev->flags)))) && 2936 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 2937 && !test_bit(R5_LOCKED, &qdev->flags) 2938 && test_bit(R5_UPTODATE, &qdev->flags))))) 2939 handle_stripe_clean_event(conf, sh, disks, &return_bi); 2940 2941 /* Now we might consider reading some blocks, either to check/generate 2942 * parity, or to satisfy requests 2943 * or to load a block that is being partially written. 2944 */ 2945 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 2946 (s.syncing && (s.uptodate < disks)) || s.expanding) 2947 handle_stripe_fill6(sh, &s, &r6s, disks); 2948 2949 /* now to consider writing and what else, if anything should be read */ 2950 if (s.to_write) 2951 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 2952 2953 /* maybe we need to check and possibly fix the parity for this stripe 2954 * Any reads will already have been scheduled, so we just see if enough 2955 * data is available 2956 */ 2957 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 2958 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 2959 2960 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2961 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2962 clear_bit(STRIPE_SYNCING, &sh->state); 2963 } 2964 2965 /* If the failed drives are just a ReadError, then we might need 2966 * to progress the repair/check process 2967 */ 2968 if (s.failed <= 2 && !conf->mddev->ro) 2969 for (i = 0; i < s.failed; i++) { 2970 dev = &sh->dev[r6s.failed_num[i]]; 2971 if (test_bit(R5_ReadError, &dev->flags) 2972 && !test_bit(R5_LOCKED, &dev->flags) 2973 && test_bit(R5_UPTODATE, &dev->flags) 2974 ) { 2975 if (!test_bit(R5_ReWrite, &dev->flags)) { 2976 set_bit(R5_Wantwrite, &dev->flags); 2977 set_bit(R5_ReWrite, &dev->flags); 2978 set_bit(R5_LOCKED, &dev->flags); 2979 } else { 2980 /* let's read it back */ 2981 set_bit(R5_Wantread, &dev->flags); 2982 set_bit(R5_LOCKED, &dev->flags); 2983 } 2984 } 2985 } 2986 2987 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 2988 /* Need to write out all blocks after computing P&Q */ 2989 sh->disks = conf->raid_disks; 2990 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2991 conf->raid_disks); 2992 compute_parity6(sh, RECONSTRUCT_WRITE); 2993 for (i = conf->raid_disks ; i-- ; ) { 2994 set_bit(R5_LOCKED, &sh->dev[i].flags); 2995 s.locked++; 2996 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2997 } 2998 clear_bit(STRIPE_EXPANDING, &sh->state); 2999 } else if (s.expanded) { 3000 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3001 atomic_dec(&conf->reshape_stripes); 3002 wake_up(&conf->wait_for_overlap); 3003 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3004 } 3005 3006 if (s.expanding && s.locked == 0 && 3007 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3008 handle_stripe_expansion(conf, sh, &r6s); 3009 3010 unlock: 3011 spin_unlock(&sh->lock); 3012 3013 /* wait for this device to become unblocked */ 3014 if (unlikely(blocked_rdev)) 3015 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3016 3017 ops_run_io(sh, &s); 3018 3019 return_io(return_bi); 3020 3021 return blocked_rdev == NULL; 3022 } 3023 3024 /* returns true if the stripe was handled */ 3025 static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3026 { 3027 if (sh->raid_conf->level == 6) 3028 return handle_stripe6(sh, tmp_page); 3029 else 3030 return handle_stripe5(sh); 3031 } 3032 3033 3034 3035 static void 
raid5_activate_delayed(raid5_conf_t *conf) 3036 { 3037 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3038 while (!list_empty(&conf->delayed_list)) { 3039 struct list_head *l = conf->delayed_list.next; 3040 struct stripe_head *sh; 3041 sh = list_entry(l, struct stripe_head, lru); 3042 list_del_init(l); 3043 clear_bit(STRIPE_DELAYED, &sh->state); 3044 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3045 atomic_inc(&conf->preread_active_stripes); 3046 list_add_tail(&sh->lru, &conf->hold_list); 3047 } 3048 } else 3049 blk_plug_device(conf->mddev->queue); 3050 } 3051 3052 static void activate_bit_delay(raid5_conf_t *conf) 3053 { 3054 /* device_lock is held */ 3055 struct list_head head; 3056 list_add(&head, &conf->bitmap_list); 3057 list_del_init(&conf->bitmap_list); 3058 while (!list_empty(&head)) { 3059 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3060 list_del_init(&sh->lru); 3061 atomic_inc(&sh->count); 3062 __release_stripe(conf, sh); 3063 } 3064 } 3065 3066 static void unplug_slaves(mddev_t *mddev) 3067 { 3068 raid5_conf_t *conf = mddev_to_conf(mddev); 3069 int i; 3070 3071 rcu_read_lock(); 3072 for (i=0; i<mddev->raid_disks; i++) { 3073 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3074 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3075 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3076 3077 atomic_inc(&rdev->nr_pending); 3078 rcu_read_unlock(); 3079 3080 blk_unplug(r_queue); 3081 3082 rdev_dec_pending(rdev, mddev); 3083 rcu_read_lock(); 3084 } 3085 } 3086 rcu_read_unlock(); 3087 } 3088 3089 static void raid5_unplug_device(struct request_queue *q) 3090 { 3091 mddev_t *mddev = q->queuedata; 3092 raid5_conf_t *conf = mddev_to_conf(mddev); 3093 unsigned long flags; 3094 3095 spin_lock_irqsave(&conf->device_lock, flags); 3096 3097 if (blk_remove_plug(q)) { 3098 conf->seq_flush++; 3099 raid5_activate_delayed(conf); 3100 } 3101 md_wakeup_thread(mddev->thread); 3102 3103 spin_unlock_irqrestore(&conf->device_lock, flags); 3104 3105 unplug_slaves(mddev); 3106 } 3107 3108 static int raid5_congested(void *data, int bits) 3109 { 3110 mddev_t *mddev = data; 3111 raid5_conf_t *conf = mddev_to_conf(mddev); 3112 3113 /* No difference between reads and writes. Just check 3114 * how busy the stripe_cache is 3115 */ 3116 if (conf->inactive_blocked) 3117 return 1; 3118 if (conf->quiesce) 3119 return 1; 3120 if (list_empty_careful(&conf->inactive_list)) 3121 return 1; 3122 3123 return 0; 3124 } 3125 3126 /* We want read requests to align with chunks where possible, 3127 * but write requests don't need to. 
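 *
 * For example, with 64KiB chunks (chunk_sectors = 128) a read whose
 * current end sits 100 sectors into a chunk is only offered the
 * remaining 28 sectors (14KiB) of merge room; an empty bio is always
 * allowed at least one bio_vec so that it can make progress even if
 * that vector crosses the chunk boundary.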
3128 */ 3129 static int raid5_mergeable_bvec(struct request_queue *q, 3130 struct bvec_merge_data *bvm, 3131 struct bio_vec *biovec) 3132 { 3133 mddev_t *mddev = q->queuedata; 3134 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3135 int max; 3136 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3137 unsigned int bio_sectors = bvm->bi_size >> 9; 3138 3139 if ((bvm->bi_rw & 1) == WRITE) 3140 return biovec->bv_len; /* always allow writes to be mergeable */ 3141 3142 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3143 if (max < 0) max = 0; 3144 if (max <= biovec->bv_len && bio_sectors == 0) 3145 return biovec->bv_len; 3146 else 3147 return max; 3148 } 3149 3150 3151 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3152 { 3153 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3154 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3155 unsigned int bio_sectors = bio->bi_size >> 9; 3156 3157 return chunk_sectors >= 3158 ((sector & (chunk_sectors - 1)) + bio_sectors); 3159 } 3160 3161 /* 3162 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3163 * later sampled by raid5d. 3164 */ 3165 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3166 { 3167 unsigned long flags; 3168 3169 spin_lock_irqsave(&conf->device_lock, flags); 3170 3171 bi->bi_next = conf->retry_read_aligned_list; 3172 conf->retry_read_aligned_list = bi; 3173 3174 spin_unlock_irqrestore(&conf->device_lock, flags); 3175 md_wakeup_thread(conf->mddev->thread); 3176 } 3177 3178 3179 static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3180 { 3181 struct bio *bi; 3182 3183 bi = conf->retry_read_aligned; 3184 if (bi) { 3185 conf->retry_read_aligned = NULL; 3186 return bi; 3187 } 3188 bi = conf->retry_read_aligned_list; 3189 if(bi) { 3190 conf->retry_read_aligned_list = bi->bi_next; 3191 bi->bi_next = NULL; 3192 /* 3193 * this sets the active strip count to 1 and the processed 3194 * strip count to zero (upper 8 bits) 3195 */ 3196 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3197 } 3198 3199 return bi; 3200 } 3201 3202 3203 /* 3204 * The "raid5_align_endio" should check if the read succeeded and if it 3205 * did, call bio_endio on the original bio (having bio_put the new bio 3206 * first). 3207 * If the read failed.. 
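 * the clone is still put and the rdev reference dropped as usual, and
 * the original bio is handed to add_bio_to_retry() so that raid5d can
 * resubmit it through the normal stripe-cache path (retry_aligned_read).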
3208 */ 3209 static void raid5_align_endio(struct bio *bi, int error) 3210 { 3211 struct bio* raid_bi = bi->bi_private; 3212 mddev_t *mddev; 3213 raid5_conf_t *conf; 3214 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3215 mdk_rdev_t *rdev; 3216 3217 bio_put(bi); 3218 3219 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3220 conf = mddev_to_conf(mddev); 3221 rdev = (void*)raid_bi->bi_next; 3222 raid_bi->bi_next = NULL; 3223 3224 rdev_dec_pending(rdev, conf->mddev); 3225 3226 if (!error && uptodate) { 3227 bio_endio(raid_bi, 0); 3228 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3229 wake_up(&conf->wait_for_stripe); 3230 return; 3231 } 3232 3233 3234 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3235 3236 add_bio_to_retry(raid_bi, conf); 3237 } 3238 3239 static int bio_fits_rdev(struct bio *bi) 3240 { 3241 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3242 3243 if ((bi->bi_size>>9) > q->max_sectors) 3244 return 0; 3245 blk_recount_segments(q, bi); 3246 if (bi->bi_phys_segments > q->max_phys_segments) 3247 return 0; 3248 3249 if (q->merge_bvec_fn) 3250 /* it's too hard to apply the merge_bvec_fn at this stage, 3251 * just just give up 3252 */ 3253 return 0; 3254 3255 return 1; 3256 } 3257 3258 3259 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3260 { 3261 mddev_t *mddev = q->queuedata; 3262 raid5_conf_t *conf = mddev_to_conf(mddev); 3263 const unsigned int raid_disks = conf->raid_disks; 3264 const unsigned int data_disks = raid_disks - conf->max_degraded; 3265 unsigned int dd_idx, pd_idx; 3266 struct bio* align_bi; 3267 mdk_rdev_t *rdev; 3268 3269 if (!in_chunk_boundary(mddev, raid_bio)) { 3270 pr_debug("chunk_aligned_read : non aligned\n"); 3271 return 0; 3272 } 3273 /* 3274 * use bio_clone to make a copy of the bio 3275 */ 3276 align_bi = bio_clone(raid_bio, GFP_NOIO); 3277 if (!align_bi) 3278 return 0; 3279 /* 3280 * set bi_end_io to a new function, and set bi_private to the 3281 * original bio. 3282 */ 3283 align_bi->bi_end_io = raid5_align_endio; 3284 align_bi->bi_private = raid_bio; 3285 /* 3286 * compute position 3287 */ 3288 align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, 3289 raid_disks, 3290 data_disks, 3291 &dd_idx, 3292 &pd_idx, 3293 conf); 3294 3295 rcu_read_lock(); 3296 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3297 if (rdev && test_bit(In_sync, &rdev->flags)) { 3298 atomic_inc(&rdev->nr_pending); 3299 rcu_read_unlock(); 3300 raid_bio->bi_next = (void*)rdev; 3301 align_bi->bi_bdev = rdev->bdev; 3302 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3303 align_bi->bi_sector += rdev->data_offset; 3304 3305 if (!bio_fits_rdev(align_bi)) { 3306 /* too big in some way */ 3307 bio_put(align_bi); 3308 rdev_dec_pending(rdev, mddev); 3309 return 0; 3310 } 3311 3312 spin_lock_irq(&conf->device_lock); 3313 wait_event_lock_irq(conf->wait_for_stripe, 3314 conf->quiesce == 0, 3315 conf->device_lock, /* nothing */); 3316 atomic_inc(&conf->active_aligned_reads); 3317 spin_unlock_irq(&conf->device_lock); 3318 3319 generic_make_request(align_bi); 3320 return 1; 3321 } else { 3322 rcu_read_unlock(); 3323 bio_put(align_bi); 3324 return 0; 3325 } 3326 } 3327 3328 /* __get_priority_stripe - get the next stripe to process 3329 * 3330 * Full stripe writes are allowed to pass preread active stripes up until 3331 * the bypass_threshold is exceeded. 
In general the bypass_count 3332 * increments when the handle_list is handled before the hold_list; however, it 3333 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3334 * stripe with in flight i/o. The bypass_count will be reset when the 3335 * head of the hold_list has changed, i.e. the head was promoted to the 3336 * handle_list. 3337 */ 3338 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3339 { 3340 struct stripe_head *sh; 3341 3342 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3343 __func__, 3344 list_empty(&conf->handle_list) ? "empty" : "busy", 3345 list_empty(&conf->hold_list) ? "empty" : "busy", 3346 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3347 3348 if (!list_empty(&conf->handle_list)) { 3349 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3350 3351 if (list_empty(&conf->hold_list)) 3352 conf->bypass_count = 0; 3353 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3354 if (conf->hold_list.next == conf->last_hold) 3355 conf->bypass_count++; 3356 else { 3357 conf->last_hold = conf->hold_list.next; 3358 conf->bypass_count -= conf->bypass_threshold; 3359 if (conf->bypass_count < 0) 3360 conf->bypass_count = 0; 3361 } 3362 } 3363 } else if (!list_empty(&conf->hold_list) && 3364 ((conf->bypass_threshold && 3365 conf->bypass_count > conf->bypass_threshold) || 3366 atomic_read(&conf->pending_full_writes) == 0)) { 3367 sh = list_entry(conf->hold_list.next, 3368 typeof(*sh), lru); 3369 conf->bypass_count -= conf->bypass_threshold; 3370 if (conf->bypass_count < 0) 3371 conf->bypass_count = 0; 3372 } else 3373 return NULL; 3374 3375 list_del_init(&sh->lru); 3376 atomic_inc(&sh->count); 3377 BUG_ON(atomic_read(&sh->count) != 1); 3378 return sh; 3379 } 3380 3381 static int make_request(struct request_queue *q, struct bio * bi) 3382 { 3383 mddev_t *mddev = q->queuedata; 3384 raid5_conf_t *conf = mddev_to_conf(mddev); 3385 unsigned int dd_idx, pd_idx; 3386 sector_t new_sector; 3387 sector_t logical_sector, last_sector; 3388 struct stripe_head *sh; 3389 const int rw = bio_data_dir(bi); 3390 int cpu, remaining; 3391 3392 if (unlikely(bio_barrier(bi))) { 3393 bio_endio(bi, -EOPNOTSUPP); 3394 return 0; 3395 } 3396 3397 md_write_start(mddev, bi); 3398 3399 cpu = part_stat_lock(); 3400 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 3401 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 3402 bio_sectors(bi)); 3403 part_stat_unlock(); 3404 3405 if (rw == READ && 3406 mddev->reshape_position == MaxSector && 3407 chunk_aligned_read(q,bi)) 3408 return 0; 3409 3410 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3411 last_sector = bi->bi_sector + (bi->bi_size>>9); 3412 bi->bi_next = NULL; 3413 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3414 3415 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3416 DEFINE_WAIT(w); 3417 int disks, data_disks; 3418 3419 retry: 3420 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3421 if (likely(conf->expand_progress == MaxSector)) 3422 disks = conf->raid_disks; 3423 else { 3424 /* spinlock is needed as expand_progress may be 3425 * 64bit on a 32bit platform, and so it might be 3426 * possible to see a half-updated value 3427 * Ofcourse expand_progress could change after 3428 * the lock is dropped, so once we get a reference 3429 * to the stripe that we think it is, we will have 3430 * to check again. 
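 * (That re-check is the 'must_retry' test further down, performed under
 * device_lock once a reference to the stripe is held.)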
3431 */ 3432 spin_lock_irq(&conf->device_lock); 3433 disks = conf->raid_disks; 3434 if (logical_sector >= conf->expand_progress) 3435 disks = conf->previous_raid_disks; 3436 else { 3437 if (logical_sector >= conf->expand_lo) { 3438 spin_unlock_irq(&conf->device_lock); 3439 schedule(); 3440 goto retry; 3441 } 3442 } 3443 spin_unlock_irq(&conf->device_lock); 3444 } 3445 data_disks = disks - conf->max_degraded; 3446 3447 new_sector = raid5_compute_sector(logical_sector, disks, data_disks, 3448 &dd_idx, &pd_idx, conf); 3449 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3450 (unsigned long long)new_sector, 3451 (unsigned long long)logical_sector); 3452 3453 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); 3454 if (sh) { 3455 if (unlikely(conf->expand_progress != MaxSector)) { 3456 /* expansion might have moved on while waiting for a 3457 * stripe, so we must do the range check again. 3458 * Expansion could still move past after this 3459 * test, but as we are holding a reference to 3460 * 'sh', we know that if that happens, 3461 * STRIPE_EXPANDING will get set and the expansion 3462 * won't proceed until we finish with the stripe. 3463 */ 3464 int must_retry = 0; 3465 spin_lock_irq(&conf->device_lock); 3466 if (logical_sector < conf->expand_progress && 3467 disks == conf->previous_raid_disks) 3468 /* mismatch, need to try again */ 3469 must_retry = 1; 3470 spin_unlock_irq(&conf->device_lock); 3471 if (must_retry) { 3472 release_stripe(sh); 3473 goto retry; 3474 } 3475 } 3476 /* FIXME what if we get a false positive because these 3477 * are being updated. 3478 */ 3479 if (logical_sector >= mddev->suspend_lo && 3480 logical_sector < mddev->suspend_hi) { 3481 release_stripe(sh); 3482 schedule(); 3483 goto retry; 3484 } 3485 3486 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3487 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3488 /* Stripe is busy expanding or 3489 * add failed due to overlap. Flush everything 3490 * and wait a while 3491 */ 3492 raid5_unplug_device(mddev->queue); 3493 release_stripe(sh); 3494 schedule(); 3495 goto retry; 3496 } 3497 finish_wait(&conf->wait_for_overlap, &w); 3498 set_bit(STRIPE_HANDLE, &sh->state); 3499 clear_bit(STRIPE_DELAYED, &sh->state); 3500 release_stripe(sh); 3501 } else { 3502 /* cannot get stripe for read-ahead, just give-up */ 3503 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3504 finish_wait(&conf->wait_for_overlap, &w); 3505 break; 3506 } 3507 3508 } 3509 spin_lock_irq(&conf->device_lock); 3510 remaining = raid5_dec_bi_phys_segments(bi); 3511 spin_unlock_irq(&conf->device_lock); 3512 if (remaining == 0) { 3513 3514 if ( rw == WRITE ) 3515 md_write_end(mddev); 3516 3517 bio_endio(bi, 0); 3518 } 3519 return 0; 3520 } 3521 3522 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 3523 { 3524 /* reshaping is quite different to recovery/resync so it is 3525 * handled quite separately ... here. 3526 * 3527 * On each call to sync_request, we gather one chunk worth of 3528 * destination stripes and flag them as expanding. 3529 * Then we find all the source stripes and request reads. 3530 * As the reads complete, handle_stripe will copy the data 3531 * into the destination stripe and release that stripe. 
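 *
 * Note that sector_nr, like a normal resync position, is counted in
 * sectors per device, while expand_progress and expand_lo count logical
 * data sectors of the array - hence the sector_div() conversions below.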
3532 */ 3533 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3534 struct stripe_head *sh; 3535 int pd_idx; 3536 sector_t first_sector, last_sector; 3537 int raid_disks = conf->previous_raid_disks; 3538 int data_disks = raid_disks - conf->max_degraded; 3539 int new_data_disks = conf->raid_disks - conf->max_degraded; 3540 int i; 3541 int dd_idx; 3542 sector_t writepos, safepos, gap; 3543 3544 if (sector_nr == 0 && 3545 conf->expand_progress != 0) { 3546 /* restarting in the middle, skip the initial sectors */ 3547 sector_nr = conf->expand_progress; 3548 sector_div(sector_nr, new_data_disks); 3549 *skipped = 1; 3550 return sector_nr; 3551 } 3552 3553 /* we update the metadata when there is more than 3Meg 3554 * in the block range (that is rather arbitrary, should 3555 * probably be time based) or when the data about to be 3556 * copied would over-write the source of the data at 3557 * the front of the range. 3558 * i.e. one new_stripe forward from expand_progress new_maps 3559 * to after where expand_lo old_maps to 3560 */ 3561 writepos = conf->expand_progress + 3562 conf->chunk_size/512*(new_data_disks); 3563 sector_div(writepos, new_data_disks); 3564 safepos = conf->expand_lo; 3565 sector_div(safepos, data_disks); 3566 gap = conf->expand_progress - conf->expand_lo; 3567 3568 if (writepos >= safepos || 3569 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3570 /* Cannot proceed until we've updated the superblock... */ 3571 wait_event(conf->wait_for_overlap, 3572 atomic_read(&conf->reshape_stripes)==0); 3573 mddev->reshape_position = conf->expand_progress; 3574 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3575 md_wakeup_thread(mddev->thread); 3576 wait_event(mddev->sb_wait, mddev->flags == 0 || 3577 kthread_should_stop()); 3578 spin_lock_irq(&conf->device_lock); 3579 conf->expand_lo = mddev->reshape_position; 3580 spin_unlock_irq(&conf->device_lock); 3581 wake_up(&conf->wait_for_overlap); 3582 } 3583 3584 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 3585 int j; 3586 int skipped = 0; 3587 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); 3588 sh = get_active_stripe(conf, sector_nr+i, 3589 conf->raid_disks, pd_idx, 0); 3590 set_bit(STRIPE_EXPANDING, &sh->state); 3591 atomic_inc(&conf->reshape_stripes); 3592 /* If any of this stripe is beyond the end of the old 3593 * array, then we need to zero those blocks 3594 */ 3595 for (j=sh->disks; j--;) { 3596 sector_t s; 3597 if (j == sh->pd_idx) 3598 continue; 3599 if (conf->level == 6 && 3600 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3601 continue; 3602 s = compute_blocknr(sh, j); 3603 if (s < mddev->array_sectors) { 3604 skipped = 1; 3605 continue; 3606 } 3607 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 3608 set_bit(R5_Expanded, &sh->dev[j].flags); 3609 set_bit(R5_UPTODATE, &sh->dev[j].flags); 3610 } 3611 if (!skipped) { 3612 set_bit(STRIPE_EXPAND_READY, &sh->state); 3613 set_bit(STRIPE_HANDLE, &sh->state); 3614 } 3615 release_stripe(sh); 3616 } 3617 spin_lock_irq(&conf->device_lock); 3618 conf->expand_progress = (sector_nr + i) * new_data_disks; 3619 spin_unlock_irq(&conf->device_lock); 3620 /* Ok, those stripe are ready. We can start scheduling 3621 * reads on the source stripes. 3622 * The source stripes are determined by mapping the first and last 3623 * block on the destination stripes. 
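 * Those mappings use the old geometry (previous_raid_disks), because
 * that is where the data currently lives.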
3624 */ 3625 first_sector = 3626 raid5_compute_sector(sector_nr*(new_data_disks), 3627 raid_disks, data_disks, 3628 &dd_idx, &pd_idx, conf); 3629 last_sector = 3630 raid5_compute_sector((sector_nr+conf->chunk_size/512) 3631 *(new_data_disks) -1, 3632 raid_disks, data_disks, 3633 &dd_idx, &pd_idx, conf); 3634 if (last_sector >= (mddev->size<<1)) 3635 last_sector = (mddev->size<<1)-1; 3636 while (first_sector <= last_sector) { 3637 pd_idx = stripe_to_pdidx(first_sector, conf, 3638 conf->previous_raid_disks); 3639 sh = get_active_stripe(conf, first_sector, 3640 conf->previous_raid_disks, pd_idx, 0); 3641 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3642 set_bit(STRIPE_HANDLE, &sh->state); 3643 release_stripe(sh); 3644 first_sector += STRIPE_SECTORS; 3645 } 3646 /* If this takes us to the resync_max point where we have to pause, 3647 * then we need to write out the superblock. 3648 */ 3649 sector_nr += conf->chunk_size>>9; 3650 if (sector_nr >= mddev->resync_max) { 3651 /* Cannot proceed until we've updated the superblock... */ 3652 wait_event(conf->wait_for_overlap, 3653 atomic_read(&conf->reshape_stripes) == 0); 3654 mddev->reshape_position = conf->expand_progress; 3655 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3656 md_wakeup_thread(mddev->thread); 3657 wait_event(mddev->sb_wait, 3658 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 3659 || kthread_should_stop()); 3660 spin_lock_irq(&conf->device_lock); 3661 conf->expand_lo = mddev->reshape_position; 3662 spin_unlock_irq(&conf->device_lock); 3663 wake_up(&conf->wait_for_overlap); 3664 } 3665 return conf->chunk_size>>9; 3666 } 3667 3668 /* FIXME go_faster isn't used */ 3669 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 3670 { 3671 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3672 struct stripe_head *sh; 3673 int pd_idx; 3674 int raid_disks = conf->raid_disks; 3675 sector_t max_sector = mddev->size << 1; 3676 int sync_blocks; 3677 int still_degraded = 0; 3678 int i; 3679 3680 if (sector_nr >= max_sector) { 3681 /* just being told to finish up .. nothing much to do */ 3682 unplug_slaves(mddev); 3683 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3684 end_reshape(conf); 3685 return 0; 3686 } 3687 3688 if (mddev->curr_resync < max_sector) /* aborted */ 3689 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3690 &sync_blocks, 1); 3691 else /* completed sync */ 3692 conf->fullsync = 0; 3693 bitmap_close_sync(mddev->bitmap); 3694 3695 return 0; 3696 } 3697 3698 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3699 return reshape_request(mddev, sector_nr, skipped); 3700 3701 /* No need to check resync_max as we never do more than one 3702 * stripe, and as resync_max will always be on a chunk boundary, 3703 * if the check in md_do_sync didn't fire, there is no chance 3704 * of overstepping resync_max here 3705 */ 3706 3707 /* if there is too many failed drives and we are trying 3708 * to resync, then assert that we are finished, because there is 3709 * nothing we can do. 
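 * We still report the remaining range via *skipped so that md_do_sync
 * treats the rest of the resync as complete.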
3710 */
3711 if (mddev->degraded >= conf->max_degraded &&
3712 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3713 sector_t rv = (mddev->size << 1) - sector_nr;
3714 *skipped = 1;
3715 return rv;
3716 }
3717 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
3718 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3719 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
3720 /* we can skip this block, and probably more */
3721 sync_blocks /= STRIPE_SECTORS;
3722 *skipped = 1;
3723 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
3724 }
3725
3726
3727 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3728
3729 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
3730 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3731 if (sh == NULL) {
3732 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
3733 /* make sure we don't swamp the stripe cache if someone else
3734 * is trying to get access
3735 */
3736 schedule_timeout_uninterruptible(1);
3737 }
3738 /* Need to check if the array will still be degraded after recovery/resync.
3739 * We don't need to check the 'failed' flag as when that gets set,
3740 * recovery aborts.
3741 */
3742 for (i=0; i<mddev->raid_disks; i++)
3743 if (conf->disks[i].rdev == NULL)
3744 still_degraded = 1;
3745
3746 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
3747
3748 spin_lock(&sh->lock);
3749 set_bit(STRIPE_SYNCING, &sh->state);
3750 clear_bit(STRIPE_INSYNC, &sh->state);
3751 spin_unlock(&sh->lock);
3752
3753 /* wait for any blocked device to be handled */
3754 while(unlikely(!handle_stripe(sh, NULL)))
3755 ;
3756 release_stripe(sh);
3757
3758 return STRIPE_SECTORS;
3759 }
3760
3761 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3762 {
3763 /* We may not be able to submit a whole bio at once as there
3764 * may not be enough stripe_heads available.
3765 * We cannot pre-allocate enough stripe_heads as we may need
3766 * more than exist in the cache (if we ever allow larger chunks).
3767 * So we do one stripe head at a time and record in
3768 * ->bi_hw_segments how many have been done.
3769 *
3770 * We *know* that this entire raid_bio is in one chunk, so
3771 * it maps to a single 'dd_idx' and needs only one call to raid5_compute_sector.
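 * (The progress counter lives in the top 16 bits of ->bi_phys_segments,
 * via raid5_set_bi_hw_segments() and raid5_bi_hw_segments().  If, say, we
 * had to give up after three stripes, scnt is recorded as 3 and the next
 * retry skips the first three STRIPE_SECTORS-sized pieces of the bio.)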
3772 */ 3773 struct stripe_head *sh; 3774 int dd_idx, pd_idx; 3775 sector_t sector, logical_sector, last_sector; 3776 int scnt = 0; 3777 int remaining; 3778 int handled = 0; 3779 3780 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3781 sector = raid5_compute_sector( logical_sector, 3782 conf->raid_disks, 3783 conf->raid_disks - conf->max_degraded, 3784 &dd_idx, 3785 &pd_idx, 3786 conf); 3787 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 3788 3789 for (; logical_sector < last_sector; 3790 logical_sector += STRIPE_SECTORS, 3791 sector += STRIPE_SECTORS, 3792 scnt++) { 3793 3794 if (scnt < raid5_bi_hw_segments(raid_bio)) 3795 /* already done this stripe */ 3796 continue; 3797 3798 sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); 3799 3800 if (!sh) { 3801 /* failed to get a stripe - must wait */ 3802 raid5_set_bi_hw_segments(raid_bio, scnt); 3803 conf->retry_read_aligned = raid_bio; 3804 return handled; 3805 } 3806 3807 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 3808 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 3809 release_stripe(sh); 3810 raid5_set_bi_hw_segments(raid_bio, scnt); 3811 conf->retry_read_aligned = raid_bio; 3812 return handled; 3813 } 3814 3815 handle_stripe(sh, NULL); 3816 release_stripe(sh); 3817 handled++; 3818 } 3819 spin_lock_irq(&conf->device_lock); 3820 remaining = raid5_dec_bi_phys_segments(raid_bio); 3821 spin_unlock_irq(&conf->device_lock); 3822 if (remaining == 0) 3823 bio_endio(raid_bio, 0); 3824 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3825 wake_up(&conf->wait_for_stripe); 3826 return handled; 3827 } 3828 3829 3830 3831 /* 3832 * This is our raid5 kernel thread. 3833 * 3834 * We scan the hash table for stripes which can be handled now. 3835 * During the scan, completed stripes are saved for us by the interrupt 3836 * handler, so that they will not have to wait for our next wakeup. 
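 * Roughly, each pass under conf->device_lock does three things: write out
 * any bitmap batch that has been closed (seq_flush != seq_write), retry
 * aligned reads that previously failed to get a stripe_head, and then pop
 * stripes off the priority lists via __get_priority_stripe() and run
 * handle_stripe() on each until none remain.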
3837 */ 3838 static void raid5d(mddev_t *mddev) 3839 { 3840 struct stripe_head *sh; 3841 raid5_conf_t *conf = mddev_to_conf(mddev); 3842 int handled; 3843 3844 pr_debug("+++ raid5d active\n"); 3845 3846 md_check_recovery(mddev); 3847 3848 handled = 0; 3849 spin_lock_irq(&conf->device_lock); 3850 while (1) { 3851 struct bio *bio; 3852 3853 if (conf->seq_flush != conf->seq_write) { 3854 int seq = conf->seq_flush; 3855 spin_unlock_irq(&conf->device_lock); 3856 bitmap_unplug(mddev->bitmap); 3857 spin_lock_irq(&conf->device_lock); 3858 conf->seq_write = seq; 3859 activate_bit_delay(conf); 3860 } 3861 3862 while ((bio = remove_bio_from_retry(conf))) { 3863 int ok; 3864 spin_unlock_irq(&conf->device_lock); 3865 ok = retry_aligned_read(conf, bio); 3866 spin_lock_irq(&conf->device_lock); 3867 if (!ok) 3868 break; 3869 handled++; 3870 } 3871 3872 sh = __get_priority_stripe(conf); 3873 3874 if (!sh) 3875 break; 3876 spin_unlock_irq(&conf->device_lock); 3877 3878 handled++; 3879 handle_stripe(sh, conf->spare_page); 3880 release_stripe(sh); 3881 3882 spin_lock_irq(&conf->device_lock); 3883 } 3884 pr_debug("%d stripes handled\n", handled); 3885 3886 spin_unlock_irq(&conf->device_lock); 3887 3888 async_tx_issue_pending_all(); 3889 unplug_slaves(mddev); 3890 3891 pr_debug("--- raid5d inactive\n"); 3892 } 3893 3894 static ssize_t 3895 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 3896 { 3897 raid5_conf_t *conf = mddev_to_conf(mddev); 3898 if (conf) 3899 return sprintf(page, "%d\n", conf->max_nr_stripes); 3900 else 3901 return 0; 3902 } 3903 3904 static ssize_t 3905 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 3906 { 3907 raid5_conf_t *conf = mddev_to_conf(mddev); 3908 unsigned long new; 3909 int err; 3910 3911 if (len >= PAGE_SIZE) 3912 return -EINVAL; 3913 if (!conf) 3914 return -ENODEV; 3915 3916 if (strict_strtoul(page, 10, &new)) 3917 return -EINVAL; 3918 if (new <= 16 || new > 32768) 3919 return -EINVAL; 3920 while (new < conf->max_nr_stripes) { 3921 if (drop_one_stripe(conf)) 3922 conf->max_nr_stripes--; 3923 else 3924 break; 3925 } 3926 err = md_allow_write(mddev); 3927 if (err) 3928 return err; 3929 while (new > conf->max_nr_stripes) { 3930 if (grow_one_stripe(conf)) 3931 conf->max_nr_stripes++; 3932 else break; 3933 } 3934 return len; 3935 } 3936 3937 static struct md_sysfs_entry 3938 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 3939 raid5_show_stripe_cache_size, 3940 raid5_store_stripe_cache_size); 3941 3942 static ssize_t 3943 raid5_show_preread_threshold(mddev_t *mddev, char *page) 3944 { 3945 raid5_conf_t *conf = mddev_to_conf(mddev); 3946 if (conf) 3947 return sprintf(page, "%d\n", conf->bypass_threshold); 3948 else 3949 return 0; 3950 } 3951 3952 static ssize_t 3953 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 3954 { 3955 raid5_conf_t *conf = mddev_to_conf(mddev); 3956 unsigned long new; 3957 if (len >= PAGE_SIZE) 3958 return -EINVAL; 3959 if (!conf) 3960 return -ENODEV; 3961 3962 if (strict_strtoul(page, 10, &new)) 3963 return -EINVAL; 3964 if (new > conf->max_nr_stripes) 3965 return -EINVAL; 3966 conf->bypass_threshold = new; 3967 return len; 3968 } 3969 3970 static struct md_sysfs_entry 3971 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 3972 S_IRUGO | S_IWUSR, 3973 raid5_show_preread_threshold, 3974 raid5_store_preread_threshold); 3975 3976 static ssize_t 3977 stripe_cache_active_show(mddev_t *mddev, char *page) 3978 { 3979 raid5_conf_t *conf = mddev_to_conf(mddev); 3980 
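	/* Read-only counterpart of the tunables above: how many stripe_heads
	 * are in use right now.  These attributes are normally driven from
	 * user space, e.g. (illustrative paths)
	 *	echo 1024 > /sys/block/md0/md/stripe_cache_size
	 *	cat /sys/block/md0/md/stripe_cache_active
	 */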
if (conf)
3981 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
3982 else
3983 return 0;
3984 }
3985
3986 static struct md_sysfs_entry
3987 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
3988
3989 static struct attribute *raid5_attrs[] = {
3990 &raid5_stripecache_size.attr,
3991 &raid5_stripecache_active.attr,
3992 &raid5_preread_bypass_threshold.attr,
3993 NULL,
3994 };
3995 static struct attribute_group raid5_attrs_group = {
3996 .name = NULL,
3997 .attrs = raid5_attrs,
3998 };
3999
4000 static int run(mddev_t *mddev)
4001 {
4002 raid5_conf_t *conf;
4003 int raid_disk, memory;
4004 mdk_rdev_t *rdev;
4005 struct disk_info *disk;
4006 struct list_head *tmp;
4007 int working_disks = 0;
4008
4009 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
4010 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4011 mdname(mddev), mddev->level);
4012 return -EIO;
4013 }
4014
4015 if (mddev->reshape_position != MaxSector) {
4016 /* Check that we can continue the reshape.
4017 * Currently only the number of disks can change, it must
4018 * increase, and we must already be past the point where a
4019 * stripe being written would over-write its own source data
4020 */
4021 sector_t here_new, here_old;
4022 int old_disks;
4023 int max_degraded = (mddev->level == 5 ? 1 : 2);
4024
4025 if (mddev->new_level != mddev->level ||
4026 mddev->new_layout != mddev->layout ||
4027 mddev->new_chunk != mddev->chunk_size) {
4028 printk(KERN_ERR "raid5: %s: unsupported reshape "
4029 "required - aborting.\n",
4030 mdname(mddev));
4031 return -EINVAL;
4032 }
4033 if (mddev->delta_disks <= 0) {
4034 printk(KERN_ERR "raid5: %s: unsupported reshape "
4035 "(reduce disks) required - aborting.\n",
4036 mdname(mddev));
4037 return -EINVAL;
4038 }
4039 old_disks = mddev->raid_disks - mddev->delta_disks;
4040 /* reshape_position must be on a new-stripe boundary, and the stripe
4041 * it falls in under the new geometry must come strictly before the
4042 * first stripe we might still need to read under the old geometry.
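 * For example (hypothetical sizes), growing a RAID5 from 4 to 5 devices
 * with 64K chunks: one new stripe covers 512 array sectors and one old
 * stripe covers 384.  A reshape_position of 1536 gives here_new == 3 and
 * here_old == 4, which is safe to continue from, whereas 512 gives 1 and
 * 1 and is rejected as too early for auto-recovery.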
4043 */ 4044 here_new = mddev->reshape_position; 4045 if (sector_div(here_new, (mddev->chunk_size>>9)* 4046 (mddev->raid_disks - max_degraded))) { 4047 printk(KERN_ERR "raid5: reshape_position not " 4048 "on a stripe boundary\n"); 4049 return -EINVAL; 4050 } 4051 /* here_new is the stripe we will write to */ 4052 here_old = mddev->reshape_position; 4053 sector_div(here_old, (mddev->chunk_size>>9)* 4054 (old_disks-max_degraded)); 4055 /* here_old is the first stripe that we might need to read 4056 * from */ 4057 if (here_new >= here_old) { 4058 /* Reading from the same stripe as writing to - bad */ 4059 printk(KERN_ERR "raid5: reshape_position too early for " 4060 "auto-recovery - aborting.\n"); 4061 return -EINVAL; 4062 } 4063 printk(KERN_INFO "raid5: reshape will continue\n"); 4064 /* OK, we should be able to continue; */ 4065 } 4066 4067 4068 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); 4069 if ((conf = mddev->private) == NULL) 4070 goto abort; 4071 if (mddev->reshape_position == MaxSector) { 4072 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; 4073 } else { 4074 conf->raid_disks = mddev->raid_disks; 4075 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4076 } 4077 4078 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4079 GFP_KERNEL); 4080 if (!conf->disks) 4081 goto abort; 4082 4083 conf->mddev = mddev; 4084 4085 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4086 goto abort; 4087 4088 if (mddev->level == 6) { 4089 conf->spare_page = alloc_page(GFP_KERNEL); 4090 if (!conf->spare_page) 4091 goto abort; 4092 } 4093 spin_lock_init(&conf->device_lock); 4094 mddev->queue->queue_lock = &conf->device_lock; 4095 init_waitqueue_head(&conf->wait_for_stripe); 4096 init_waitqueue_head(&conf->wait_for_overlap); 4097 INIT_LIST_HEAD(&conf->handle_list); 4098 INIT_LIST_HEAD(&conf->hold_list); 4099 INIT_LIST_HEAD(&conf->delayed_list); 4100 INIT_LIST_HEAD(&conf->bitmap_list); 4101 INIT_LIST_HEAD(&conf->inactive_list); 4102 atomic_set(&conf->active_stripes, 0); 4103 atomic_set(&conf->preread_active_stripes, 0); 4104 atomic_set(&conf->active_aligned_reads, 0); 4105 conf->bypass_threshold = BYPASS_THRESHOLD; 4106 4107 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4108 4109 rdev_for_each(rdev, tmp, mddev) { 4110 raid_disk = rdev->raid_disk; 4111 if (raid_disk >= conf->raid_disks 4112 || raid_disk < 0) 4113 continue; 4114 disk = conf->disks + raid_disk; 4115 4116 disk->rdev = rdev; 4117 4118 if (test_bit(In_sync, &rdev->flags)) { 4119 char b[BDEVNAME_SIZE]; 4120 printk(KERN_INFO "raid5: device %s operational as raid" 4121 " disk %d\n", bdevname(rdev->bdev,b), 4122 raid_disk); 4123 working_disks++; 4124 } else 4125 /* Cannot rely on bitmap to complete recovery */ 4126 conf->fullsync = 1; 4127 } 4128 4129 /* 4130 * 0 for a fully functional array, 1 or 2 for a degraded array. 
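 * (e.g. a six-device RAID6 that found only five working members above
 * ends up with degraded == 1, still within max_degraded == 2.)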
4131 */ 4132 mddev->degraded = conf->raid_disks - working_disks; 4133 conf->mddev = mddev; 4134 conf->chunk_size = mddev->chunk_size; 4135 conf->level = mddev->level; 4136 if (conf->level == 6) 4137 conf->max_degraded = 2; 4138 else 4139 conf->max_degraded = 1; 4140 conf->algorithm = mddev->layout; 4141 conf->max_nr_stripes = NR_STRIPES; 4142 conf->expand_progress = mddev->reshape_position; 4143 4144 /* device size must be a multiple of chunk size */ 4145 mddev->size &= ~(mddev->chunk_size/1024 -1); 4146 mddev->resync_max_sectors = mddev->size << 1; 4147 4148 if (conf->level == 6 && conf->raid_disks < 4) { 4149 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4150 mdname(mddev), conf->raid_disks); 4151 goto abort; 4152 } 4153 if (!conf->chunk_size || conf->chunk_size % 4) { 4154 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4155 conf->chunk_size, mdname(mddev)); 4156 goto abort; 4157 } 4158 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { 4159 printk(KERN_ERR 4160 "raid5: unsupported parity algorithm %d for %s\n", 4161 conf->algorithm, mdname(mddev)); 4162 goto abort; 4163 } 4164 if (mddev->degraded > conf->max_degraded) { 4165 printk(KERN_ERR "raid5: not enough operational devices for %s" 4166 " (%d/%d failed)\n", 4167 mdname(mddev), mddev->degraded, conf->raid_disks); 4168 goto abort; 4169 } 4170 4171 if (mddev->degraded > 0 && 4172 mddev->recovery_cp != MaxSector) { 4173 if (mddev->ok_start_degraded) 4174 printk(KERN_WARNING 4175 "raid5: starting dirty degraded array: %s" 4176 "- data corruption possible.\n", 4177 mdname(mddev)); 4178 else { 4179 printk(KERN_ERR 4180 "raid5: cannot start dirty degraded array for %s\n", 4181 mdname(mddev)); 4182 goto abort; 4183 } 4184 } 4185 4186 { 4187 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); 4188 if (!mddev->thread) { 4189 printk(KERN_ERR 4190 "raid5: couldn't allocate thread for %s\n", 4191 mdname(mddev)); 4192 goto abort; 4193 } 4194 } 4195 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4196 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4197 if (grow_stripes(conf, conf->max_nr_stripes)) { 4198 printk(KERN_ERR 4199 "raid5: couldn't allocate %dkB for buffers\n", memory); 4200 shrink_stripes(conf); 4201 md_unregister_thread(mddev->thread); 4202 goto abort; 4203 } else 4204 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4205 memory, mdname(mddev)); 4206 4207 if (mddev->degraded == 0) 4208 printk("raid5: raid level %d set %s active with %d out of %d" 4209 " devices, algorithm %d\n", conf->level, mdname(mddev), 4210 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4211 conf->algorithm); 4212 else 4213 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 4214 " out of %d devices, algorithm %d\n", conf->level, 4215 mdname(mddev), mddev->raid_disks - mddev->degraded, 4216 mddev->raid_disks, conf->algorithm); 4217 4218 print_raid5_conf(conf); 4219 4220 if (conf->expand_progress != MaxSector) { 4221 printk("...ok start reshape thread\n"); 4222 conf->expand_lo = conf->expand_progress; 4223 atomic_set(&conf->reshape_stripes, 0); 4224 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4225 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4226 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4227 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4228 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4229 "%s_reshape"); 4230 } 4231 4232 /* read-ahead size must cover two whole stripes, which is 4233 * 2 * (datadisks) * chunksize where 'n' is the 
number of raid devices 4234 */ 4235 { 4236 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4237 int stripe = data_disks * 4238 (mddev->chunk_size / PAGE_SIZE); 4239 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4240 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4241 } 4242 4243 /* Ok, everything is just fine now */ 4244 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 4245 printk(KERN_WARNING 4246 "raid5: failed to create sysfs attributes for %s\n", 4247 mdname(mddev)); 4248 4249 mddev->queue->unplug_fn = raid5_unplug_device; 4250 mddev->queue->backing_dev_info.congested_data = mddev; 4251 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4252 4253 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - 4254 conf->max_degraded); 4255 4256 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4257 4258 return 0; 4259 abort: 4260 if (conf) { 4261 print_raid5_conf(conf); 4262 safe_put_page(conf->spare_page); 4263 kfree(conf->disks); 4264 kfree(conf->stripe_hashtbl); 4265 kfree(conf); 4266 } 4267 mddev->private = NULL; 4268 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4269 return -EIO; 4270 } 4271 4272 4273 4274 static int stop(mddev_t *mddev) 4275 { 4276 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4277 4278 md_unregister_thread(mddev->thread); 4279 mddev->thread = NULL; 4280 shrink_stripes(conf); 4281 kfree(conf->stripe_hashtbl); 4282 mddev->queue->backing_dev_info.congested_fn = NULL; 4283 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4284 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4285 kfree(conf->disks); 4286 kfree(conf); 4287 mddev->private = NULL; 4288 return 0; 4289 } 4290 4291 #ifdef DEBUG 4292 static void print_sh (struct seq_file *seq, struct stripe_head *sh) 4293 { 4294 int i; 4295 4296 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 4297 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 4298 seq_printf(seq, "sh %llu, count %d.\n", 4299 (unsigned long long)sh->sector, atomic_read(&sh->count)); 4300 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 4301 for (i = 0; i < sh->disks; i++) { 4302 seq_printf(seq, "(cache%d: %p %ld) ", 4303 i, sh->dev[i].page, sh->dev[i].flags); 4304 } 4305 seq_printf(seq, "\n"); 4306 } 4307 4308 static void printall (struct seq_file *seq, raid5_conf_t *conf) 4309 { 4310 struct stripe_head *sh; 4311 struct hlist_node *hn; 4312 int i; 4313 4314 spin_lock_irq(&conf->device_lock); 4315 for (i = 0; i < NR_HASH; i++) { 4316 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 4317 if (sh->raid_conf != conf) 4318 continue; 4319 print_sh(seq, sh); 4320 } 4321 } 4322 spin_unlock_irq(&conf->device_lock); 4323 } 4324 #endif 4325 4326 static void status (struct seq_file *seq, mddev_t *mddev) 4327 { 4328 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4329 int i; 4330 4331 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); 4332 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 4333 for (i = 0; i < conf->raid_disks; i++) 4334 seq_printf (seq, "%s", 4335 conf->disks[i].rdev && 4336 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 4337 seq_printf (seq, "]"); 4338 #ifdef DEBUG 4339 seq_printf (seq, "\n"); 4340 printall(seq, conf); 4341 #endif 4342 } 4343 4344 static void print_raid5_conf (raid5_conf_t *conf) 4345 { 4346 int i; 4347 struct disk_info *tmp; 4348 4349 printk("RAID5 conf printout:\n"); 4350 if (!conf) { 4351 printk("(conf==NULL)\n"); 4352 return; 4353 } 4354 printk(" --- rd:%d wd:%d\n", conf->raid_disks, 4355 conf->raid_disks - conf->mddev->degraded); 4356 4357 for (i = 0; i < conf->raid_disks; i++) { 4358 char b[BDEVNAME_SIZE]; 4359 tmp = conf->disks + i; 4360 if (tmp->rdev) 4361 printk(" disk %d, o:%d, dev:%s\n", 4362 i, !test_bit(Faulty, &tmp->rdev->flags), 4363 bdevname(tmp->rdev->bdev,b)); 4364 } 4365 } 4366 4367 static int raid5_spare_active(mddev_t *mddev) 4368 { 4369 int i; 4370 raid5_conf_t *conf = mddev->private; 4371 struct disk_info *tmp; 4372 4373 for (i = 0; i < conf->raid_disks; i++) { 4374 tmp = conf->disks + i; 4375 if (tmp->rdev 4376 && !test_bit(Faulty, &tmp->rdev->flags) 4377 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 4378 unsigned long flags; 4379 spin_lock_irqsave(&conf->device_lock, flags); 4380 mddev->degraded--; 4381 spin_unlock_irqrestore(&conf->device_lock, flags); 4382 } 4383 } 4384 print_raid5_conf(conf); 4385 return 0; 4386 } 4387 4388 static int raid5_remove_disk(mddev_t *mddev, int number) 4389 { 4390 raid5_conf_t *conf = mddev->private; 4391 int err = 0; 4392 mdk_rdev_t *rdev; 4393 struct disk_info *p = conf->disks + number; 4394 4395 print_raid5_conf(conf); 4396 rdev = p->rdev; 4397 if (rdev) { 4398 if (test_bit(In_sync, &rdev->flags) || 4399 atomic_read(&rdev->nr_pending)) { 4400 err = -EBUSY; 4401 goto abort; 4402 } 4403 /* Only remove non-faulty devices if recovery 4404 * isn't possible. 4405 */ 4406 if (!test_bit(Faulty, &rdev->flags) && 4407 mddev->degraded <= conf->max_degraded) { 4408 err = -EBUSY; 4409 goto abort; 4410 } 4411 p->rdev = NULL; 4412 synchronize_rcu(); 4413 if (atomic_read(&rdev->nr_pending)) { 4414 /* lost the race, try later */ 4415 err = -EBUSY; 4416 p->rdev = rdev; 4417 } 4418 } 4419 abort: 4420 4421 print_raid5_conf(conf); 4422 return err; 4423 } 4424 4425 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4426 { 4427 raid5_conf_t *conf = mddev->private; 4428 int err = -EEXIST; 4429 int disk; 4430 struct disk_info *p; 4431 int first = 0; 4432 int last = conf->raid_disks - 1; 4433 4434 if (mddev->degraded > conf->max_degraded) 4435 /* no point adding a device */ 4436 return -EINVAL; 4437 4438 if (rdev->raid_disk >= 0) 4439 first = last = rdev->raid_disk; 4440 4441 /* 4442 * find the disk ... but prefer rdev->saved_raid_disk 4443 * if possible. 4444 */ 4445 if (rdev->saved_raid_disk >= 0 && 4446 rdev->saved_raid_disk >= first && 4447 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4448 disk = rdev->saved_raid_disk; 4449 else 4450 disk = first; 4451 for ( ; disk <= last ; disk++) 4452 if ((p=conf->disks + disk)->rdev == NULL) { 4453 clear_bit(In_sync, &rdev->flags); 4454 rdev->raid_disk = disk; 4455 err = 0; 4456 if (rdev->saved_raid_disk != disk) 4457 conf->fullsync = 1; 4458 rcu_assign_pointer(p->rdev, rdev); 4459 break; 4460 } 4461 print_raid5_conf(conf); 4462 return err; 4463 } 4464 4465 static int raid5_resize(mddev_t *mddev, sector_t sectors) 4466 { 4467 /* no resync is happening, and there is enough space 4468 * on all devices, so we can resize. 4469 * We need to make sure resync covers any new space. 
4470 * If the array is shrinking we should possibly wait until 4471 * any io in the removed space completes, but it hardly seems 4472 * worth it. 4473 */ 4474 raid5_conf_t *conf = mddev_to_conf(mddev); 4475 4476 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4477 mddev->array_sectors = sectors * (mddev->raid_disks 4478 - conf->max_degraded); 4479 set_capacity(mddev->gendisk, mddev->array_sectors); 4480 mddev->changed = 1; 4481 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4482 mddev->recovery_cp = mddev->size << 1; 4483 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4484 } 4485 mddev->size = sectors /2; 4486 mddev->resync_max_sectors = sectors; 4487 return 0; 4488 } 4489 4490 #ifdef CONFIG_MD_RAID5_RESHAPE 4491 static int raid5_check_reshape(mddev_t *mddev) 4492 { 4493 raid5_conf_t *conf = mddev_to_conf(mddev); 4494 int err; 4495 4496 if (mddev->delta_disks < 0 || 4497 mddev->new_level != mddev->level) 4498 return -EINVAL; /* Cannot shrink array or change level yet */ 4499 if (mddev->delta_disks == 0) 4500 return 0; /* nothing to do */ 4501 if (mddev->bitmap) 4502 /* Cannot grow a bitmap yet */ 4503 return -EBUSY; 4504 4505 /* Can only proceed if there are plenty of stripe_heads. 4506 * We need a minimum of one full stripe,, and for sensible progress 4507 * it is best to have about 4 times that. 4508 * If we require 4 times, then the default 256 4K stripe_heads will 4509 * allow for chunk sizes up to 256K, which is probably OK. 4510 * If the chunk size is greater, user-space should request more 4511 * stripe_heads first. 4512 */ 4513 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || 4514 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { 4515 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 4516 (mddev->chunk_size / STRIPE_SIZE)*4); 4517 return -ENOSPC; 4518 } 4519 4520 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 4521 if (err) 4522 return err; 4523 4524 if (mddev->degraded > conf->max_degraded) 4525 return -EINVAL; 4526 /* looks like we might be able to manage this */ 4527 return 0; 4528 } 4529 4530 static int raid5_start_reshape(mddev_t *mddev) 4531 { 4532 raid5_conf_t *conf = mddev_to_conf(mddev); 4533 mdk_rdev_t *rdev; 4534 struct list_head *rtmp; 4535 int spares = 0; 4536 int added_devices = 0; 4537 unsigned long flags; 4538 4539 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4540 return -EBUSY; 4541 4542 rdev_for_each(rdev, rtmp, mddev) 4543 if (rdev->raid_disk < 0 && 4544 !test_bit(Faulty, &rdev->flags)) 4545 spares++; 4546 4547 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 4548 /* Not enough devices even to make a degraded array 4549 * of that size 4550 */ 4551 return -EINVAL; 4552 4553 atomic_set(&conf->reshape_stripes, 0); 4554 spin_lock_irq(&conf->device_lock); 4555 conf->previous_raid_disks = conf->raid_disks; 4556 conf->raid_disks += mddev->delta_disks; 4557 conf->expand_progress = 0; 4558 conf->expand_lo = 0; 4559 spin_unlock_irq(&conf->device_lock); 4560 4561 /* Add some new drives, as many as will fit. 4562 * We know there are enough to make the newly sized array work. 
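 * (For instance, growing a RAID5 from 4 to 6 devices with a single spare
 * available passes the check above, since 1 - 0 is not less than 2 - 1;
 * after the spare is added below, degraded becomes (6 - 4) - 1 == 1,
 * so the larger array starts degraded but within max_degraded.)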
4563 */ 4564 rdev_for_each(rdev, rtmp, mddev) 4565 if (rdev->raid_disk < 0 && 4566 !test_bit(Faulty, &rdev->flags)) { 4567 if (raid5_add_disk(mddev, rdev) == 0) { 4568 char nm[20]; 4569 set_bit(In_sync, &rdev->flags); 4570 added_devices++; 4571 rdev->recovery_offset = 0; 4572 sprintf(nm, "rd%d", rdev->raid_disk); 4573 if (sysfs_create_link(&mddev->kobj, 4574 &rdev->kobj, nm)) 4575 printk(KERN_WARNING 4576 "raid5: failed to create " 4577 " link %s for %s\n", 4578 nm, mdname(mddev)); 4579 } else 4580 break; 4581 } 4582 4583 spin_lock_irqsave(&conf->device_lock, flags); 4584 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; 4585 spin_unlock_irqrestore(&conf->device_lock, flags); 4586 mddev->raid_disks = conf->raid_disks; 4587 mddev->reshape_position = 0; 4588 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4589 4590 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4591 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4592 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4593 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4594 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4595 "%s_reshape"); 4596 if (!mddev->sync_thread) { 4597 mddev->recovery = 0; 4598 spin_lock_irq(&conf->device_lock); 4599 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 4600 conf->expand_progress = MaxSector; 4601 spin_unlock_irq(&conf->device_lock); 4602 return -EAGAIN; 4603 } 4604 md_wakeup_thread(mddev->sync_thread); 4605 md_new_event(mddev); 4606 return 0; 4607 } 4608 #endif 4609 4610 static void end_reshape(raid5_conf_t *conf) 4611 { 4612 struct block_device *bdev; 4613 4614 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4615 conf->mddev->array_sectors = 2 * conf->mddev->size * 4616 (conf->raid_disks - conf->max_degraded); 4617 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); 4618 conf->mddev->changed = 1; 4619 4620 bdev = bdget_disk(conf->mddev->gendisk, 0); 4621 if (bdev) { 4622 mutex_lock(&bdev->bd_inode->i_mutex); 4623 i_size_write(bdev->bd_inode, 4624 (loff_t)conf->mddev->array_sectors << 9); 4625 mutex_unlock(&bdev->bd_inode->i_mutex); 4626 bdput(bdev); 4627 } 4628 spin_lock_irq(&conf->device_lock); 4629 conf->expand_progress = MaxSector; 4630 spin_unlock_irq(&conf->device_lock); 4631 conf->mddev->reshape_position = MaxSector; 4632 4633 /* read-ahead size must cover two whole stripes, which is 4634 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 4635 */ 4636 { 4637 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4638 int stripe = data_disks * 4639 (conf->mddev->chunk_size / PAGE_SIZE); 4640 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4641 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4642 } 4643 } 4644 } 4645 4646 static void raid5_quiesce(mddev_t *mddev, int state) 4647 { 4648 raid5_conf_t *conf = mddev_to_conf(mddev); 4649 4650 switch(state) { 4651 case 2: /* resume for a suspend */ 4652 wake_up(&conf->wait_for_overlap); 4653 break; 4654 4655 case 1: /* stop all writes */ 4656 spin_lock_irq(&conf->device_lock); 4657 conf->quiesce = 1; 4658 wait_event_lock_irq(conf->wait_for_stripe, 4659 atomic_read(&conf->active_stripes) == 0 && 4660 atomic_read(&conf->active_aligned_reads) == 0, 4661 conf->device_lock, /* nothing */); 4662 spin_unlock_irq(&conf->device_lock); 4663 break; 4664 4665 case 0: /* re-enable writes */ 4666 spin_lock_irq(&conf->device_lock); 4667 conf->quiesce = 0; 4668 wake_up(&conf->wait_for_stripe); 4669 wake_up(&conf->wait_for_overlap); 4670 
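		/* Wake both queues: writers stalled in get_active_stripe()
		 * sleep on wait_for_stripe, while reshape and bio-overlap
		 * waiters may be parked on wait_for_overlap.
		 */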
spin_unlock_irq(&conf->device_lock); 4671 break; 4672 } 4673 } 4674 4675 static struct mdk_personality raid6_personality = 4676 { 4677 .name = "raid6", 4678 .level = 6, 4679 .owner = THIS_MODULE, 4680 .make_request = make_request, 4681 .run = run, 4682 .stop = stop, 4683 .status = status, 4684 .error_handler = error, 4685 .hot_add_disk = raid5_add_disk, 4686 .hot_remove_disk= raid5_remove_disk, 4687 .spare_active = raid5_spare_active, 4688 .sync_request = sync_request, 4689 .resize = raid5_resize, 4690 #ifdef CONFIG_MD_RAID5_RESHAPE 4691 .check_reshape = raid5_check_reshape, 4692 .start_reshape = raid5_start_reshape, 4693 #endif 4694 .quiesce = raid5_quiesce, 4695 }; 4696 static struct mdk_personality raid5_personality = 4697 { 4698 .name = "raid5", 4699 .level = 5, 4700 .owner = THIS_MODULE, 4701 .make_request = make_request, 4702 .run = run, 4703 .stop = stop, 4704 .status = status, 4705 .error_handler = error, 4706 .hot_add_disk = raid5_add_disk, 4707 .hot_remove_disk= raid5_remove_disk, 4708 .spare_active = raid5_spare_active, 4709 .sync_request = sync_request, 4710 .resize = raid5_resize, 4711 #ifdef CONFIG_MD_RAID5_RESHAPE 4712 .check_reshape = raid5_check_reshape, 4713 .start_reshape = raid5_start_reshape, 4714 #endif 4715 .quiesce = raid5_quiesce, 4716 }; 4717 4718 static struct mdk_personality raid4_personality = 4719 { 4720 .name = "raid4", 4721 .level = 4, 4722 .owner = THIS_MODULE, 4723 .make_request = make_request, 4724 .run = run, 4725 .stop = stop, 4726 .status = status, 4727 .error_handler = error, 4728 .hot_add_disk = raid5_add_disk, 4729 .hot_remove_disk= raid5_remove_disk, 4730 .spare_active = raid5_spare_active, 4731 .sync_request = sync_request, 4732 .resize = raid5_resize, 4733 #ifdef CONFIG_MD_RAID5_RESHAPE 4734 .check_reshape = raid5_check_reshape, 4735 .start_reshape = raid5_start_reshape, 4736 #endif 4737 .quiesce = raid5_quiesce, 4738 }; 4739 4740 static int __init raid5_init(void) 4741 { 4742 int e; 4743 4744 e = raid6_select_algo(); 4745 if ( e ) 4746 return e; 4747 register_md_personality(&raid6_personality); 4748 register_md_personality(&raid5_personality); 4749 register_md_personality(&raid4_personality); 4750 return 0; 4751 } 4752 4753 static void raid5_exit(void) 4754 { 4755 unregister_md_personality(&raid6_personality); 4756 unregister_md_personality(&raid5_personality); 4757 unregister_md_personality(&raid4_personality); 4758 } 4759 4760 module_init(raid5_init); 4761 module_exit(raid5_exit); 4762 MODULE_LICENSE("GPL"); 4763 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 4764 MODULE_ALIAS("md-raid5"); 4765 MODULE_ALIAS("md-raid4"); 4766 MODULE_ALIAS("md-level-5"); 4767 MODULE_ALIAS("md-level-4"); 4768 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 4769 MODULE_ALIAS("md-raid6"); 4770 MODULE_ALIAS("md-level-6"); 4771 4772 /* This used to be two separate modules, they were: */ 4773 MODULE_ALIAS("raid5"); 4774 MODULE_ALIAS("raid6"); 4775
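/*
 * Illustrative usage from user space (not part of the driver; device
 * names and sizes below are hypothetical):
 *
 *	# grow a 4-device raid5 by one disk, then enlarge the stripe cache
 *	mdadm --add /dev/md0 /dev/sde
 *	mdadm --grow /dev/md0 --raid-devices=5
 *	echo 1024 > /sys/block/md0/md/stripe_cache_size
 *	cat /sys/block/md0/md/stripe_cache_active
 *
 * The --grow request ends up in raid5_check_reshape() and
 * raid5_start_reshape() above, and the sysfs writes are handled by
 * raid5_store_stripe_cache_size().
 */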