raid5.c (9134d02bc0af4a8747d448d1f811ec5f8eb96df6) vs. raid5.c (1f98a13f623e0ef666690a18c1250335fc6d7ef1)
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible

--- 33 unchanged lines hidden (view full) ---

42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible

--- 33 unchanged lines hidden (view full) ---

42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/async.h>
51#include <linux/seq_file.h>
50#include <linux/seq_file.h>
52#include <linux/cpu.h>
53#include "md.h"
54#include "raid5.h"
55#include "bitmap.h"
56
57/*
58 * Stripe cache
59 */
60

--- 435 unchanged lines hidden (view full) ---

496static struct dma_async_tx_descriptor *
497async_copy_data(int frombio, struct bio *bio, struct page *page,
498 sector_t sector, struct dma_async_tx_descriptor *tx)
499{
500 struct bio_vec *bvl;
501 struct page *bio_page;
502 int i;
503 int page_offset;
51#include "md.h"
52#include "raid5.h"
53#include "bitmap.h"
54
55/*
56 * Stripe cache
57 */
58

--- 435 unchanged lines hidden (view full) ---

494static struct dma_async_tx_descriptor *
495async_copy_data(int frombio, struct bio *bio, struct page *page,
496 sector_t sector, struct dma_async_tx_descriptor *tx)
497{
498 struct bio_vec *bvl;
499 struct page *bio_page;
500 int i;
501 int page_offset;
504 struct async_submit_ctl submit;
505 enum async_tx_flags flags = 0;
506
507 if (bio->bi_sector >= sector)
508 page_offset = (signed)(bio->bi_sector - sector) * 512;
509 else
510 page_offset = (signed)(sector - bio->bi_sector) * -512;
502
503 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512;
511
512 if (frombio)
513 flags |= ASYNC_TX_FENCE;
514 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
515
516 bio_for_each_segment(bvl, bio, i) {
517 int len = bio_iovec_idx(bio, i)->bv_len;
518 int clen;
519 int b_offset = 0;
520
521 if (page_offset < 0) {
522 b_offset = -page_offset;
523 page_offset += b_offset;

--- 5 unchanged lines hidden (view full) ---

529 else
530 clen = len;
531
532 if (clen > 0) {
533 b_offset += bio_iovec_idx(bio, i)->bv_offset;
534 bio_page = bio_iovec_idx(bio, i)->bv_page;
535 if (frombio)
536 tx = async_memcpy(page, bio_page, page_offset,
507 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen;
510 int b_offset = 0;
511
512 if (page_offset < 0) {
513 b_offset = -page_offset;
514 page_offset += b_offset;

--- 5 unchanged lines hidden (view full) ---

520 else
521 clen = len;
522
523 if (clen > 0) {
524 b_offset += bio_iovec_idx(bio, i)->bv_offset;
525 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset,
537 b_offset, clen, &submit);
528 b_offset, clen,
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
538 else
539 tx = async_memcpy(bio_page, page, b_offset,
531 else
532 tx = async_memcpy(bio_page, page, b_offset,
540 page_offset, clen, &submit);
533 page_offset, clen,
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
541 }
536 }
542 /* chain the operations */
543 submit.depend_tx = tx;
544
545 if (clen < len) /* hit end of page */
546 break;
547 page_offset += len;
548 }
549
550 return tx;
551}
552

--- 42 unchanged lines hidden (view full) ---

595 set_bit(STRIPE_HANDLE, &sh->state);
596 release_stripe(sh);
597}
598
599static void ops_run_biofill(struct stripe_head *sh)
600{
601 struct dma_async_tx_descriptor *tx = NULL;
602 raid5_conf_t *conf = sh->raid_conf;
537 if (clen < len) /* hit end of page */
538 break;
539 page_offset += len;
540 }
541
542 return tx;
543}
544
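Both versions of async_copy_data() above locate the bio data relative to the stripe cache page by turning the sector delta into a signed byte offset (512 bytes per sector); a negative offset means the leading bytes of the bio segment fall before the page and are skipped via b_offset. A minimal, self-contained sketch of just that arithmetic — the helper name page_offset_of, the sector_t stand-in typedef and the sample sector numbers are made up for illustration:

#include <stdio.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/* signed byte offset of the bio start relative to the stripe page start */
static long long page_offset_of(sector_t bi_sector, sector_t dev_sector)
{
	if (bi_sector >= dev_sector)
		return (long long)(bi_sector - dev_sector) * 512;
	else
		return (long long)(dev_sector - bi_sector) * -512;
}

int main(void)
{
	/* bio begins 3 sectors past the page: copying starts 1536 bytes in */
	printf("%lld\n", page_offset_of(103, 100));
	/* bio begins 2 sectors before the page: skip 1024 bytes of bio first */
	printf("%lld\n", page_offset_of(98, 100));
	return 0;
}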

--- 42 unchanged lines hidden (view full) ---

587 set_bit(STRIPE_HANDLE, &sh->state);
588 release_stripe(sh);
589}
590
591static void ops_run_biofill(struct stripe_head *sh)
592{
593 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf;
603 struct async_submit_ctl submit;
604 int i;
605
606 pr_debug("%s: stripe %llu\n", __func__,
607 (unsigned long long)sh->sector);
608
609 for (i = sh->disks; i--; ) {
610 struct r5dev *dev = &sh->dev[i];
611 if (test_bit(R5_Wantfill, &dev->flags)) {

--- 7 unchanged lines hidden (view full) ---

619 tx = async_copy_data(0, rbi, dev->page,
620 dev->sector, tx);
621 rbi = r5_next_bio(rbi, dev->sector);
622 }
623 }
624 }
625
626 atomic_inc(&sh->count);
595 int i;
596
597 pr_debug("%s: stripe %llu\n", __func__,
598 (unsigned long long)sh->sector);
599
600 for (i = sh->disks; i--; ) {
601 struct r5dev *dev = &sh->dev[i];
602 if (test_bit(R5_Wantfill, &dev->flags)) {

--- 7 unchanged lines hidden (view full) ---

610 tx = async_copy_data(0, rbi, dev->page,
611 dev->sector, tx);
612 rbi = r5_next_bio(rbi, dev->sector);
613 }
614 }
615 }
616
617 atomic_inc(&sh->count);
627 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
628 async_trigger_callback(&submit);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
619 ops_complete_biofill, sh);
629}
630
620}
621
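The ops_run_biofill() hunk above shows the mechanical change this diff applies throughout the file: the older async_tx calls take flags, the dependency descriptor, a completion callback and its argument as separate parameters, while the newer ones bundle the same information into a struct async_submit_ctl filled by init_async_submit(). A kernel-context fragment (not compilable on its own) restating the two conventions, with tx, sh and ops_complete_biofill exactly as in the code above:

/* old convention: everything passed per call */
async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
		       ops_complete_biofill, sh);

/* new convention: one submit descriptor carries the flags, the dependency,
 * the callback, its argument and (optionally) a scribble region; the
 * ASYNC_TX_DEP_ACK flag from the old call has no counterpart here
 */
struct async_submit_ctl submit;

init_async_submit(&submit, ASYNC_TX_ACK, tx,
		  ops_complete_biofill, sh, NULL);
async_trigger_callback(&submit);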
631static void mark_target_uptodate(struct stripe_head *sh, int target)
622static void ops_complete_compute5(void *stripe_head_ref)
632{
623{
633 struct r5dev *tgt;
634
635 if (target < 0)
636 return;
637
638 tgt = &sh->dev[target];
639 set_bit(R5_UPTODATE, &tgt->flags);
640 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
641 clear_bit(R5_Wantcompute, &tgt->flags);
642}
643
644static void ops_complete_compute(void *stripe_head_ref)
645{
646 struct stripe_head *sh = stripe_head_ref;
624 struct stripe_head *sh = stripe_head_ref;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
647
648 pr_debug("%s: stripe %llu\n", __func__,
649 (unsigned long long)sh->sector);
650
627
628 pr_debug("%s: stripe %llu\n", __func__,
629 (unsigned long long)sh->sector);
630
651 /* mark the computed target(s) as uptodate */
652 mark_target_uptodate(sh, sh->ops.target);
653 mark_target_uptodate(sh, sh->ops.target2);
654
631 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags);
655 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
656 if (sh->check_state == check_state_compute_run)
657 sh->check_state = check_state_compute_result;
658 set_bit(STRIPE_HANDLE, &sh->state);
659 release_stripe(sh);
660}
661
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result;
637 set_bit(STRIPE_HANDLE, &sh->state);
638 release_stripe(sh);
639}
640
662/* return a pointer to the address conversion region of the scribble buffer */
663static addr_conv_t *to_addr_conv(struct stripe_head *sh,
664 struct raid5_percpu *percpu)
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
665{
642{
666 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
667}
668
669static struct dma_async_tx_descriptor *
670ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
671{
643 /* kernel stack size limits the total number of disks */
672 int disks = sh->disks;
644 int disks = sh->disks;
673 struct page **xor_srcs = percpu->scribble;
645 struct page *xor_srcs[disks];
674 int target = sh->ops.target;
675 struct r5dev *tgt = &sh->dev[target];
676 struct page *xor_dest = tgt->page;
677 int count = 0;
678 struct dma_async_tx_descriptor *tx;
646 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page;
649 int count = 0;
650 struct dma_async_tx_descriptor *tx;
679 struct async_submit_ctl submit;
680 int i;
681
682 pr_debug("%s: stripe %llu block: %d\n",
683 __func__, (unsigned long long)sh->sector, target);
684 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
685
686 for (i = disks; i--; )
687 if (i != target)
688 xor_srcs[count++] = sh->dev[i].page;
689
690 atomic_inc(&sh->count);
691
651 int i;
652
653 pr_debug("%s: stripe %llu block: %d\n",
654 __func__, (unsigned long long)sh->sector, target);
655 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
656
657 for (i = disks; i--; )
658 if (i != target)
659 xor_srcs[count++] = sh->dev[i].page;
660
661 atomic_inc(&sh->count);
662
692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
694 if (unlikely(count == 1))
663 if (unlikely(count == 1))
695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
665 0, NULL, ops_complete_compute5, sh);
696 else
666 else
697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
698
699 return tx;
700}
701
670
671 return tx;
672}
673
702/* set_syndrome_sources - populate source buffers for gen_syndrome
703 * @srcs - (struct page *) array of size sh->disks
704 * @sh - stripe_head to parse
705 *
706 * Populates srcs in proper layout order for the stripe and returns the
707 * 'count' of sources to be used in a call to async_gen_syndrome. The P
708 * destination buffer is recorded in srcs[count] and the Q destination
709 * is recorded in srcs[count+1].
710 */
711static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
712{
713 int disks = sh->disks;
714 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
715 int d0_idx = raid6_d0(sh);
716 int count;
717 int i;
718
719 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page;
721
722 count = 0;
723 i = d0_idx;
724 do {
725 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
726
727 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731
732 return count;
733}
734
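As a concrete illustration of the layout documented above, consider a hypothetical 6-device, non-DDF RAID-6 stripe, so syndrome_disks = 4 and the loop leaves count = 4:

/*
 *   srcs[0..3]  the four data pages, in d0..d3 syndrome order
 *   srcs[4]     (== srcs[count])    the P page, from pd_idx
 *   srcs[5]     (== srcs[count+1])  the Q page, from qd_idx
 *
 * which is exactly the array that async_gen_syndrome(blocks, 0, count+2,
 * STRIPE_SIZE, &submit) consumes in ops_run_compute6_1() and
 * ops_run_reconstruct6() below.
 */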
735static struct dma_async_tx_descriptor *
736ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
737{
738 int disks = sh->disks;
739 struct page **blocks = percpu->scribble;
740 int target;
741 int qd_idx = sh->qd_idx;
742 struct dma_async_tx_descriptor *tx;
743 struct async_submit_ctl submit;
744 struct r5dev *tgt;
745 struct page *dest;
746 int i;
747 int count;
748
749 if (sh->ops.target < 0)
750 target = sh->ops.target2;
751 else if (sh->ops.target2 < 0)
752 target = sh->ops.target;
753 else
754 /* we should only have one valid target */
755 BUG();
756 BUG_ON(target < 0);
757 pr_debug("%s: stripe %llu block: %d\n",
758 __func__, (unsigned long long)sh->sector, target);
759
760 tgt = &sh->dev[target];
761 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
762 dest = tgt->page;
763
764 atomic_inc(&sh->count);
765
766 if (target == qd_idx) {
767 count = set_syndrome_sources(blocks, sh);
768 blocks[count] = NULL; /* regenerating p is not necessary */
769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
770 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
771 ops_complete_compute, sh,
772 to_addr_conv(sh, percpu));
773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
774 } else {
775 /* Compute any data- or p-drive using XOR */
776 count = 0;
777 for (i = disks; i-- ; ) {
778 if (i == target || i == qd_idx)
779 continue;
780 blocks[count++] = sh->dev[i].page;
781 }
782
783 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
784 NULL, ops_complete_compute, sh,
785 to_addr_conv(sh, percpu));
786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 }
788
789 return tx;
790}
791
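ops_run_compute6_1() above handles the single-failure case in two ways, summarised here for reference:

/*
 *   target == qd_idx:  the missing block is Q itself, so it is rebuilt
 *       with async_gen_syndrome() over the full source list; blocks[count]
 *       is set to NULL because regenerating P is not needed.
 *   otherwise:         the missing block is a data block or P, either of
 *       which is just the XOR of the remaining non-Q blocks, so a plain
 *       async_xor() into the target page suffices.
 */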
792static struct dma_async_tx_descriptor *
793ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
794{
795 int i, count, disks = sh->disks;
796 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
797 int d0_idx = raid6_d0(sh);
798 int faila = -1, failb = -1;
799 int target = sh->ops.target;
800 int target2 = sh->ops.target2;
801 struct r5dev *tgt = &sh->dev[target];
802 struct r5dev *tgt2 = &sh->dev[target2];
803 struct dma_async_tx_descriptor *tx;
804 struct page **blocks = percpu->scribble;
805 struct async_submit_ctl submit;
806
807 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
808 __func__, (unsigned long long)sh->sector, target, target2);
809 BUG_ON(target < 0 || target2 < 0);
810 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
811 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
812
813 /* we need to open-code set_syndrome_sources to handle the
814 * slot number conversion for 'faila' and 'failb'
815 */
816 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page;
818 count = 0;
819 i = d0_idx;
820 do {
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
822
823 blocks[slot] = sh->dev[i].page;
824
825 if (i == target)
826 faila = slot;
827 if (i == target2)
828 failb = slot;
829 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832
833 BUG_ON(faila == failb);
834 if (failb < faila)
835 swap(faila, failb);
836 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
837 __func__, (unsigned long long)sh->sector, faila, failb);
838
839 atomic_inc(&sh->count);
840
841 if (failb == syndrome_disks+1) {
842 /* Q disk is one of the missing disks */
843 if (faila == syndrome_disks) {
844 /* Missing P+Q, just recompute */
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2,
849 STRIPE_SIZE, &submit);
850 } else {
851 struct page *dest;
852 int data_target;
853 int qd_idx = sh->qd_idx;
854
855 /* Missing D+Q: recompute D from P, then recompute Q */
856 if (target == qd_idx)
857 data_target = target2;
858 else
859 data_target = target;
860
861 count = 0;
862 for (i = disks; i-- ; ) {
863 if (i == data_target || i == qd_idx)
864 continue;
865 blocks[count++] = sh->dev[i].page;
866 }
867 dest = sh->dev[data_target].page;
868 init_async_submit(&submit,
869 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
870 NULL, NULL, NULL,
871 to_addr_conv(sh, percpu));
872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
873 &submit);
874
875 count = set_syndrome_sources(blocks, sh);
876 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
877 ops_complete_compute, sh,
878 to_addr_conv(sh, percpu));
879 return async_gen_syndrome(blocks, 0, count+2,
880 STRIPE_SIZE, &submit);
881 }
882 }
883
884 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
885 sh, to_addr_conv(sh, percpu));
886 if (failb == syndrome_disks) {
887 /* We're missing D+P. */
888 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
889 faila, blocks, &submit);
890 } else {
891 /* We're missing D+D. */
892 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
893 faila, failb, blocks, &submit);
894 }
895}
896
897
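ops_run_compute6_2() dispatches on where the two failed devices land in syndrome slot order (faila < failb after the swap); summarising the branches just shown:

/*
 *   faila == syndrome_disks,  failb == syndrome_disks+1:  P and Q lost;
 *       both are regenerated from the data with async_gen_syndrome().
 *   faila  < syndrome_disks,  failb == syndrome_disks+1:  D and Q lost;
 *       D is rebuilt by XOR of the surviving data plus P, then Q is
 *       regenerated with a chained async_gen_syndrome().
 *   failb == syndrome_disks:                              D and P lost;
 *       handled by async_raid6_datap_recov().
 *   both  < syndrome_disks:                               D and D lost;
 *       handled by async_raid6_2data_recov().
 */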
898static void ops_complete_prexor(void *stripe_head_ref)
899{
900 struct stripe_head *sh = stripe_head_ref;
901
902 pr_debug("%s: stripe %llu\n", __func__,
903 (unsigned long long)sh->sector);
904}
905
906static struct dma_async_tx_descriptor *
674static void ops_complete_prexor(void *stripe_head_ref)
675{
676 struct stripe_head *sh = stripe_head_ref;
677
678 pr_debug("%s: stripe %llu\n", __func__,
679 (unsigned long long)sh->sector);
680}
681
682static struct dma_async_tx_descriptor *
907ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
908 struct dma_async_tx_descriptor *tx)
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
909{
684{
685 /* kernel stack size limits the total number of disks */
910 int disks = sh->disks;
686 int disks = sh->disks;
911 struct page **xor_srcs = percpu->scribble;
687 struct page *xor_srcs[disks];
912 int count = 0, pd_idx = sh->pd_idx, i;
688 int count = 0, pd_idx = sh->pd_idx, i;
913 struct async_submit_ctl submit;
914
915 /* existing parity data subtracted */
916 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
917
918 pr_debug("%s: stripe %llu\n", __func__,
919 (unsigned long long)sh->sector);
920
921 for (i = disks; i--; ) {
922 struct r5dev *dev = &sh->dev[i];
923 /* Only process blocks that are known to be uptodate */
924 if (test_bit(R5_Wantdrain, &dev->flags))
925 xor_srcs[count++] = dev->page;
926 }
927
689
690 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
692
693 pr_debug("%s: stripe %llu\n", __func__,
694 (unsigned long long)sh->sector);
695
696 for (i = disks; i--; ) {
697 struct r5dev *dev = &sh->dev[i];
698 /* Only process blocks that are known to be uptodate */
699 if (test_bit(R5_Wantdrain, &dev->flags))
700 xor_srcs[count++] = dev->page;
701 }
702
928 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
929 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
930 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
705 ops_complete_prexor, sh);
931
932 return tx;
933}
934
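ops_run_prexor() is the first half of a read-modify-write parity update: with ASYNC_TX_XOR_DROP_DST the existing parity page stays as an implicit source, so the pass above "subtracts" (XORs out) the old contents of every block about to be overwritten; ops_run_biodrain() then copies the new data in, and the reconstruct/postxor pass adds it back into parity. A toy, self-contained model of that sequence on one-byte "blocks" (all values made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* three data blocks and their XOR parity, one byte each */
	uint8_t d[3] = { 0x11, 0x22, 0x33 };
	uint8_t p = d[0] ^ d[1] ^ d[2];

	/* rewrite d[1] the way prexor + biodrain + postxor do */
	p ^= d[1];		/* prexor: remove the old data from parity  */
	d[1] = 0x7f;		/* biodrain: new data lands in the page     */
	p ^= d[1];		/* postxor/reconstruct: add the new data in */

	assert(p == (d[0] ^ d[1] ^ d[2]));	/* parity is consistent again */
	return 0;
}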
935static struct dma_async_tx_descriptor *
936ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
937{
938 int disks = sh->disks;

--- 23 unchanged lines hidden (view full) ---

962 wbi = r5_next_bio(wbi, dev->sector);
963 }
964 }
965 }
966
967 return tx;
968}
969
706
707 return tx;
708}
709
710static struct dma_async_tx_descriptor *
711ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
712{
713 int disks = sh->disks;

--- 23 unchanged lines hidden (view full) ---

737 wbi = r5_next_bio(wbi, dev->sector);
738 }
739 }
740 }
741
742 return tx;
743}
744
970static void ops_complete_reconstruct(void *stripe_head_ref)
745static void ops_complete_postxor(void *stripe_head_ref)
971{
972 struct stripe_head *sh = stripe_head_ref;
746{
747 struct stripe_head *sh = stripe_head_ref;
973 int disks = sh->disks;
974 int pd_idx = sh->pd_idx;
975 int qd_idx = sh->qd_idx;
976 int i;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx;
977
978 pr_debug("%s: stripe %llu\n", __func__,
979 (unsigned long long)sh->sector);
980
981 for (i = disks; i--; ) {
982 struct r5dev *dev = &sh->dev[i];
749
750 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector);
752
753 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i];
983
984 if (dev->written || i == pd_idx || i == qd_idx)
755 if (dev->written || i == pd_idx)
985 set_bit(R5_UPTODATE, &dev->flags);
986 }
987
988 if (sh->reconstruct_state == reconstruct_state_drain_run)
989 sh->reconstruct_state = reconstruct_state_drain_result;
990 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
991 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
992 else {
993 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
994 sh->reconstruct_state = reconstruct_state_result;
995 }
996
997 set_bit(STRIPE_HANDLE, &sh->state);
998 release_stripe(sh);
999}
1000
1001static void
756 set_bit(R5_UPTODATE, &dev->flags);
757 }
758
759 if (sh->reconstruct_state == reconstruct_state_drain_run)
760 sh->reconstruct_state = reconstruct_state_drain_result;
761 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
762 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
763 else {
764 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
765 sh->reconstruct_state = reconstruct_state_result;
766 }
767
768 set_bit(STRIPE_HANDLE, &sh->state);
769 release_stripe(sh);
770}
771
772static void
1002ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1003 struct dma_async_tx_descriptor *tx)
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1004{
774{
775 /* kernel stack size limits the total number of disks */
1005 int disks = sh->disks;
776 int disks = sh->disks;
1006 struct page **xor_srcs = percpu->scribble;
1007 struct async_submit_ctl submit;
777 struct page *xor_srcs[disks];
778
1008 int count = 0, pd_idx = sh->pd_idx, i;
1009 struct page *xor_dest;
1010 int prexor = 0;
1011 unsigned long flags;
1012
1013 pr_debug("%s: stripe %llu\n", __func__,
1014 (unsigned long long)sh->sector);
1015

--- 17 unchanged lines hidden (view full) ---

1033 }
1034 }
1035
1036 /* 1/ if we prexor'd then the dest is reused as a source
1037 * 2/ if we did not prexor then we are redoing the parity
1038 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1039 * for the synchronous xor case
1040 */
779 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest;
781 int prexor = 0;
782 unsigned long flags;
783
784 pr_debug("%s: stripe %llu\n", __func__,
785 (unsigned long long)sh->sector);
786

--- 17 unchanged lines hidden (view full) ---

804 }
805 }
806
807 /* 1/ if we prexor'd then the dest is reused as a source
808 * 2/ if we did not prexor then we are redoing the parity
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case
811 */
1041 flags = ASYNC_TX_ACK |
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
1042 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1043
1044 atomic_inc(&sh->count);
1045
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814
815 atomic_inc(&sh->count);
816
1046 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1047 to_addr_conv(sh, percpu));
1048 if (unlikely(count == 1))
1049 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1050 else
1051 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
817 if (unlikely(count == 1)) {
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
820 flags, tx, ops_complete_postxor, sh);
821 } else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
823 flags, tx, ops_complete_postxor, sh);
1052}
1053
824}
825
1054static void
1055ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1056 struct dma_async_tx_descriptor *tx)
1057{
1058 struct async_submit_ctl submit;
1059 struct page **blocks = percpu->scribble;
1060 int count;
1061
1062 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1063
1064 count = set_syndrome_sources(blocks, sh);
1065
1066 atomic_inc(&sh->count);
1067
1068 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1069 sh, to_addr_conv(sh, percpu));
1070 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1071}
1072
1073static void ops_complete_check(void *stripe_head_ref)
1074{
1075 struct stripe_head *sh = stripe_head_ref;
1076
1077 pr_debug("%s: stripe %llu\n", __func__,
1078 (unsigned long long)sh->sector);
1079
1080 sh->check_state = check_state_check_result;
1081 set_bit(STRIPE_HANDLE, &sh->state);
1082 release_stripe(sh);
1083}
1084
826static void ops_complete_check(void *stripe_head_ref)
827{
828 struct stripe_head *sh = stripe_head_ref;
829
830 pr_debug("%s: stripe %llu\n", __func__,
831 (unsigned long long)sh->sector);
832
833 sh->check_state = check_state_check_result;
834 set_bit(STRIPE_HANDLE, &sh->state);
835 release_stripe(sh);
836}
837
1085static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
838static void ops_run_check(struct stripe_head *sh)
1086{
839{
840 /* kernel stack size limits the total number of disks */
1087 int disks = sh->disks;
841 int disks = sh->disks;
1088 int pd_idx = sh->pd_idx;
1089 int qd_idx = sh->qd_idx;
1090 struct page *xor_dest;
1091 struct page **xor_srcs = percpu->scribble;
842 struct page *xor_srcs[disks];
1092 struct dma_async_tx_descriptor *tx;
843 struct dma_async_tx_descriptor *tx;
1093 struct async_submit_ctl submit;
1094 int count;
1095 int i;
1096
844
845 int count = 0, pd_idx = sh->pd_idx, i;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
847
1097 pr_debug("%s: stripe %llu\n", __func__,
1098 (unsigned long long)sh->sector);
1099
848 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector);
850
1100 count = 0;
1101 xor_dest = sh->dev[pd_idx].page;
1102 xor_srcs[count++] = xor_dest;
1103 for (i = disks; i--; ) {
851 for (i = disks; i--; ) {
1104 if (i == pd_idx || i == qd_idx)
1105 continue;
1106 xor_srcs[count++] = sh->dev[i].page;
852 struct r5dev *dev = &sh->dev[i];
853 if (i != pd_idx)
854 xor_srcs[count++] = dev->page;
1107 }
1108
855 }
856
1109 init_async_submit(&submit, 0, NULL, NULL, NULL,
1110 to_addr_conv(sh, percpu));
1111 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1112 &sh->ops.zero_sum_result, &submit);
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
1113
1114 atomic_inc(&sh->count);
859
860 atomic_inc(&sh->count);
1115 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1116 tx = async_trigger_callback(&submit);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
862 ops_complete_check, sh);
1117}
1118
863}
864
1119static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
1120{
866{
1121 struct page **srcs = percpu->scribble;
1122 struct async_submit_ctl submit;
1123 int count;
1124
1125 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1126 (unsigned long long)sh->sector, checkp);
1127
1128 count = set_syndrome_sources(srcs, sh);
1129 if (!checkp)
1130 srcs[count] = NULL;
1131
1132 atomic_inc(&sh->count);
1133 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1134 sh, to_addr_conv(sh, percpu));
1135 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1136 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1137}
1138
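Both check routines above ask the engine only for a pass/fail summary in sh->ops.zero_sum_result rather than a full result block: a parity stripe is consistent exactly when P XORed with all of its data blocks cancels to zero everywhere. A toy, self-contained model of that zero-sum test (the helper name xor_is_zero and the sample data are made up):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* returns 1 if the byte-wise XOR of all blocks is zero everywhere */
static int xor_is_zero(const uint8_t *const blocks[], size_t nblocks, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t acc = 0;

		for (size_t b = 0; b < nblocks; b++)
			acc ^= blocks[b][i];
		if (acc)
			return 0;
	}
	return 1;
}

int main(void)
{
	uint8_t d0[4] = { 1, 2, 3, 4 }, d1[4] = { 5, 6, 7, 8 }, p[4];
	const uint8_t *const stripe[] = { p, d0, d1 };

	for (int i = 0; i < 4; i++)
		p[i] = d0[i] ^ d1[i];

	assert(xor_is_zero(stripe, 3, 4));	/* consistent stripe        */
	p[2] ^= 0x40;				/* corrupt the parity...    */
	assert(!xor_is_zero(stripe, 3, 4));	/* ...and the check sees it */
	return 0;
}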
1139static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1140{
1141 int overlap_clear = 0, i, disks = sh->disks;
1142 struct dma_async_tx_descriptor *tx = NULL;
867 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL;
1143 raid5_conf_t *conf = sh->raid_conf;
1144 int level = conf->level;
1145 struct raid5_percpu *percpu;
1146 unsigned long cpu;
1147
869
1148 cpu = get_cpu();
1149 percpu = per_cpu_ptr(conf->percpu, cpu);
1150 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1151 ops_run_biofill(sh);
1152 overlap_clear++;
1153 }
1154
1155 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh);
872 overlap_clear++;
873 }
874
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1156 if (level < 6)
1157 tx = ops_run_compute5(sh, percpu);
1158 else {
1159 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1160 tx = ops_run_compute6_1(sh, percpu);
1161 else
1162 tx = ops_run_compute6_2(sh, percpu);
1163 }
1164 /* terminate the chain if reconstruct is not set to be run */
1165 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
876 tx = ops_run_compute5(sh);
877 /* terminate the chain if postxor is not set to be run */
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
1166 async_tx_ack(tx);
1167 }
1168
1169 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
879 async_tx_ack(tx);
880 }
881
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1170 tx = ops_run_prexor(sh, percpu, tx);
883 tx = ops_run_prexor(sh, tx);
1171
1172 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1173 tx = ops_run_biodrain(sh, tx);
1174 overlap_clear++;
1175 }
1176
884
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++;
888 }
889
1177 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1178 if (level < 6)
1179 ops_run_reconstruct5(sh, percpu, tx);
1180 else
1181 ops_run_reconstruct6(sh, percpu, tx);
1182 }
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
891 ops_run_postxor(sh, tx);
1183
892
1184 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1185 if (sh->check_state == check_state_run)
1186 ops_run_check_p(sh, percpu);
1187 else if (sh->check_state == check_state_run_q)
1188 ops_run_check_pq(sh, percpu, 0);
1189 else if (sh->check_state == check_state_run_pq)
1190 ops_run_check_pq(sh, percpu, 1);
1191 else
1192 BUG();
1193 }
893 if (test_bit(STRIPE_OP_CHECK, &ops_request))
894 ops_run_check(sh);
1194
1195 if (overlap_clear)
1196 for (i = disks; i--; ) {
1197 struct r5dev *dev = &sh->dev[i];
1198 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1199 wake_up(&sh->raid_conf->wait_for_overlap);
1200 }
895
896 if (overlap_clear)
897 for (i = disks; i--; ) {
898 struct r5dev *dev = &sh->dev[i];
899 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap);
901 }
1201 put_cpu();
1202}
1203
1204static int grow_one_stripe(raid5_conf_t *conf)
1205{
1206 struct stripe_head *sh;
1207 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1208 if (!sh)
1209 return 0;

--- 33 unchanged lines hidden (view full) ---

1243 conf->slab_cache = sc;
1244 conf->pool_size = devs;
1245 while (num--)
1246 if (!grow_one_stripe(conf))
1247 return 1;
1248 return 0;
1249}
1250
902}
903
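The new raid_run_ops() above replaces the old raid5_run_ops() pattern of on-stack "struct page *xor_srcs[disks]" arrays (flagged in the old code with "kernel stack size limits the total number of disks") by borrowing a preallocated per-CPU scribble buffer for the duration of the call. A kernel-context fragment (not compilable on its own) isolating that access pattern, with conf as in the code above:

unsigned long cpu;
struct raid5_percpu *percpu;

cpu = get_cpu();			/* pin to this CPU; preemption off   */
percpu = per_cpu_ptr(conf->percpu, cpu);

/* ... percpu->scribble and percpu->spare_page are handed to the
 * ops_run_* helpers; with preemption disabled, no other stripe work
 * can race for this CPU's buffers ...
 */

put_cpu();				/* done with the per-CPU buffers */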
904static int grow_one_stripe(raid5_conf_t *conf)
905{
906 struct stripe_head *sh;
907 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
908 if (!sh)
909 return 0;

--- 33 unchanged lines hidden (view full) ---

943 conf->slab_cache = sc;
944 conf->pool_size = devs;
945 while (num--)
946 if (!grow_one_stripe(conf))
947 return 1;
948 return 0;
949}
950
1251/**
1252 * scribble_len - return the required size of the scribble region
1253 * @num - total number of disks in the array
1254 *
1255 * The size must be enough to contain:
1256 * 1/ a struct page pointer for each device in the array +2
1257 * 2/ room to convert each entry in (1) to its corresponding dma
1258 * (dma_map_page()) or page (page_address()) address.
1259 *
1260 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1261 * calculate over all devices (not just the data blocks), using zeros in place
1262 * of the P and Q blocks.
1263 */
1264static size_t scribble_len(int num)
1265{
1266 size_t len;
1267
1268 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1269
1270 return len;
1271}
1272
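Combined with to_addr_conv() earlier in this diff, which returns percpu->scribble + sizeof(struct page *) * (sh->disks + 2), this gives each per-CPU scribble buffer a two-part layout. Sketched here for a hypothetical num = 8 device array, with page pointers assumed 8 bytes on a 64-bit build and sizeof(addr_conv_t) left symbolic:

/*
 *   offset 0:
 *       struct page *[num + 2]               -> 10 pointers, 80 bytes
 *   offset sizeof(struct page *) * (num + 2):
 *       addr_conv_t[num + 2]                 -> 10 entries
 *
 * The first part is the source/destination page list handed to
 * async_xor()/async_gen_syndrome(); the second is the address-conversion
 * workspace those routines receive via the last init_async_submit()
 * argument.
 */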
1273static int resize_stripes(raid5_conf_t *conf, int newsize)
1274{
1275 /* Make all the stripes able to hold 'newsize' devices.
1276 * New slots in each stripe get 'page' set to a new page.
1277 *
1278 * This happens in stages:
1279 * 1/ create a new kmem_cache and allocate the required number of
1280 * stripe_heads.

--- 12 unchanged lines hidden (view full) ---

1293 * active service.
1294 *
1295 * Once step2 is started, we cannot afford to wait for a write,
1296 * so we use GFP_NOIO allocations.
1297 */
1298 struct stripe_head *osh, *nsh;
1299 LIST_HEAD(newstripes);
1300 struct disk_info *ndisks;
951static int resize_stripes(raid5_conf_t *conf, int newsize)
952{
953 /* Make all the stripes able to hold 'newsize' devices.
954 * New slots in each stripe get 'page' set to a new page.
955 *
956 * This happens in stages:
957 * 1/ create a new kmem_cache and allocate the required number of
958 * stripe_heads.

--- 12 unchanged lines hidden (view full) ---

971 * active service.
972 *
973 * Once step2 is started, we cannot afford to wait for a write,
974 * so we use GFP_NOIO allocations.
975 */
976 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes);
978 struct disk_info *ndisks;
1301 unsigned long cpu;
1302 int err;
1303 struct kmem_cache *sc;
1304 int i;
1305
1306 if (newsize <= conf->pool_size)
1307 return 0; /* never bother to shrink */
1308
1309 err = md_allow_write(conf->mddev);

--- 49 unchanged lines hidden (view full) ---

1359 nsh->dev[i].page = NULL;
1360 kmem_cache_free(conf->slab_cache, osh);
1361 }
1362 kmem_cache_destroy(conf->slab_cache);
1363
1364 /* Step 3.
1365 * At this point, we are holding all the stripes so the array
1366 * is completely stalled, so now is a good time to resize
979 int err;
980 struct kmem_cache *sc;
981 int i;
982
983 if (newsize <= conf->pool_size)
984 return 0; /* never bother to shrink */
985
986 err = md_allow_write(conf->mddev);

--- 49 unchanged lines hidden (view full) ---

1036 nsh->dev[i].page = NULL;
1037 kmem_cache_free(conf->slab_cache, osh);
1038 }
1039 kmem_cache_destroy(conf->slab_cache);
1040
1041 /* Step 3.
1042 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize
1367 * conf->disks and the scribble region
1044 * conf->disks.
1368 */
1369 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1370 if (ndisks) {
1371 for (i=0; i<conf->raid_disks; i++)
1372 ndisks[i] = conf->disks[i];
1373 kfree(conf->disks);
1374 conf->disks = ndisks;
1375 } else
1376 err = -ENOMEM;
1377
1045 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) {
1048 for (i=0; i<conf->raid_disks; i++)
1049 ndisks[i] = conf->disks[i];
1050 kfree(conf->disks);
1051 conf->disks = ndisks;
1052 } else
1053 err = -ENOMEM;
1054
1378 get_online_cpus();
1379 conf->scribble_len = scribble_len(newsize);
1380 for_each_present_cpu(cpu) {
1381 struct raid5_percpu *percpu;
1382 void *scribble;
1383
1384 percpu = per_cpu_ptr(conf->percpu, cpu);
1385 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1386
1387 if (scribble) {
1388 kfree(percpu->scribble);
1389 percpu->scribble = scribble;
1390 } else {
1391 err = -ENOMEM;
1392 break;
1393 }
1394 }
1395 put_online_cpus();
1396
1397 /* Step 4, return new stripes to service */
1398 while(!list_empty(&newstripes)) {
1399 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1400 list_del_init(&nsh->lru);
1055 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru);
1401
1402 for (i=conf->raid_disks; i < newsize; i++)
1403 if (nsh->dev[i].page == NULL) {
1404 struct page *p = alloc_page(GFP_NOIO);
1405 nsh->dev[i].page = p;
1406 if (!p)
1407 err = -ENOMEM;
1408 }
1409 release_stripe(nsh);

--- 522 unchanged lines hidden (view full) ---

1932 || sh2.qd_idx != sh->qd_idx) {
1933 printk(KERN_ERR "compute_blocknr: map not correct\n");
1934 return 0;
1935 }
1936 return r_sector;
1937}
1938
1939
1059 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO);
1062 nsh->dev[i].page = p;
1063 if (!p)
1064 err = -ENOMEM;
1065 }
1066 release_stripe(nsh);

--- 522 unchanged lines hidden (view full) ---

1589 || sh2.qd_idx != sh->qd_idx) {
1590 printk(KERN_ERR "compute_blocknr: map not correct\n");
1591 return 0;
1592 }
1593 return r_sector;
1594}
1595
1596
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
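For background (not stated in this file): raid6_call.gen_syndrome() above, like the async_gen_syndrome() calls elsewhere in this diff, computes the standard RAID-6 syndromes, which is also why the "ordering of the disks matters greatly" comment holds — Q depends on each block's position:

/*
 * P = D_0 xor D_1 xor ... xor D_{n-1}
 * Q = g^0*D_0 xor g^1*D_1 xor ... xor g^{n-1}*D_{n-1}
 *
 * with multiplication and powers taken in GF(2^8) and generator g = {02},
 * so each data page must be presented in its fixed syndrome slot.
 */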
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1940static void
1844static void
1941schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1942 int rcw, int expand)
1943{
1944 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1846 int rcw, int expand)
1847{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1945 raid5_conf_t *conf = sh->raid_conf;
1946 int level = conf->level;
1947
1948 if (rcw) {
1949 /* if we are not expanding this is a proper write request, and
1950 * there will be bios with new data to be drained into the
1951 * stripe cache
1952 */
1953 if (!expand) {
1954 sh->reconstruct_state = reconstruct_state_drain_run;
1955 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1956 } else
1957 sh->reconstruct_state = reconstruct_state_run;
1958
1849
1850 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and
1852 * there will be bios with new data to be drained into the
1853 * stripe cache
1854 */
1855 if (!expand) {
1856 sh->reconstruct_state = reconstruct_state_drain_run;
1857 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1858 } else
1859 sh->reconstruct_state = reconstruct_state_run;
1860
1959 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1960
1961 for (i = disks; i--; ) {
1962 struct r5dev *dev = &sh->dev[i];
1963
1964 if (dev->towrite) {
1965 set_bit(R5_LOCKED, &dev->flags);
1966 set_bit(R5_Wantdrain, &dev->flags);
1967 if (!expand)
1968 clear_bit(R5_UPTODATE, &dev->flags);
1969 s->locked++;
1970 }
1971 }
1862
1863 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i];
1865
1866 if (dev->towrite) {
1867 set_bit(R5_LOCKED, &dev->flags);
1868 set_bit(R5_Wantdrain, &dev->flags);
1869 if (!expand)
1870 clear_bit(R5_UPTODATE, &dev->flags);
1871 s->locked++;
1872 }
1873 }
1972 if (s->locked + conf->max_degraded == disks)
1874 if (s->locked + 1 == disks)
1973 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1974 atomic_inc(&conf->pending_full_writes);
1876 atomic_inc(&sh->raid_conf->pending_full_writes);
1975 } else {
1877 } else {
1976 BUG_ON(level == 6);
1977 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1978 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1979
1980 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1981 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1982 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1983 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1984
1985 for (i = disks; i--; ) {
1986 struct r5dev *dev = &sh->dev[i];
1987 if (i == pd_idx)
1988 continue;
1989
1990 if (dev->towrite &&
1991 (test_bit(R5_UPTODATE, &dev->flags) ||
1992 test_bit(R5_Wantcompute, &dev->flags))) {
1993 set_bit(R5_Wantdrain, &dev->flags);
1994 set_bit(R5_LOCKED, &dev->flags);
1995 clear_bit(R5_UPTODATE, &dev->flags);
1996 s->locked++;
1997 }
1998 }
1999 }
2000
1885
1886 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i];
1888 if (i == pd_idx)
1889 continue;
1890
1891 if (dev->towrite &&
1892 (test_bit(R5_UPTODATE, &dev->flags) ||
1893 test_bit(R5_Wantcompute, &dev->flags))) {
1894 set_bit(R5_Wantdrain, &dev->flags);
1895 set_bit(R5_LOCKED, &dev->flags);
1896 clear_bit(R5_UPTODATE, &dev->flags);
1897 s->locked++;
1898 }
1899 }
1900 }
1901
2001 /* keep the parity disk(s) locked while asynchronous operations
1902 /* keep the parity disk locked while asynchronous operations
2002 * are in flight
2003 */
2004 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2005 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2006 s->locked++;
2007
1903 * are in flight
1904 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++;
1908
2008 if (level == 6) {
2009 int qd_idx = sh->qd_idx;
2010 struct r5dev *dev = &sh->dev[qd_idx];
2011
2012 set_bit(R5_LOCKED, &dev->flags);
2013 clear_bit(R5_UPTODATE, &dev->flags);
2014 s->locked++;
2015 }
2016
2017 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2018 __func__, (unsigned long long)sh->sector,
2019 s->locked, s->ops_request);
2020}
2021
2022/*
2023 * Each stripe/dev can have one or more bion attached.
2024 * toread/towrite point to the first in a chain.

--- 64 unchanged lines hidden (view full) ---

2089 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2090 spin_unlock_irq(&conf->device_lock);
2091 spin_unlock(&sh->lock);
2092 return 0;
2093}
2094
2095static void end_reshape(raid5_conf_t *conf);
2096
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request);
1912}
1913
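The full-stripe-write test in the hunk above changes from "s->locked + 1 == disks" to "s->locked + conf->max_degraded == disks" so one routine covers both parity layouts: the drain loop locks every data block being written, and the parity blocks — one for RAID-4/5, two for RAID-6, which is what max_degraded counts in this driver — are locked separately just below.

/*
 * Worked case (hypothetical 8-device RAID-6, full-stripe write):
 *   data blocks locked by the drain loop:  6
 *   conf->max_degraded (P and Q):          2
 *   6 + 2 == 8 == disks  ->  STRIPE_FULL_WRITE is set
 *
 * With the old RAID-5-only form, 7 locked data blocks + 1 == 8 covered
 * the same situation for a single parity disk.
 */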
1914/*
1915 * Each stripe/dev can have one or more bion attached.
1916 * toread/towrite point to the first in a chain.

--- 64 unchanged lines hidden (view full) ---

1981 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1982 spin_unlock_irq(&conf->device_lock);
1983 spin_unlock(&sh->lock);
1984 return 0;
1985}
1986
1987static void end_reshape(raid5_conf_t *conf);
1988
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
2097static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2098 struct stripe_head *sh)
2099{
2100 int sectors_per_chunk =
2101 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2102 int dd_idx;
2103 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2104 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

--- 123 unchanged lines hidden (view full) ---

2228 * otherwise read it if the backing disk is insync
2229 */
2230 if ((s->uptodate == disks - 1) &&
2231 (s->failed && disk_idx == s->failed_num)) {
2232 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2233 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2234 set_bit(R5_Wantcompute, &dev->flags);
2235 sh->ops.target = disk_idx;
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh)
1998{
1999 int sectors_per_chunk =
2000 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2001 int dd_idx;
2002 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2003 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

--- 123 unchanged lines hidden (view full) ---

2127 * otherwise read it if the backing disk is insync
2128 */
2129 if ((s->uptodate == disks - 1) &&
2130 (s->failed && disk_idx == s->failed_num)) {
2131 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2132 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2133 set_bit(R5_Wantcompute, &dev->flags);
2134 sh->ops.target = disk_idx;
2236 sh->ops.target2 = -1;
2237 s->req_compute = 1;
2238 /* Careful: from this point on 'uptodate' is in the eye
2135 s->req_compute = 1;
2136 /* Careful: from this point on 'uptodate' is in the eye
2239 * of raid_run_ops which services 'compute' operations
2137 * of raid5_run_ops which services 'compute' operations
2240 * before writes. R5_Wantcompute flags a block that will
2241 * be R5_UPTODATE by the time it is needed for a
2242 * subsequent operation.
2243 */
2244 s->uptodate++;
2245 return 1; /* uptodate + compute == disks */
2246 } else if (test_bit(R5_Insync, &dev->flags)) {
2247 set_bit(R5_LOCKED, &dev->flags);

--- 22 unchanged lines hidden (view full) ---

2270 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2271 !sh->reconstruct_state)
2272 for (i = disks; i--; )
2273 if (fetch_block5(sh, s, i, disks))
2274 break;
2275 set_bit(STRIPE_HANDLE, &sh->state);
2276}
2277
2138 * before writes. R5_Wantcompute flags a block that will
2139 * be R5_UPTODATE by the time it is needed for a
2140 * subsequent operation.
2141 */
2142 s->uptodate++;
2143 return 1; /* uptodate + compute == disks */
2144 } else if (test_bit(R5_Insync, &dev->flags)) {
2145 set_bit(R5_LOCKED, &dev->flags);

--- 22 unchanged lines hidden (view full) ---

2168 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2169 !sh->reconstruct_state)
2170 for (i = disks; i--; )
2171 if (fetch_block5(sh, s, i, disks))
2172 break;
2173 set_bit(STRIPE_HANDLE, &sh->state);
2174}
2175
2278/* fetch_block6 - checks the given member device to see if its data needs
2279 * to be read or computed to satisfy a request.
2280 *
2281 * Returns 1 when no more member devices need to be checked, otherwise returns
2282 * 0 to tell the loop in handle_stripe_fill6 to continue
2283 */
2284static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2285 struct r6_state *r6s, int disk_idx, int disks)
2286{
2287 struct r5dev *dev = &sh->dev[disk_idx];
2288 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2289 &sh->dev[r6s->failed_num[1]] };
2290
2291 if (!test_bit(R5_LOCKED, &dev->flags) &&
2292 !test_bit(R5_UPTODATE, &dev->flags) &&
2293 (dev->toread ||
2294 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2295 s->syncing || s->expanding ||
2296 (s->failed >= 1 &&
2297 (fdev[0]->toread || s->to_write)) ||
2298 (s->failed >= 2 &&
2299 (fdev[1]->toread || s->to_write)))) {
2300 /* we would like to get this block, possibly by computing it,
2301 * otherwise read it if the backing disk is insync
2302 */
2303 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2304 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2305 if ((s->uptodate == disks - 1) &&
2306 (s->failed && (disk_idx == r6s->failed_num[0] ||
2307 disk_idx == r6s->failed_num[1]))) {
2308 /* the disk has failed and we've been asked to fetch its block;
2309 * so compute it
2310 */
2311 pr_debug("Computing stripe %llu block %d\n",
2312 (unsigned long long)sh->sector, disk_idx);
2313 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2314 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2315 set_bit(R5_Wantcompute, &dev->flags);
2316 sh->ops.target = disk_idx;
2317 sh->ops.target2 = -1; /* no 2nd target */
2318 s->req_compute = 1;
2319 s->uptodate++;
2320 return 1;
2321 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2322 /* Computing 2-failure is *very* expensive; only
2323 * do it if failed >= 2
2324 */
2325 int other;
2326 for (other = disks; other--; ) {
2327 if (other == disk_idx)
2328 continue;
2329 if (!test_bit(R5_UPTODATE,
2330 &sh->dev[other].flags))
2331 break;
2332 }
2333 BUG_ON(other < 0);
2334 pr_debug("Computing stripe %llu blocks %d,%d\n",
2335 (unsigned long long)sh->sector,
2336 disk_idx, other);
2337 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2338 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2339 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2340 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2341 sh->ops.target = disk_idx;
2342 sh->ops.target2 = other;
2343 s->uptodate += 2;
2344 s->req_compute = 1;
2345 return 1;
2346 } else if (test_bit(R5_Insync, &dev->flags)) {
2347 set_bit(R5_LOCKED, &dev->flags);
2348 set_bit(R5_Wantread, &dev->flags);
2349 s->locked++;
2350 pr_debug("Reading block %d (sync=%d)\n",
2351 disk_idx, s->syncing);
2352 }
2353 }
2354
2355 return 0;
2356}
2357
2358/**
2359 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2360 */
2361static void handle_stripe_fill6(struct stripe_head *sh,
2362 struct stripe_head_state *s, struct r6_state *r6s,
2363 int disks)
2364{
2365 int i;
2176static void handle_stripe_fill6(struct stripe_head *sh,
2177 struct stripe_head_state *s, struct r6_state *r6s,
2178 int disks)
2179{
2180 int i;
2366
2367 /* look for blocks to read/compute, skip this if a compute
2368 * is already in flight, or if the stripe contents are in the
2369 * midst of changing due to a write
2370 */
2371 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2372 !sh->reconstruct_state)
2373 for (i = disks; i--; )
2374 if (fetch_block6(sh, s, r6s, i, disks))
2375 break;
2181 for (i = disks; i--; ) {
2182 struct r5dev *dev = &sh->dev[i];
2183 if (!test_bit(R5_LOCKED, &dev->flags) &&
2184 !test_bit(R5_UPTODATE, &dev->flags) &&
2185 (dev->toread || (dev->towrite &&
2186 !test_bit(R5_OVERWRITE, &dev->flags)) ||
2187 s->syncing || s->expanding ||
2188 (s->failed >= 1 &&
2189 (sh->dev[r6s->failed_num[0]].toread ||
2190 s->to_write)) ||
2191 (s->failed >= 2 &&
2192 (sh->dev[r6s->failed_num[1]].toread ||
2193 s->to_write)))) {
2194 /* we would like to get this block, possibly
2195 * by computing it, but we might not be able to
2196 */
2197 if ((s->uptodate == disks - 1) &&
2198 (s->failed && (i == r6s->failed_num[0] ||
2199 i == r6s->failed_num[1]))) {
2200 pr_debug("Computing stripe %llu block %d\n",
2201 (unsigned long long)sh->sector, i);
2202 compute_block_1(sh, i, 0);
2203 s->uptodate++;
2204 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2205 /* Computing 2-failure is *very* expensive; only
2206 * do it if failed >= 2
2207 */
2208 int other;
2209 for (other = disks; other--; ) {
2210 if (other == i)
2211 continue;
2212 if (!test_bit(R5_UPTODATE,
2213 &sh->dev[other].flags))
2214 break;
2215 }
2216 BUG_ON(other < 0);
2217 pr_debug("Computing stripe %llu blocks %d,%d\n",
2218 (unsigned long long)sh->sector,
2219 i, other);
2220 compute_block_2(sh, i, other);
2221 s->uptodate += 2;
2222 } else if (test_bit(R5_Insync, &dev->flags)) {
2223 set_bit(R5_LOCKED, &dev->flags);
2224 set_bit(R5_Wantread, &dev->flags);
2225 s->locked++;
2226 pr_debug("Reading block %d (sync=%d)\n",
2227 i, s->syncing);
2228 }
2229 }
2230 }
2376 set_bit(STRIPE_HANDLE, &sh->state);
2377}
2378
2379
2380/* handle_stripe_clean_event
2381 * any written block on an uptodate or failed drive can be returned.
2382 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2383 * never LOCKED, so we don't need to test 'failed' directly.

--- 117 unchanged lines hidden ---

2501 }
2502 }
2503 }
2504 /* now if nothing is locked, and if we have enough data,
2505 * we can start a write request
2506 */
2507 /* since handle_stripe can be called at any time we need to handle the
2508 * case where a compute block operation has been submitted and then a
2231 set_bit(STRIPE_HANDLE, &sh->state);
2232}
2233
2234
2235/* handle_stripe_clean_event
2236 * any written block on an uptodate or failed drive can be returned.
2237 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2238 * never LOCKED, so we don't need to test 'failed' directly.

--- 117 unchanged lines hidden ---

2356 }
2357 }
2358 }
2359 /* now if nothing is locked, and if we have enough data,
2360 * we can start a write request
2361 */
2362 /* since handle_stripe can be called at any time we need to handle the
2363 * case where a compute block operation has been submitted and then a
2509 * subsequent call wants to start a write request. raid_run_ops only
2510 * handles the case where compute block and reconstruct are requested
2364 * subsequent call wants to start a write request. raid5_run_ops only
2365 * handles the case where compute block and postxor are requested
2511 * simultaneously. If this is not the case then new writes need to be
2512 * held off until the compute completes.
2513 */
2514 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2515 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2516 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2366 * simultaneously. If this is not the case then new writes need to be
2367 * held off until the compute completes.
2368 */
2369 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2370 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2371 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2517 schedule_reconstruction(sh, s, rcw == 0, 0);
2372 schedule_reconstruction5(sh, s, rcw == 0, 0);
2518}
2519
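Editorial aside (illustrative only): the rmw/rcw counters tested just above track how many blocks each write strategy would still have to read in, and the reconstruction is scheduled once one of the counts drops to zero. The helpers below are hypothetical (not driver code) and merely spell out the classic read-cost comparison behind keeping both read-modify-write and reconstruct-write as options.

/*
 * Rough read cost for a RAID-5 stripe of 'disks' members (one of them
 * parity) when 'new_blocks' data blocks are being fully overwritten.
 */

/* read-modify-write: read the old copies being replaced, plus old parity */
static inline int rmw_read_cost(int disks, int new_blocks)
{
	return new_blocks + 1;
}

/* reconstruct-write: read every data block that is NOT being overwritten */
static inline int rcw_read_cost(int disks, int new_blocks)
{
	return (disks - 1) - new_blocks;
}

/*
 * e.g. disks = 6: overwriting 1 block  -> rmw reads 2, rcw reads 4;
 *                 overwriting 4 blocks -> rmw reads 5, rcw reads 1.
 */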
2520static void handle_stripe_dirtying6(raid5_conf_t *conf,
2521 struct stripe_head *sh, struct stripe_head_state *s,
2522 struct r6_state *r6s, int disks)
2523{
2373}
2374
2375static void handle_stripe_dirtying6(raid5_conf_t *conf,
2376 struct stripe_head *sh, struct stripe_head_state *s,
2377 struct r6_state *r6s, int disks)
2378{
2524 int rcw = 0, pd_idx = sh->pd_idx, i;
2379 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2525 int qd_idx = sh->qd_idx;
2380 int qd_idx = sh->qd_idx;
2526
2527 set_bit(STRIPE_HANDLE, &sh->state);
2528 for (i = disks; i--; ) {
2529 struct r5dev *dev = &sh->dev[i];
2381 for (i = disks; i--; ) {
2382 struct r5dev *dev = &sh->dev[i];
2530 /* check if we haven't enough data */
2531 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2532 i != pd_idx && i != qd_idx &&
2533 !test_bit(R5_LOCKED, &dev->flags) &&
2534 !(test_bit(R5_UPTODATE, &dev->flags) ||
2535 test_bit(R5_Wantcompute, &dev->flags))) {
2536 rcw++;
2537 if (!test_bit(R5_Insync, &dev->flags))
2538 continue; /* it's a failed drive */
2539
2540 if (
2541 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2542 pr_debug("Read_old stripe %llu "
2543 "block %d for Reconstruct\n",
2544 (unsigned long long)sh->sector, i);
2545 set_bit(R5_LOCKED, &dev->flags);
2546 set_bit(R5_Wantread, &dev->flags);
2547 s->locked++;
2548 } else {
2549 pr_debug("Request delayed stripe %llu "
2550 "block %d for Reconstruct\n",
2551 (unsigned long long)sh->sector, i);
2552 set_bit(STRIPE_DELAYED, &sh->state);
2553 set_bit(STRIPE_HANDLE, &sh->state);
2383 /* Would I have to read this buffer for reconstruct_write */
2384 if (!test_bit(R5_OVERWRITE, &dev->flags)
2385 && i != pd_idx && i != qd_idx
2386 && (!test_bit(R5_LOCKED, &dev->flags)
2387 ) &&
2388 !test_bit(R5_UPTODATE, &dev->flags)) {
2389 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2390 else {
2391 pr_debug("raid6: must_compute: "
2392 "disk %d flags=%#lx\n", i, dev->flags);
2393 must_compute++;
2554 }
2555 }
2556 }
2394 }
2395 }
2396 }
2397 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2398 (unsigned long long)sh->sector, rcw, must_compute);
2399 set_bit(STRIPE_HANDLE, &sh->state);
2400
2401 if (rcw > 0)
2402 /* want reconstruct write, but need to get some data */
2403 for (i = disks; i--; ) {
2404 struct r5dev *dev = &sh->dev[i];
2405 if (!test_bit(R5_OVERWRITE, &dev->flags)
2406 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2407 && !test_bit(R5_LOCKED, &dev->flags) &&
2408 !test_bit(R5_UPTODATE, &dev->flags) &&
2409 test_bit(R5_Insync, &dev->flags)) {
2410 if (
2411 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2412 pr_debug("Read_old stripe %llu "
2413 "block %d for Reconstruct\n",
2414 (unsigned long long)sh->sector, i);
2415 set_bit(R5_LOCKED, &dev->flags);
2416 set_bit(R5_Wantread, &dev->flags);
2417 s->locked++;
2418 } else {
2419 pr_debug("Request delayed stripe %llu "
2420 "block %d for Reconstruct\n",
2421 (unsigned long long)sh->sector, i);
2422 set_bit(STRIPE_DELAYED, &sh->state);
2423 set_bit(STRIPE_HANDLE, &sh->state);
2424 }
2425 }
2426 }
2557 /* now if nothing is locked, and if we have enough data, we can start a
2558 * write request
2559 */
2427 /* now if nothing is locked, and if we have enough data, we can start a
2428 * write request
2429 */
2560 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2561 s->locked == 0 && rcw == 0 &&
2430 if (s->locked == 0 && rcw == 0 &&
2562 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2431 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2563 schedule_reconstruction(sh, s, 1, 0);
2432 if (must_compute > 0) {
2433 /* We have failed blocks and need to compute them */
2434 switch (s->failed) {
2435 case 0:
2436 BUG();
2437 case 1:
2438 compute_block_1(sh, r6s->failed_num[0], 0);
2439 break;
2440 case 2:
2441 compute_block_2(sh, r6s->failed_num[0],
2442 r6s->failed_num[1]);
2443 break;
2444 default: /* This request should have been failed? */
2445 BUG();
2446 }
2447 }
2448
2449 pr_debug("Computing parity for stripe %llu\n",
2450 (unsigned long long)sh->sector);
2451 compute_parity6(sh, RECONSTRUCT_WRITE);
2452 /* now every locked buffer is ready to be written */
2453 for (i = disks; i--; )
2454 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2455 pr_debug("Writing stripe %llu block %d\n",
2456 (unsigned long long)sh->sector, i);
2457 s->locked++;
2458 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2459 }
2460 if (s->locked == disks)
2461 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2462 atomic_inc(&conf->pending_full_writes);
2463 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2464 set_bit(STRIPE_INSYNC, &sh->state);
2465
2466 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2467 atomic_dec(&conf->preread_active_stripes);
2468 if (atomic_read(&conf->preread_active_stripes) <
2469 IO_THRESHOLD)
2470 md_wakeup_thread(conf->mddev->thread);
2471 }
2564 }
2565}
2566
2567static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2568 struct stripe_head_state *s, int disks)
2569{
2570 struct r5dev *dev = NULL;
2571

--- 42 unchanged lines hidden ---

2614 */
2615 if (s->failed)
2616 break;
2617
2618 /* handle a successful check operation, if parity is correct
2619 * we are done. Otherwise update the mismatch count and repair
2620 * parity if !MD_RECOVERY_CHECK
2621 */
2472 }
2473}
2474
2475static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2476 struct stripe_head_state *s, int disks)
2477{
2478 struct r5dev *dev = NULL;
2479

--- 42 unchanged lines hidden ---

2522 */
2523 if (s->failed)
2524 break;
2525
2526 /* handle a successful check operation, if parity is correct
2527 * we are done. Otherwise update the mismatch count and repair
2528 * parity if !MD_RECOVERY_CHECK
2529 */
2622 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2530 if (sh->ops.zero_sum_result == 0)
2623 /* parity is correct (on disc,
2624 * not in buffer any more)
2625 */
2626 set_bit(STRIPE_INSYNC, &sh->state);
2627 else {
2628 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2629 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2630 /* don't try to repair!! */
2631 set_bit(STRIPE_INSYNC, &sh->state);
2632 else {
2633 sh->check_state = check_state_compute_run;
2634 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2635 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2636 set_bit(R5_Wantcompute,
2637 &sh->dev[sh->pd_idx].flags);
2638 sh->ops.target = sh->pd_idx;
2531 /* parity is correct (on disc,
2532 * not in buffer any more)
2533 */
2534 set_bit(STRIPE_INSYNC, &sh->state);
2535 else {
2536 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2537 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2538 /* don't try to repair!! */
2539 set_bit(STRIPE_INSYNC, &sh->state);
2540 else {
2541 sh->check_state = check_state_compute_run;
2542 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2543 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2544 set_bit(R5_Wantcompute,
2545 &sh->dev[sh->pd_idx].flags);
2546 sh->ops.target = sh->pd_idx;
2639 sh->ops.target2 = -1;
2640 s->uptodate++;
2641 }
2642 }
2643 break;
2644 case check_state_compute_run:
2645 break;
2646 default:
2647 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2648 __func__, sh->check_state,
2649 (unsigned long long) sh->sector);
2650 BUG();
2651 }
2652}
2653
2654
2655static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2547 s->uptodate++;
2548 }
2549 }
2550 break;
2551 case check_state_compute_run:
2552 break;
2553 default:
2554 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2555 __func__, sh->check_state,
2556 (unsigned long long) sh->sector);
2557 BUG();
2558 }
2559}
2560
2561
2562static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2656 struct stripe_head_state *s,
2657 struct r6_state *r6s, int disks)
2563 struct stripe_head_state *s,
2564 struct r6_state *r6s, struct page *tmp_page,
2565 int disks)
2658{
2566{
2567 int update_p = 0, update_q = 0;
2568 struct r5dev *dev;
2659 int pd_idx = sh->pd_idx;
2660 int qd_idx = sh->qd_idx;
2569 int pd_idx = sh->pd_idx;
2570 int qd_idx = sh->qd_idx;
2661 struct r5dev *dev;
2662
2663 set_bit(STRIPE_HANDLE, &sh->state);
2664
2665 BUG_ON(s->failed > 2);
2571
2572 set_bit(STRIPE_HANDLE, &sh->state);
2573
2574 BUG_ON(s->failed > 2);
2666
2575 BUG_ON(s->uptodate < disks);
2667 /* Want to check and possibly repair P and Q.
2668 * However there could be one 'failed' device, in which
2669 * case we can only check one of them, possibly using the
2670 * other to generate missing data
2671 */
2672
2576 /* Want to check and possibly repair P and Q.
2577 * However there could be one 'failed' device, in which
2578 * case we can only check one of them, possibly using the
2579 * other to generate missing data
2580 */
2581
2673 switch (sh->check_state) {
2674 case check_state_idle:
2675 /* start a new check operation if there are < 2 failures */
2582 /* If !tmp_page, we cannot do the calculations,
2583 * but as we have set STRIPE_HANDLE, we will soon be called
2584 * by handle_stripe with a tmp_page - just wait until then.
2585 */
2586 if (tmp_page) {
2676 if (s->failed == r6s->q_failed) {
2587 if (s->failed == r6s->q_failed) {
2677 /* The only possible failed device holds Q, so it
2588 /* The only possible failed device holds 'Q', so it
2678 * makes sense to check P (If anything else were failed,
2679 * we would have used P to recreate it).
2680 */
2589 * makes sense to check P (If anything else were failed,
2590 * we would have used P to recreate it).
2591 */
2681 sh->check_state = check_state_run;
2592 compute_block_1(sh, pd_idx, 1);
2593 if (!page_is_zero(sh->dev[pd_idx].page)) {
2594 compute_block_1(sh, pd_idx, 0);
2595 update_p = 1;
2596 }
2682 }
2683 if (!r6s->q_failed && s->failed < 2) {
2597 }
2598 if (!r6s->q_failed && s->failed < 2) {
2684 /* Q is not failed, and we didn't use it to generate
2599 /* q is not failed, and we didn't use it to generate
2685 * anything, so it makes sense to check it
2686 */
2600 * anything, so it makes sense to check it
2601 */
2687 if (sh->check_state == check_state_run)
2688 sh->check_state = check_state_run_pq;
2689 else
2690 sh->check_state = check_state_run_q;
2602 memcpy(page_address(tmp_page),
2603 page_address(sh->dev[qd_idx].page),
2604 STRIPE_SIZE);
2605 compute_parity6(sh, UPDATE_PARITY);
2606 if (memcmp(page_address(tmp_page),
2607 page_address(sh->dev[qd_idx].page),
2608 STRIPE_SIZE) != 0) {
2609 clear_bit(STRIPE_INSYNC, &sh->state);
2610 update_q = 1;
2611 }
2691 }
2612 }
2692
2693 /* discard potentially stale zero_sum_result */
2694 sh->ops.zero_sum_result = 0;
2695
2696 if (sh->check_state == check_state_run) {
2697 /* async_xor_zero_sum destroys the contents of P */
2698 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2699 s->uptodate--;
2613 if (update_p || update_q) {
2614 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2615 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2616 /* don't try to repair!! */
2617 update_p = update_q = 0;
2700 }
2618 }
2701 if (sh->check_state >= check_state_run &&
2702 sh->check_state <= check_state_run_pq) {
2703 /* async_syndrome_zero_sum preserves P and Q, so
2704 * no need to mark them !uptodate here
2705 */
2706 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2707 break;
2708 }
2709
2619
2710 /* we have 2-disk failure */
2711 BUG_ON(s->failed != 2);
2712 /* fall through */
2713 case check_state_compute_result:
2714 sh->check_state = check_state_idle;
2715
2716 /* check that a write has not made the stripe insync */
2717 if (test_bit(STRIPE_INSYNC, &sh->state))
2718 break;
2719
2720 /* now write out any block on a failed drive,
2620 /* now write out any block on a failed drive,
2721 * or P or Q if they were recomputed
2621 * or P or Q if they need it
2722 */
2622 */
2723 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2623
2724 if (s->failed == 2) {
2725 dev = &sh->dev[r6s->failed_num[1]];
2726 s->locked++;
2727 set_bit(R5_LOCKED, &dev->flags);
2728 set_bit(R5_Wantwrite, &dev->flags);
2729 }
2730 if (s->failed >= 1) {
2731 dev = &sh->dev[r6s->failed_num[0]];
2732 s->locked++;
2733 set_bit(R5_LOCKED, &dev->flags);
2734 set_bit(R5_Wantwrite, &dev->flags);
2735 }
2624 if (s->failed == 2) {
2625 dev = &sh->dev[r6s->failed_num[1]];
2626 s->locked++;
2627 set_bit(R5_LOCKED, &dev->flags);
2628 set_bit(R5_Wantwrite, &dev->flags);
2629 }
2630 if (s->failed >= 1) {
2631 dev = &sh->dev[r6s->failed_num[0]];
2632 s->locked++;
2633 set_bit(R5_LOCKED, &dev->flags);
2634 set_bit(R5_Wantwrite, &dev->flags);
2635 }
2736 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2636
2637 if (update_p) {
2737 dev = &sh->dev[pd_idx];
2738 s->locked++;
2739 set_bit(R5_LOCKED, &dev->flags);
2740 set_bit(R5_Wantwrite, &dev->flags);
2741 }
2638 dev = &sh->dev[pd_idx];
2639 s->locked++;
2640 set_bit(R5_LOCKED, &dev->flags);
2641 set_bit(R5_Wantwrite, &dev->flags);
2642 }
2742 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2643 if (update_q) {
2743 dev = &sh->dev[qd_idx];
2744 s->locked++;
2745 set_bit(R5_LOCKED, &dev->flags);
2746 set_bit(R5_Wantwrite, &dev->flags);
2747 }
2748 clear_bit(STRIPE_DEGRADED, &sh->state);
2749
2750 set_bit(STRIPE_INSYNC, &sh->state);
2644 dev = &sh->dev[qd_idx];
2645 s->locked++;
2646 set_bit(R5_LOCKED, &dev->flags);
2647 set_bit(R5_Wantwrite, &dev->flags);
2648 }
2649 clear_bit(STRIPE_DEGRADED, &sh->state);
2650
2651 set_bit(STRIPE_INSYNC, &sh->state);
2751 break;
2752 case check_state_run:
2753 case check_state_run_q:
2754 case check_state_run_pq:
2755 break; /* we will be called again upon completion */
2756 case check_state_check_result:
2757 sh->check_state = check_state_idle;
2758
2759 /* handle a successful check operation, if parity is correct
2760 * we are done. Otherwise update the mismatch count and repair
2761 * parity if !MD_RECOVERY_CHECK
2762 */
2763 if (sh->ops.zero_sum_result == 0) {
2764 /* both parities are correct */
2765 if (!s->failed)
2766 set_bit(STRIPE_INSYNC, &sh->state);
2767 else {
2768 /* in contrast to the raid5 case we can validate
2769 * parity, but still have a failure to write
2770 * back
2771 */
2772 sh->check_state = check_state_compute_result;
2773 /* Returning at this point means that we may go
2774 * off and bring p and/or q uptodate again so
2775 * we make sure to check zero_sum_result again
2776 * to verify if p or q need writeback
2777 */
2778 }
2779 } else {
2780 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2781 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2782 /* don't try to repair!! */
2783 set_bit(STRIPE_INSYNC, &sh->state);
2784 else {
2785 int *target = &sh->ops.target;
2786
2787 sh->ops.target = -1;
2788 sh->ops.target2 = -1;
2789 sh->check_state = check_state_compute_run;
2790 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2791 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2792 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2793 set_bit(R5_Wantcompute,
2794 &sh->dev[pd_idx].flags);
2795 *target = pd_idx;
2796 target = &sh->ops.target2;
2797 s->uptodate++;
2798 }
2799 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2800 set_bit(R5_Wantcompute,
2801 &sh->dev[qd_idx].flags);
2802 *target = qd_idx;
2803 s->uptodate++;
2804 }
2805 }
2806 }
2807 break;
2808 case check_state_compute_run:
2809 break;
2810 default:
2811 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2812 __func__, sh->check_state,
2813 (unsigned long long) sh->sector);
2814 BUG();
2815 }
2816}
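Editorial summary (not present in the source): the RAID-6 check/repair code above is driven by a small state machine; the states are the ones named in the switch statement.

/*
 *   check_state_idle
 *      -> check_state_run     zero-sum check of P only (the one failed
 *                             device holds Q)
 *      -> check_state_run_q   zero-sum check of Q only
 *      -> check_state_run_pq  zero-sum check of both P and Q
 *      (with two failed devices it falls straight through to the
 *       compute_result handling instead)
 *   check_state_check_result
 *      -> STRIPE_INSYNC when sh->ops.zero_sum_result is clean and nothing
 *         failed, or via check_state_compute_result when a failed block
 *         still has to be written back
 *      -> check_state_compute_run on a mismatch (unless MD_RECOVERY_CHECK
 *         forbids repair), recomputing whichever of P/Q the
 *         SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT bits flag as bad
 *   check_state_compute_result
 *      -> write out blocks on failed drives plus any recomputed P/Q and
 *         mark the stripe STRIPE_INSYNC
 */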
2817
2818static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2819 struct r6_state *r6s)
2820{
2821 int i;
2822
2823 /* We have read all the blocks in this stripe and now we need to
2824 * copy some of them into a target stripe for expand.
2825 */
2826 struct dma_async_tx_descriptor *tx = NULL;
2827 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2828 for (i = 0; i < sh->disks; i++)
2829 if (i != sh->pd_idx && i != sh->qd_idx) {
2830 int dd_idx, j;
2831 struct stripe_head *sh2;
2652 }
2653}
2654
2655static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2656 struct r6_state *r6s)
2657{
2658 int i;
2659
2660 /* We have read all the blocks in this stripe and now we need to
2661 * copy some of them into a target stripe for expand.
2662 */
2663 struct dma_async_tx_descriptor *tx = NULL;
2664 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2665 for (i = 0; i < sh->disks; i++)
2666 if (i != sh->pd_idx && i != sh->qd_idx) {
2667 int dd_idx, j;
2668 struct stripe_head *sh2;
2832 struct async_submit_ctl submit;
2833
2834 sector_t bn = compute_blocknr(sh, i, 1);
2835 sector_t s = raid5_compute_sector(conf, bn, 0,
2836 &dd_idx, NULL);
2837 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2838 if (sh2 == NULL)
2839 /* so far only the early blocks of this stripe
2840 * have been requested. When later blocks
2841 * get requested, we will try again
2842 */
2843 continue;
2844 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2845 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2846 /* must have already done this block */
2847 release_stripe(sh2);
2848 continue;
2849 }
2850
2851 /* place all the copies on one channel */
2669
2670 sector_t bn = compute_blocknr(sh, i, 1);
2671 sector_t s = raid5_compute_sector(conf, bn, 0,
2672 &dd_idx, NULL);
2673 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2674 if (sh2 == NULL)
2675 /* so far only the early blocks of this stripe
2676 * have been requested. When later blocks
2677 * get requested, we will try again
2678 */
2679 continue;
2680 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2681 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2682 /* must have already done this block */
2683 release_stripe(sh2);
2684 continue;
2685 }
2686
2687 /* place all the copies on one channel */
2852 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2853 tx = async_memcpy(sh2->dev[dd_idx].page,
2688 tx = async_memcpy(sh2->dev[dd_idx].page,
2854 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2855 &submit);
2689 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2690 ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2856
2857 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2858 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2859 for (j = 0; j < conf->raid_disks; j++)
2860 if (j != sh2->pd_idx &&
2861 (!r6s || j != sh2->qd_idx) &&
2862 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2863 break;

--- 269 unchanged lines hidden ---

3133 }
3134 }
3135
3136 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3137 !sh->reconstruct_state) {
3138 /* Need to write out all blocks after computing parity */
3139 sh->disks = conf->raid_disks;
3140 stripe_set_idx(sh->sector, conf, 0, sh);
2691
2692 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2693 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2694 for (j = 0; j < conf->raid_disks; j++)
2695 if (j != sh2->pd_idx &&
2696 (!r6s || j != sh2->qd_idx) &&
2697 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2698 break;

--- 269 unchanged lines hidden ---

2968 }
2969 }
2970
2971 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2972 !sh->reconstruct_state) {
2973 /* Need to write out all blocks after computing parity */
2974 sh->disks = conf->raid_disks;
2975 stripe_set_idx(sh->sector, conf, 0, sh);
3141 schedule_reconstruction(sh, &s, 1, 1);
2976 schedule_reconstruction5(sh, &s, 1, 1);
3142 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3143 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3144 atomic_dec(&conf->reshape_stripes);
3145 wake_up(&conf->wait_for_overlap);
3146 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3147 }
3148
3149 if (s.expanding && s.locked == 0 &&
3150 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3151 handle_stripe_expansion(conf, sh, NULL);
3152
3153 unlock:
3154 spin_unlock(&sh->lock);
3155
3156 /* wait for this device to become unblocked */
3157 if (unlikely(blocked_rdev))
3158 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3159
3160 if (s.ops_request)
2977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2978 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2979 atomic_dec(&conf->reshape_stripes);
2980 wake_up(&conf->wait_for_overlap);
2981 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2982 }
2983
2984 if (s.expanding && s.locked == 0 &&
2985 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2986 handle_stripe_expansion(conf, sh, NULL);
2987
2988 unlock:
2989 spin_unlock(&sh->lock);
2990
2991 /* wait for this device to become unblocked */
2992 if (unlikely(blocked_rdev))
2993 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2994
2995 if (s.ops_request)
3161 raid_run_ops(sh, s.ops_request);
2996 raid5_run_ops(sh, s.ops_request);
3162
3163 ops_run_io(sh, &s);
3164
3165 return_io(return_bi);
3166
3167 return blocked_rdev == NULL;
3168}
3169
2997
2998 ops_run_io(sh, &s);
2999
3000 return_io(return_bi);
3001
3002 return blocked_rdev == NULL;
3003}
3004
3170static bool handle_stripe6(struct stripe_head *sh)
3005static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3171{
3172 raid5_conf_t *conf = sh->raid_conf;
3173 int disks = sh->disks;
3174 struct bio *return_bi = NULL;
3175 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3176 struct stripe_head_state s;
3177 struct r6_state r6s;
3178 struct r5dev *dev, *pdev, *qdev;
3179 mdk_rdev_t *blocked_rdev = NULL;
3180
3181 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3006{
3007 raid5_conf_t *conf = sh->raid_conf;
3008 int disks = sh->disks;
3009 struct bio *return_bi = NULL;
3010 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3011 struct stripe_head_state s;
3012 struct r6_state r6s;
3013 struct r5dev *dev, *pdev, *qdev;
3014 mdk_rdev_t *blocked_rdev = NULL;
3015
3016 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3182 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3017 "pd_idx=%d, qd_idx=%d\n",
3183 (unsigned long long)sh->sector, sh->state,
3018 (unsigned long long)sh->sector, sh->state,
3184 atomic_read(&sh->count), pd_idx, qd_idx,
3185 sh->check_state, sh->reconstruct_state);
3019 atomic_read(&sh->count), pd_idx, qd_idx);
3186 memset(&s, 0, sizeof(s));
3187
3188 spin_lock(&sh->lock);
3189 clear_bit(STRIPE_HANDLE, &sh->state);
3190 clear_bit(STRIPE_DELAYED, &sh->state);
3191
3192 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3193 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3194 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3195 /* Now to look around and see what can be done */
3196
3197 rcu_read_lock();
3198 for (i=disks; i--; ) {
3199 mdk_rdev_t *rdev;
3200 dev = &sh->dev[i];
3201 clear_bit(R5_Insync, &dev->flags);
3202
3203 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3204 i, dev->flags, dev->toread, dev->towrite, dev->written);
3020 memset(&s, 0, sizeof(s));
3021
3022 spin_lock(&sh->lock);
3023 clear_bit(STRIPE_HANDLE, &sh->state);
3024 clear_bit(STRIPE_DELAYED, &sh->state);
3025
3026 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3027 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3028 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3029 /* Now to look around and see what can be done */
3030
3031 rcu_read_lock();
3032 for (i=disks; i--; ) {
3033 mdk_rdev_t *rdev;
3034 dev = &sh->dev[i];
3035 clear_bit(R5_Insync, &dev->flags);
3036
3037 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3038 i, dev->flags, dev->toread, dev->towrite, dev->written);
3205 /* maybe we can reply to a read
3206 *
3207 * new wantfill requests are only permitted while
3208 * ops_complete_biofill is guaranteed to be inactive
3209 */
3210 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3211 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3212 set_bit(R5_Wantfill, &dev->flags);
3039 /* maybe we can reply to a read */
3040 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
3041 struct bio *rbi, *rbi2;
3042 pr_debug("Return read for disc %d\n", i);
3043 spin_lock_irq(&conf->device_lock);
3044 rbi = dev->toread;
3045 dev->toread = NULL;
3046 if (test_and_clear_bit(R5_Overlap, &dev->flags))
3047 wake_up(&conf->wait_for_overlap);
3048 spin_unlock_irq(&conf->device_lock);
3049 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3050 copy_data(0, rbi, dev->page, dev->sector);
3051 rbi2 = r5_next_bio(rbi, dev->sector);
3052 spin_lock_irq(&conf->device_lock);
3053 if (!raid5_dec_bi_phys_segments(rbi)) {
3054 rbi->bi_next = return_bi;
3055 return_bi = rbi;
3056 }
3057 spin_unlock_irq(&conf->device_lock);
3058 rbi = rbi2;
3059 }
3060 }
3213
3214 /* now count some things */
3215 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3216 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3061
3062 /* now count some things */
3063 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3064 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3217 if (test_bit(R5_Wantcompute, &dev->flags))
3218 BUG_ON(++s.compute > 2);
3219
3065
3220 if (test_bit(R5_Wantfill, &dev->flags)) {
3221 s.to_fill++;
3222 } else if (dev->toread)
3066
3067 if (dev->toread)
3223 s.to_read++;
3224 if (dev->towrite) {
3225 s.to_write++;
3226 if (!test_bit(R5_OVERWRITE, &dev->flags))
3227 s.non_overwrite++;
3228 }
3229 if (dev->written)
3230 s.written++;

--- 24 unchanged lines hidden ---

3255 set_bit(STRIPE_HANDLE, &sh->state);
3256 goto unlock;
3257 }
3258 /* There is nothing for the blocked_rdev to block */
3259 rdev_dec_pending(blocked_rdev, conf->mddev);
3260 blocked_rdev = NULL;
3261 }
3262
3068 s.to_read++;
3069 if (dev->towrite) {
3070 s.to_write++;
3071 if (!test_bit(R5_OVERWRITE, &dev->flags))
3072 s.non_overwrite++;
3073 }
3074 if (dev->written)
3075 s.written++;

--- 24 unchanged lines hidden ---

3100 set_bit(STRIPE_HANDLE, &sh->state);
3101 goto unlock;
3102 }
3103 /* There is nothing for the blocked_rdev to block */
3104 rdev_dec_pending(blocked_rdev, conf->mddev);
3105 blocked_rdev = NULL;
3106 }
3107
3263 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3264 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3265 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3266 }
3267
3268 pr_debug("locked=%d uptodate=%d to_read=%d"
3269 " to_write=%d failed=%d failed_num=%d,%d\n",
3270 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3271 r6s.failed_num[0], r6s.failed_num[1]);
3272 /* check if the array has lost >2 devices and, if so, some requests
3273 * might need to be failed
3274 */
3275 if (s.failed > 2 && s.to_read+s.to_write+s.written)

--- 24 unchanged lines hidden ---

3300 && test_bit(R5_UPTODATE, &qdev->flags)))))
3301 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3302
3303 /* Now we might consider reading some blocks, either to check/generate
3304 * parity, or to satisfy requests
3305 * or to load a block that is being partially written.
3306 */
3307 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3108 pr_debug("locked=%d uptodate=%d to_read=%d"
3109 " to_write=%d failed=%d failed_num=%d,%d\n",
3110 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3111 r6s.failed_num[0], r6s.failed_num[1]);
3112 /* check if the array has lost >2 devices and, if so, some requests
3113 * might need to be failed
3114 */
3115 if (s.failed > 2 && s.to_read+s.to_write+s.written)

--- 24 unchanged lines hidden ---

3140 && test_bit(R5_UPTODATE, &qdev->flags)))))
3141 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3142
3143 /* Now we might consider reading some blocks, either to check/generate
3144 * parity, or to satisfy requests
3145 * or to load a block that is being partially written.
3146 */
3147 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3308 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3148 (s.syncing && (s.uptodate < disks)) || s.expanding)
3309 handle_stripe_fill6(sh, &s, &r6s, disks);
3310
3149 handle_stripe_fill6(sh, &s, &r6s, disks);
3150
3311 /* Now we check to see if any write operations have recently
3312 * completed
3313 */
3314 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3315 int qd_idx = sh->qd_idx;
3316
3317 sh->reconstruct_state = reconstruct_state_idle;
3318 /* All the 'written' buffers and the parity blocks are ready to
3319 * be written back to disk
3320 */
3321 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3322 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3323 for (i = disks; i--; ) {
3324 dev = &sh->dev[i];
3325 if (test_bit(R5_LOCKED, &dev->flags) &&
3326 (i == sh->pd_idx || i == qd_idx ||
3327 dev->written)) {
3328 pr_debug("Writing block %d\n", i);
3329 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3330 set_bit(R5_Wantwrite, &dev->flags);
3331 if (!test_bit(R5_Insync, &dev->flags) ||
3332 ((i == sh->pd_idx || i == qd_idx) &&
3333 s.failed == 0))
3334 set_bit(STRIPE_INSYNC, &sh->state);
3335 }
3336 }
3337 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3338 atomic_dec(&conf->preread_active_stripes);
3339 if (atomic_read(&conf->preread_active_stripes) <
3340 IO_THRESHOLD)
3341 md_wakeup_thread(conf->mddev->thread);
3342 }
3343 }
3344
3345 /* Now to consider new write requests and what else, if anything
3346 * should be read. We do not handle new writes when:
3347 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3348 * 2/ A 'check' operation is in flight, as it may clobber the parity
3349 * block.
3350 */
3351 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3151 /* now to consider writing and what else, if anything should be read */
3152 if (s.to_write)
3352 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3353
3354 /* maybe we need to check and possibly fix the parity for this stripe
3355 * Any reads will already have been scheduled, so we just see if enough
3153 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3154
3155 /* maybe we need to check and possibly fix the parity for this stripe
3156 * Any reads will already have been scheduled, so we just see if enough
3356 * data is available. The parity check is held off while parity
3357 * dependent operations are in flight.
3157 * data is available
3358 */
3158 */
3359 if (sh->check_state ||
3360 (s.syncing && s.locked == 0 &&
3361 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3362 !test_bit(STRIPE_INSYNC, &sh->state)))
3363 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3159 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
3160 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
3364
3365 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3366 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3367 clear_bit(STRIPE_SYNCING, &sh->state);
3368 }
3369
3370 /* If the failed drives are just a ReadError, then we might need
3371 * to progress the repair/check process

--- 4 unchanged lines hidden ---

3376 if (test_bit(R5_ReadError, &dev->flags)
3377 && !test_bit(R5_LOCKED, &dev->flags)
3378 && test_bit(R5_UPTODATE, &dev->flags)
3379 ) {
3380 if (!test_bit(R5_ReWrite, &dev->flags)) {
3381 set_bit(R5_Wantwrite, &dev->flags);
3382 set_bit(R5_ReWrite, &dev->flags);
3383 set_bit(R5_LOCKED, &dev->flags);
3161
3162 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3163 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3164 clear_bit(STRIPE_SYNCING, &sh->state);
3165 }
3166
3167 /* If the failed drives are just a ReadError, then we might need
3168 * to progress the repair/check process

--- 4 unchanged lines hidden ---

3173 if (test_bit(R5_ReadError, &dev->flags)
3174 && !test_bit(R5_LOCKED, &dev->flags)
3175 && test_bit(R5_UPTODATE, &dev->flags)
3176 ) {
3177 if (!test_bit(R5_ReWrite, &dev->flags)) {
3178 set_bit(R5_Wantwrite, &dev->flags);
3179 set_bit(R5_ReWrite, &dev->flags);
3180 set_bit(R5_LOCKED, &dev->flags);
3384 s.locked++;
3385 } else {
3386 /* let's read it back */
3387 set_bit(R5_Wantread, &dev->flags);
3388 set_bit(R5_LOCKED, &dev->flags);
3181 } else {
3182 /* let's read it back */
3183 set_bit(R5_Wantread, &dev->flags);
3184 set_bit(R5_LOCKED, &dev->flags);
3389 s.locked++;
3390 }
3391 }
3392 }
3393
3185 }
3186 }
3187 }
3188
3394 /* Finish reconstruct operations initiated by the expansion process */
3395 if (sh->reconstruct_state == reconstruct_state_result) {
3396 sh->reconstruct_state = reconstruct_state_idle;
3397 clear_bit(STRIPE_EXPANDING, &sh->state);
3398 for (i = conf->raid_disks; i--; ) {
3399 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3400 set_bit(R5_LOCKED, &sh->dev[i].flags);
3401 s.locked++;
3402 }
3403 }
3404
3405 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3406 !sh->reconstruct_state) {
3189 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3407 struct stripe_head *sh2
3408 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3409 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3410 /* sh cannot be written until sh2 has been read.
3411 * so arrange for sh to be delayed a little
3412 */
3413 set_bit(STRIPE_DELAYED, &sh->state);
3414 set_bit(STRIPE_HANDLE, &sh->state);

--- 4 unchanged lines hidden ---

3419 goto unlock;
3420 }
3421 if (sh2)
3422 release_stripe(sh2);
3423
3424 /* Need to write out all blocks after computing P&Q */
3425 sh->disks = conf->raid_disks;
3426 stripe_set_idx(sh->sector, conf, 0, sh);
3190 struct stripe_head *sh2
3191 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3192 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3193 /* sh cannot be written until sh2 has been read.
3194 * so arrange for sh to be delayed a little
3195 */
3196 set_bit(STRIPE_DELAYED, &sh->state);
3197 set_bit(STRIPE_HANDLE, &sh->state);

--- 4 unchanged lines hidden ---

3202 goto unlock;
3203 }
3204 if (sh2)
3205 release_stripe(sh2);
3206
3207 /* Need to write out all blocks after computing P&Q */
3208 sh->disks = conf->raid_disks;
3209 stripe_set_idx(sh->sector, conf, 0, sh);
3427 schedule_reconstruction(sh, &s, 1, 1);
3428 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3210 compute_parity6(sh, RECONSTRUCT_WRITE);
3211 for (i = conf->raid_disks ; i-- ; ) {
3212 set_bit(R5_LOCKED, &sh->dev[i].flags);
3213 s.locked++;
3214 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3215 }
3216 clear_bit(STRIPE_EXPANDING, &sh->state);
3217 } else if (s.expanded) {
3429 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3430 atomic_dec(&conf->reshape_stripes);
3431 wake_up(&conf->wait_for_overlap);
3432 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3433 }
3434
3435 if (s.expanding && s.locked == 0 &&
3436 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3437 handle_stripe_expansion(conf, sh, &r6s);
3438
3439 unlock:
3440 spin_unlock(&sh->lock);
3441
3442 /* wait for this device to become unblocked */
3443 if (unlikely(blocked_rdev))
3444 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3445
3218 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3219 atomic_dec(&conf->reshape_stripes);
3220 wake_up(&conf->wait_for_overlap);
3221 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3222 }
3223
3224 if (s.expanding && s.locked == 0 &&
3225 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3226 handle_stripe_expansion(conf, sh, &r6s);
3227
3228 unlock:
3229 spin_unlock(&sh->lock);
3230
3231 /* wait for this device to become unblocked */
3232 if (unlikely(blocked_rdev))
3233 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3234
3446 if (s.ops_request)
3447 raid_run_ops(sh, s.ops_request);
3448
3449 ops_run_io(sh, &s);
3450
3451 return_io(return_bi);
3452
3453 return blocked_rdev == NULL;
3454}
3455
3456/* returns true if the stripe was handled */
3235 ops_run_io(sh, &s);
3236
3237 return_io(return_bi);
3238
3239 return blocked_rdev == NULL;
3240}
3241
3242/* returns true if the stripe was handled */
3457static bool handle_stripe(struct stripe_head *sh)
3243static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
3458{
3459 if (sh->raid_conf->level == 6)
3244{
3245 if (sh->raid_conf->level == 6)
3460 return handle_stripe6(sh);
3246 return handle_stripe6(sh, tmp_page);
3461 else
3462 return handle_stripe5(sh);
3463}
3464
3247 else
3248 return handle_stripe5(sh);
3249}
3250
3251
3252
3465static void raid5_activate_delayed(raid5_conf_t *conf)
3466{
3467 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3468 while (!list_empty(&conf->delayed_list)) {
3469 struct list_head *l = conf->delayed_list.next;
3470 struct stripe_head *sh;
3471 sh = list_entry(l, struct stripe_head, lru);
3472 list_del_init(l);

--- 340 unchanged lines hidden ---

3813 raid5_conf_t *conf = mddev->private;
3814 int dd_idx;
3815 sector_t new_sector;
3816 sector_t logical_sector, last_sector;
3817 struct stripe_head *sh;
3818 const int rw = bio_data_dir(bi);
3819 int cpu, remaining;
3820
3253static void raid5_activate_delayed(raid5_conf_t *conf)
3254{
3255 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3256 while (!list_empty(&conf->delayed_list)) {
3257 struct list_head *l = conf->delayed_list.next;
3258 struct stripe_head *sh;
3259 sh = list_entry(l, struct stripe_head, lru);
3260 list_del_init(l);

--- 340 unchanged lines hidden ---

3601 raid5_conf_t *conf = mddev->private;
3602 int dd_idx;
3603 sector_t new_sector;
3604 sector_t logical_sector, last_sector;
3605 struct stripe_head *sh;
3606 const int rw = bio_data_dir(bi);
3607 int cpu, remaining;
3608
3821 if (unlikely(bio_barrier(bi))) {
3609 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3822 bio_endio(bi, -EOPNOTSUPP);
3823 return 0;
3824 }
3825
3826 md_write_start(mddev, bi);
3827
3828 cpu = part_stat_lock();
3829 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);

--- 434 unchanged lines hidden ---

4264 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4265
4266 spin_lock(&sh->lock);
4267 set_bit(STRIPE_SYNCING, &sh->state);
4268 clear_bit(STRIPE_INSYNC, &sh->state);
4269 spin_unlock(&sh->lock);
4270
4271 /* wait for any blocked device to be handled */
3610 bio_endio(bi, -EOPNOTSUPP);
3611 return 0;
3612 }
3613
3614 md_write_start(mddev, bi);
3615
3616 cpu = part_stat_lock();
3617 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);

--- 434 unchanged lines hidden ---

4052 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4053
4054 spin_lock(&sh->lock);
4055 set_bit(STRIPE_SYNCING, &sh->state);
4056 clear_bit(STRIPE_INSYNC, &sh->state);
4057 spin_unlock(&sh->lock);
4058
4059 /* wait for any blocked device to be handled */
4272 while (unlikely(!handle_stripe(sh)))
4060 while(unlikely(!handle_stripe(sh, NULL)))
4273 ;
4274 release_stripe(sh);
4275
4276 return STRIPE_SECTORS;
4277}
4278
4279static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4280{

--- 40 unchanged lines hidden ---

4321 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4322 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4323 release_stripe(sh);
4324 raid5_set_bi_hw_segments(raid_bio, scnt);
4325 conf->retry_read_aligned = raid_bio;
4326 return handled;
4327 }
4328
4061 ;
4062 release_stripe(sh);
4063
4064 return STRIPE_SECTORS;
4065}
4066
4067static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4068{

--- 40 unchanged lines hidden ---

4109 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4110 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4111 release_stripe(sh);
4112 raid5_set_bi_hw_segments(raid_bio, scnt);
4113 conf->retry_read_aligned = raid_bio;
4114 return handled;
4115 }
4116
4329 handle_stripe(sh);
4117 handle_stripe(sh, NULL);
4330 release_stripe(sh);
4331 handled++;
4332 }
4333 spin_lock_irq(&conf->device_lock);
4334 remaining = raid5_dec_bi_phys_segments(raid_bio);
4335 spin_unlock_irq(&conf->device_lock);
4336 if (remaining == 0)
4337 bio_endio(raid_bio, 0);
4338 if (atomic_dec_and_test(&conf->active_aligned_reads))
4339 wake_up(&conf->wait_for_stripe);
4340 return handled;
4341}
4342
4118 release_stripe(sh);
4119 handled++;
4120 }
4121 spin_lock_irq(&conf->device_lock);
4122 remaining = raid5_dec_bi_phys_segments(raid_bio);
4123 spin_unlock_irq(&conf->device_lock);
4124 if (remaining == 0)
4125 bio_endio(raid_bio, 0);
4126 if (atomic_dec_and_test(&conf->active_aligned_reads))
4127 wake_up(&conf->wait_for_stripe);
4128 return handled;
4129}
4130
4343#ifdef CONFIG_MULTICORE_RAID456
4344static void __process_stripe(void *param, async_cookie_t cookie)
4345{
4346 struct stripe_head *sh = param;
4347
4131
4348 handle_stripe(sh);
4349 release_stripe(sh);
4350}
4351
4132
4352static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4353{
4354 async_schedule_domain(__process_stripe, sh, domain);
4355}
4356
4357static void synchronize_stripe_processing(struct list_head *domain)
4358{
4359 async_synchronize_full_domain(domain);
4360}
4361#else
4362static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4363{
4364 handle_stripe(sh);
4365 release_stripe(sh);
4366 cond_resched();
4367}
4368
4369static void synchronize_stripe_processing(struct list_head *domain)
4370{
4371}
4372#endif
4373
4374
4375/*
4376 * This is our raid5 kernel thread.
4377 *
4378 * We scan the hash table for stripes which can be handled now.
4379 * During the scan, completed stripes are saved for us by the interrupt
4380 * handler, so that they will not have to wait for our next wakeup.
4381 */
4382static void raid5d(mddev_t *mddev)
4383{
4384 struct stripe_head *sh;
4385 raid5_conf_t *conf = mddev->private;
4386 int handled;
4133/*
4134 * This is our raid5 kernel thread.
4135 *
4136 * We scan the hash table for stripes which can be handled now.
4137 * During the scan, completed stripes are saved for us by the interrupt
4138 * handler, so that they will not have to wait for our next wakeup.
4139 */
4140static void raid5d(mddev_t *mddev)
4141{
4142 struct stripe_head *sh;
4143 raid5_conf_t *conf = mddev->private;
4144 int handled;
4387 LIST_HEAD(raid_domain);
4388
4389 pr_debug("+++ raid5d active\n");
4390
4391 md_check_recovery(mddev);
4392
4393 handled = 0;
4394 spin_lock_irq(&conf->device_lock);
4395 while (1) {

--- 20 unchanged lines hidden ---

4416
4417 sh = __get_priority_stripe(conf);
4418
4419 if (!sh)
4420 break;
4421 spin_unlock_irq(&conf->device_lock);
4422
4423 handled++;
4145
4146 pr_debug("+++ raid5d active\n");
4147
4148 md_check_recovery(mddev);
4149
4150 handled = 0;
4151 spin_lock_irq(&conf->device_lock);
4152 while (1) {

--- 20 unchanged lines hidden ---

4173
4174 sh = __get_priority_stripe(conf);
4175
4176 if (!sh)
4177 break;
4178 spin_unlock_irq(&conf->device_lock);
4179
4180 handled++;
4424 process_stripe(sh, &raid_domain);
4181 handle_stripe(sh, conf->spare_page);
4182 release_stripe(sh);
4425
4426 spin_lock_irq(&conf->device_lock);
4427 }
4428 pr_debug("%d stripes handled\n", handled);
4429
4430 spin_unlock_irq(&conf->device_lock);
4431
4183
4184 spin_lock_irq(&conf->device_lock);
4185 }
4186 pr_debug("%d stripes handled\n", handled);
4187
4188 spin_unlock_irq(&conf->device_lock);
4189
4432 synchronize_stripe_processing(&raid_domain);
4433 async_tx_issue_pending_all();
4434 unplug_slaves(mddev);
4435
4436 pr_debug("--- raid5d inactive\n");
4437}
4438
4439static ssize_t
4440raid5_show_stripe_cache_size(mddev_t *mddev, char *page)

--- 116 unchanged lines hidden ---

4557 raid_disks = conf->previous_raid_disks;
4558 }
4559
4560 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4561 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4562 return sectors * (raid_disks - conf->max_degraded);
4563}
4564
4190 async_tx_issue_pending_all();
4191 unplug_slaves(mddev);
4192
4193 pr_debug("--- raid5d inactive\n");
4194}
4195
4196static ssize_t
4197raid5_show_stripe_cache_size(mddev_t *mddev, char *page)

--- 116 unchanged lines hidden ---

4314 raid_disks = conf->previous_raid_disks;
4315 }
4316
4317 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4318 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4319 return sectors * (raid_disks - conf->max_degraded);
4320}
4321
4565static void raid5_free_percpu(raid5_conf_t *conf)
4566{
4567 struct raid5_percpu *percpu;
4568 unsigned long cpu;
4569
4570 if (!conf->percpu)
4571 return;
4572
4573 get_online_cpus();
4574 for_each_possible_cpu(cpu) {
4575 percpu = per_cpu_ptr(conf->percpu, cpu);
4576 safe_put_page(percpu->spare_page);
4577 kfree(percpu->scribble);
4578 }
4579#ifdef CONFIG_HOTPLUG_CPU
4580 unregister_cpu_notifier(&conf->cpu_notify);
4581#endif
4582 put_online_cpus();
4583
4584 free_percpu(conf->percpu);
4585}
4586
4587static void free_conf(raid5_conf_t *conf)
4588{
4589 shrink_stripes(conf);
4322static void free_conf(raid5_conf_t *conf)
4323{
4324 shrink_stripes(conf);
4590 raid5_free_percpu(conf);
4325 safe_put_page(conf->spare_page);
4591 kfree(conf->disks);
4592 kfree(conf->stripe_hashtbl);
4593 kfree(conf);
4594}
4595
4326 kfree(conf->disks);
4327 kfree(conf->stripe_hashtbl);
4328 kfree(conf);
4329}
4330
4596#ifdef CONFIG_HOTPLUG_CPU
4597static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4598 void *hcpu)
4599{
4600 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4601 long cpu = (long)hcpu;
4602 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4603
4604 switch (action) {
4605 case CPU_UP_PREPARE:
4606 case CPU_UP_PREPARE_FROZEN:
4607 if (conf->level == 6 && !percpu->spare_page)
4608 percpu->spare_page = alloc_page(GFP_KERNEL);
4609 if (!percpu->scribble)
4610 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4611
4612 if (!percpu->scribble ||
4613 (conf->level == 6 && !percpu->spare_page)) {
4614 safe_put_page(percpu->spare_page);
4615 kfree(percpu->scribble);
4616 pr_err("%s: failed memory allocation for cpu%ld\n",
4617 __func__, cpu);
4618 return NOTIFY_BAD;
4619 }
4620 break;
4621 case CPU_DEAD:
4622 case CPU_DEAD_FROZEN:
4623 safe_put_page(percpu->spare_page);
4624 kfree(percpu->scribble);
4625 percpu->spare_page = NULL;
4626 percpu->scribble = NULL;
4627 break;
4628 default:
4629 break;
4630 }
4631 return NOTIFY_OK;
4632}
4633#endif
4634
4635static int raid5_alloc_percpu(raid5_conf_t *conf)
4636{
4637 unsigned long cpu;
4638 struct page *spare_page;
4639 struct raid5_percpu *allcpus;
4640 void *scribble;
4641 int err;
4642
4643 allcpus = alloc_percpu(struct raid5_percpu);
4644 if (!allcpus)
4645 return -ENOMEM;
4646 conf->percpu = allcpus;
4647
4648 get_online_cpus();
4649 err = 0;
4650 for_each_present_cpu(cpu) {
4651 if (conf->level == 6) {
4652 spare_page = alloc_page(GFP_KERNEL);
4653 if (!spare_page) {
4654 err = -ENOMEM;
4655 break;
4656 }
4657 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4658 }
4659 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4660 if (!scribble) {
4661 err = -ENOMEM;
4662 break;
4663 }
4664 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4665 }
4666#ifdef CONFIG_HOTPLUG_CPU
4667 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4668 conf->cpu_notify.priority = 0;
4669 if (err == 0)
4670 err = register_cpu_notifier(&conf->cpu_notify);
4671#endif
4672 put_online_cpus();
4673
4674 return err;
4675}
4676
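Editorial note on this hunk and the setup_conf()/free_conf() hunks nearby (not part of the source):

/*
 * The old code kept a single conf->spare_page, allocated in setup_conf()
 * and passed down as the tmp_page argument of handle_stripe(); RAID-6
 * parity checks were therefore serialized on that one scratch page, and
 * callers without it (the resync path passes NULL) simply set
 * STRIPE_HANDLE and left the check to raid5d, which supplied
 * conf->spare_page.  The new code drops the tmp_page argument and keeps
 * per-CPU state instead - a spare_page for the RAID-6 check plus a
 * 'scribble' buffer for the async paths - allocated in
 * raid5_alloc_percpu() and kept in step with CPU hotplug by
 * raid456_cpu_notify(), which is part of what allows
 * CONFIG_MULTICORE_RAID456 to hand stripes to several CPUs at once.
 */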
4677static raid5_conf_t *setup_conf(mddev_t *mddev)
4678{
4679 raid5_conf_t *conf;
4680 int raid_disk, memory;
4681 mdk_rdev_t *rdev;
4682 struct disk_info *disk;
4683
4684 if (mddev->new_level != 5

--- 25 unchanged lines hidden ---

4710 return ERR_PTR(-EINVAL);
4711 }
4712
4713 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4714 if (conf == NULL)
4715 goto abort;
4716
4717 conf->raid_disks = mddev->raid_disks;
4331static raid5_conf_t *setup_conf(mddev_t *mddev)
4332{
4333 raid5_conf_t *conf;
4334 int raid_disk, memory;
4335 mdk_rdev_t *rdev;
4336 struct disk_info *disk;
4337
4338 if (mddev->new_level != 5

--- 25 unchanged lines hidden ---

4364 return ERR_PTR(-EINVAL);
4365 }
4366
4367 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4368 if (conf == NULL)
4369 goto abort;
4370
4371 conf->raid_disks = mddev->raid_disks;
4718 conf->scribble_len = scribble_len(conf->raid_disks);
4719 if (mddev->reshape_position == MaxSector)
4720 conf->previous_raid_disks = mddev->raid_disks;
4721 else
4722 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4723
4724 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4725 GFP_KERNEL);
4726 if (!conf->disks)
4727 goto abort;
4728
4729 conf->mddev = mddev;
4730
4731 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4732 goto abort;
4733
4372 if (mddev->reshape_position == MaxSector)
4373 conf->previous_raid_disks = mddev->raid_disks;
4374 else
4375 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4376
4377 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4378 GFP_KERNEL);
4379 if (!conf->disks)
4380 goto abort;
4381
4382 conf->mddev = mddev;
4383
4384 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4385 goto abort;
4386
4734 conf->level = mddev->new_level;
4735 if (raid5_alloc_percpu(conf) != 0)
4736 goto abort;
4737
4387 if (mddev->new_level == 6) {
4388 conf->spare_page = alloc_page(GFP_KERNEL);
4389 if (!conf->spare_page)
4390 goto abort;
4391 }
4738 spin_lock_init(&conf->device_lock);
4739 init_waitqueue_head(&conf->wait_for_stripe);
4740 init_waitqueue_head(&conf->wait_for_overlap);
4741 INIT_LIST_HEAD(&conf->handle_list);
4742 INIT_LIST_HEAD(&conf->hold_list);
4743 INIT_LIST_HEAD(&conf->delayed_list);
4744 INIT_LIST_HEAD(&conf->bitmap_list);
4745 INIT_LIST_HEAD(&conf->inactive_list);

--- 1043 unchanged lines hidden ---
4392 spin_lock_init(&conf->device_lock);
4393 init_waitqueue_head(&conf->wait_for_stripe);
4394 init_waitqueue_head(&conf->wait_for_overlap);
4395 INIT_LIST_HEAD(&conf->handle_list);
4396 INIT_LIST_HEAD(&conf->hold_list);
4397 INIT_LIST_HEAD(&conf->delayed_list);
4398 INIT_LIST_HEAD(&conf->bitmap_list);
4399 INIT_LIST_HEAD(&conf->inactive_list);

--- 1043 unchanged lines hidden ---